8#ifndef META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H 
    9#define META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H 
   /// @brief Declaration of the symmetry test for a 1D filter kernel (purpose taken from the name).
   /// The matching body fragment in this file asserts a non-null pointer and an odd size >= 1,
   /// and iterates over the first size / 2 entries.
   /// @param filterValues Pointer to the filter coefficients.
   /// @param size Number of coefficients in filterValues.
   /// @return bool; presumably true if the kernel is mirror-symmetric around its center -- the comparison itself is not visible here, confirm.
   86        static bool isFilterSymmetric(
const T* filterValues, 
const size_t size);
 
   /// @brief Declaration of the helper accumulating all coefficients of a filter kernel.
   /// The matching body fragment in this file starts with filterValues[0] and adds the remaining entries with `+=`.
   /// @param filterValues Pointer to the filter coefficients; asserted non-null in the body fragment.
   /// @param size Number of coefficients; asserted to be >= 1 in the body fragment.
   /// @return Value of type T; presumably the accumulated sum (the return statement is not visible here).
   96        static T sumFilterValues(
const T* filterValues, 
const size_t size);
 
  /// @brief Frame-based overload of the separable filter (declaration only; the definition is not part of this view).
  /// @param source Input frame, passed by const reference.
  /// @param target Output frame, passed by mutable reference.
  /// @param horizontalFilter Horizontal kernel coefficients given as unsigned integers.
  /// @param verticalFilter Vertical kernel coefficients given as unsigned integers.
  /// @param worker Optional worker object, nullptr by default.
  /// @param reusableMemory Optional reusable memory object, nullptr by default.
  /// @param processorInstructions Instruction set selector; defaults to Processor::get().instructions().
  /// @return bool; presumably true on success -- confirm against the definition.
  125        static bool filter(
const Frame& source, 
Frame& target, 
const std::vector<unsigned int>& horizontalFilter, 
const std::vector<unsigned int>& verticalFilter, 
Worker* worker = 
nullptr, 
ReusableMemory* reusableMemory = 
nullptr, 
const ProcessorInstructions processorInstructions = Processor::get().instructions());
 
  /// @brief Raw-pointer overload of the separable filter with a runtime instruction-set selector (declaration only).
  /// @tparam T Element type of the source and target buffers.
  /// @tparam TFilter Element type of the filter coefficients.
  /// @param source Input buffer.
  /// @param target Output buffer.
  /// @param width Image width (unit presumably pixels -- confirm).
  /// @param height Image height (unit presumably pixels -- confirm).
  /// @param channels Number of channels per pixel.
  /// @param sourcePaddingElements Padding elements of the source (presumably at the end of each row -- confirm).
  /// @param targetPaddingElements Padding elements of the target (presumably at the end of each row -- confirm).
  /// @param horizontalFilter Horizontal kernel coefficients.
  /// @param horizontalFilterSize Number of horizontal coefficients.
  /// @param verticalFilter Vertical kernel coefficients.
  /// @param verticalFilterSize Number of vertical coefficients.
  /// @param worker Optional worker object, nullptr by default.
  /// @param reusableMemory Optional reusable memory object, nullptr by default.
  /// @param processorInstructions Instruction set selector; defaults to Processor::get().instructions().
  /// @return bool; presumably true on success -- confirm against the definition.
  166        template <
typename T, 
typename TFilter>
 
  167        static bool filter(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const TFilter* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const TFilter* verticalFilter, 
const unsigned int verticalFilterSize, 
Worker* worker = 
nullptr, 
ReusableMemory* reusableMemory = 
nullptr, 
const ProcessorInstructions processorInstructions = Processor::get().instructions());
 
  /// @brief Separable filter variant taking float kernels and no instruction-set parameter (declaration only).
  /// NOTE(review): the name suggests a generic, non-SIMD implementation -- confirm against the definition.
  /// @tparam T Element type of the source and target buffers.
  /// @param source Input buffer.
  /// @param target Output buffer.
  /// @param width Image width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param horizontalFilter Horizontal kernel coefficients (float).
  /// @param horizontalFilterSize Number of horizontal coefficients.
  /// @param verticalFilter Vertical kernel coefficients (float).
  /// @param verticalFilterSize Number of vertical coefficients.
  /// @param worker Optional worker object, nullptr by default.
  /// @return bool; presumably true on success -- confirm against the definition.
  223        template <
typename T>
 
  224        static bool filterUniversal(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const float* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const float* verticalFilter, 
const unsigned int verticalFilterSize, 
Worker* worker = 
nullptr);
 
  /// @brief Separable filter overload with the instruction set fixed at compile time (declaration only).
  /// Note the parameter order: reusableMemory precedes worker here, unlike the runtime-dispatch overloads above.
  /// @tparam T Element type of the source and target buffers.
  /// @tparam TFilter Element type of the filter coefficients.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input buffer.
  /// @param target Output buffer.
  /// @param width Image width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param horizontalFilter Horizontal kernel coefficients.
  /// @param horizontalFilterSize Number of horizontal coefficients.
  /// @param verticalFilter Vertical kernel coefficients.
  /// @param verticalFilterSize Number of vertical coefficients.
  /// @param reusableMemory Optional reusable memory object, nullptr by default.
  /// @param worker Optional worker object, nullptr by default.
  249        template <
typename T, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  250        static void filter(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const TFilter* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const TFilter* verticalFilter, 
const unsigned int verticalFilterSize, 
ReusableMemory* reusableMemory = 
nullptr, 
Worker* worker = 
nullptr);
 
  258        template <
typename T, ProcessorInstructions tProcessorInstructions>
 
  268        template <
typename T, ProcessorInstructions tProcessorInstructions>
 
  /// @brief Declaration of the helper filling the left border extension of a row.
  /// The matching body fragment in this file copies `pixels` pixels in reversed pixel order:
  /// destination pixel n receives source pixel (pixels - n - 1), each pixel being `channels` elements.
  /// @tparam T Element type.
  /// @param source Start of the row data to mirror from.
  /// @param channels Number of elements per pixel.
  /// @param pixels Number of border pixels to create.
  /// @param extendedRowLeft Destination receiving pixels * channels elements.
  280        template <
typename T>
 
  281        static void fillLeftExtraBorder(
const T* source, 
const unsigned int channels, 
const unsigned int pixels, T* extendedRowLeft);
 
  /// @brief Declaration of the helper filling the right border extension of a row.
  /// The matching body fragment in this file copies `pixels` pixels walking backwards from sourceEnd:
  /// destination pixel n receives the pixel starting at sourceEnd - (n + 1) * channels.
  /// @tparam T Element type.
  /// @param sourceEnd Pointer one element past the last valid source element of the row (implied by the backwards indexing).
  /// @param channels Number of elements per pixel.
  /// @param pixels Number of border pixels to create.
  /// @param extendedRowRight Destination receiving pixels * channels elements.
  292        template <
typename T>
 
  293        static void fillRightExtraBorder(
const T* sourceEnd, 
const unsigned int channels, 
const unsigned int pixels, T* extendedRowRight);
 
  311        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  332        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  355        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  380        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  /// @brief Declaration of the horizontal filter kernel for one block (block width of 4 elements per the name).
  /// The definition is not part of this view; descriptions below are limited to the signature.
  /// @tparam TSource Element type of the input.
  /// @tparam TFilter Element type of the coefficients and of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input block.
  /// @param target Output block, written with TFilter elements.
  /// @param channels Number of channels per pixel.
  /// @param filter Kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param isSymmetric Flag describing the kernel; presumably enables a path exploiting symmetry -- confirm.
  396        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  397        static OCEAN_FORCE_INLINE 
void filterHorizontalRowOneBlockWith4Elements(
const TSource* 
const source, TFilter* 
const target, 
const unsigned int channels, 
const TFilter* 
const filter, 
const unsigned int filterSize, 
const bool isSymmetric);
 
  /// @brief Declaration of the horizontal filter kernel for one block (block width of 8 elements per the name).
  /// Same signature as the 4-element variant; the definition is not part of this view.
  /// @tparam TSource Element type of the input.
  /// @tparam TFilter Element type of the coefficients and of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input block.
  /// @param target Output block, written with TFilter elements.
  /// @param channels Number of channels per pixel.
  /// @param filter Kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param isSymmetric Flag describing the kernel; presumably enables a path exploiting symmetry -- confirm.
  412        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  413        static OCEAN_FORCE_INLINE 
void filterHorizontalRowOneBlockWith8Elements(
const TSource* 
const source, TFilter* 
const target, 
const unsigned int channels, 
const TFilter* 
const filter, 
const unsigned int filterSize, 
const bool isSymmetric);
 
  428        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  444        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  460        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  /// @brief Declaration of the vertical filter for one complete row using float coefficients.
  /// The definition in this file splits the row (width * channels elements) into blocks of 16, 8 and
  /// (with SSE) 4 elements and forwards each block to the matching filterVerticalCoreRow*Elements helper.
  /// @tparam TSource Element type of the intermediate input.
  /// @tparam TTarget Element type of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input row; neighboring rows are addressed relative to it.
  /// @param target Output row.
  /// @param width Row width; width * channels is the number of elements processed.
  /// @param channels Number of channels per pixel; asserted >= 1 in the definition.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients; asserted to be odd in the definition.
  /// @param isSymmetric Flag forwarded to the block helpers.
  /// @param sourcePaddingElements Added to width * channels to form the source stride.
  478        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  479        static OCEAN_FORCE_INLINE 
void filterVerticalCoreRow32BitPerChannelFloat(
const TSource* source, TTarget* target, 
const unsigned int width, 
const unsigned int channels, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric, 
const unsigned int sourcePaddingElements);
 
  /// @brief Declaration of the vertical filter for 8 elements of a border row (purpose taken from the name).
  /// Unlike the core-row helpers it additionally receives `height` and `row`;
  /// presumably so out-of-range neighbor rows can be mirrored -- the definition is not visible, confirm.
  /// @tparam TSource Element type of the input.
  /// @tparam TTarget Element type of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input elements.
  /// @param target Output elements.
  /// @param sourceStrideElements Source stride in elements.
  /// @param height Image height.
  /// @param row Row index being processed.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param isSymmetric Flag describing the kernel.
  496        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  497        static OCEAN_FORCE_INLINE 
void filterVerticalBorderRow8Elements32BitPerChannelFloat(
const TSource* source, TTarget* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric);
 
  /// @brief Declaration of the vertical filter for 16 elements of a border row (purpose taken from the name).
  /// Same signature as the 8-element border variant; the definition is not part of this view.
  /// @tparam TSource Element type of the input.
  /// @tparam TTarget Element type of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input elements.
  /// @param target Output elements.
  /// @param sourceStrideElements Source stride in elements.
  /// @param height Image height.
  /// @param row Row index being processed.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param isSymmetric Flag describing the kernel.
  514        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  515        static OCEAN_FORCE_INLINE 
void filterVerticalBorderRow16Elements32BitPerChannelFloat(
const TSource* source, TTarget* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric);
 
  /// @brief Declaration of the vertical filter for one complete border row (purpose taken from the name).
  /// Signature mirrors filterVerticalCoreRow32BitPerChannelFloat plus `height` and `row`;
  /// the definition is not part of this view.
  /// @tparam TSource Element type of the input.
  /// @tparam TTarget Element type of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input row.
  /// @param target Output row.
  /// @param width Row width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param row Row index being processed.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param isSymmetric Flag describing the kernel.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  534        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  535        static OCEAN_FORCE_INLINE 
void filterVerticalBorderRow32BitPerChannelFloat(
const TSource* source, TTarget* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric, 
const unsigned int sourcePaddingElements);
 
  /// @brief Declaration of the horizontal filter pass restricted to a range of rows.
  /// NOTE(review): the firstRow/numberRows pair presumably allows the work to be split across a Worker -- confirm.
  /// @tparam TSource Element type of the input.
  /// @tparam TFilter Element type of the coefficients and of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input buffer.
  /// @param target Output buffer holding TFilter elements.
  /// @param width Image width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param filter Kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param firstRow First row to process.
  /// @param numberRows Number of rows to process.
  555        template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
  556        static void filterHorizontalSubset(
const TSource* source, TFilter* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const TFilter* filter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows);
 
  /// @brief Declaration of the vertical filter pass restricted to a range of rows, using float coefficients.
  /// Note that firstRow is the only non-const value parameter here, so the definition may modify its local copy.
  /// @tparam TSource Element type of the input.
  /// @tparam TTarget Element type of the output.
  /// @tparam tProcessorInstructions Instruction set chosen at compile time.
  /// @param source Input buffer.
  /// @param target Output buffer.
  /// @param width Image width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param firstRow First row to process.
  /// @param numberRows Number of rows to process.
  577        template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  578        static void filterVerticalSubset(
const TSource* source, TTarget* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const float* filter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
unsigned int firstRow, 
const unsigned int numberRows);
 
  /// @brief Declaration of the horizontal pass of the universal filter for a range of rows.
  /// Takes no height parameter, unlike its vertical counterpart; the definition is not part of this view.
  /// @tparam T Element type of the input.
  /// @tparam TIntermediate Element type of the intermediate output.
  /// @param source Input buffer.
  /// @param target Intermediate output buffer.
  /// @param width Image width.
  /// @param channels Number of channels per pixel.
  /// @param horizontalFilter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param firstRow First row to process.
  /// @param numberRows Number of rows to process.
  595        template <
typename T, 
typename TIntermediate>
 
  596        static void filterUniversalHorizontalSubset(
const T* source, TIntermediate* target, 
const unsigned int width, 
const unsigned int channels, 
const float* horizontalFilter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows);
 
  /// @brief Declaration of the vertical pass of the universal filter for a range of rows.
  /// Consumes TIntermediate elements and produces T elements; the definition is not part of this view.
  /// @tparam T Element type of the output.
  /// @tparam TIntermediate Element type of the intermediate input.
  /// @param source Intermediate input buffer.
  /// @param target Output buffer.
  /// @param width Image width.
  /// @param height Image height.
  /// @param channels Number of channels per pixel.
  /// @param verticalFilter Float kernel coefficients.
  /// @param filterSize Number of coefficients.
  /// @param sourcePaddingElements Padding elements of the source buffer.
  /// @param targetPaddingElements Padding elements of the target buffer.
  /// @param firstRow First row to process.
  /// @param numberRows Number of rows to process.
  614        template <
typename T, 
typename TIntermediate>
 
  615        static void filterUniversalVerticalSubset(
const TIntermediate* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const float* verticalFilter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows);
 
  /// @brief Declaration of the helper mapping a signed location to an unsigned one at the left/top border.
  /// NOTE(review): presumably mirrors negative values back into the valid range -- the definition is not visible, confirm.
  /// @param value Signed location.
  /// @return Unsigned location.
  628        static inline unsigned int mirroredBorderLocationLeft(
const int value);
 
  /// @brief Declaration of the helper mapping a location to a valid one at the right/bottom border.
  /// NOTE(review): presumably mirrors values >= size back into [0, size) -- the definition is not visible, confirm.
  /// @param value Location to map.
  /// @param size Extent of the valid range.
  /// @return Unsigned location.
  643        static inline unsigned int mirroredBorderLocationRight(
const unsigned int value, 
const unsigned int size);
 
 
  646#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10 
  666#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
  // Body fragment; parameter names match isFilterSymmetric(filterValues, size). The signature line is not part of this view.
  // Preconditions: a valid pointer and an odd number of coefficients, so that a unique center element exists.
  691    ocean_assert(filterValues != 
nullptr);
 
  692    ocean_assert(size >= 1 && size % 2 == 1);
 
  // Only the first half needs to be visited; the center element (index size / 2) is excluded.
  // NOTE(review): the loop body is not visible here; presumably it compares filterValues[n] with filterValues[size - n - 1] -- confirm.
  694    for (
size_t n = 0; n < size / 2; ++n)
 
 
  // Body fragment; parameter names match sumFilterValues(filterValues, size). The signature line is not part of this view.
  // Preconditions: a valid pointer and at least one coefficient (filterValues[0] is read unconditionally below).
  708    ocean_assert(filterValues != 
nullptr);
 
  709    ocean_assert(size >= 1);
 
  // Seed the accumulator with the first coefficient instead of a literal zero.
  711    T sum = filterValues[0];
 
  // Accumulate the remaining coefficients, starting at index 1.
  713    for (
size_t n = 1; n < size; ++n)
 
  715        sum += filterValues[n];
 
 
  721#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
  728    value = _mm_setzero_si128();
 
 
  736    value = _mm_set_ps1(0.0f);
 
 
  742    _mm_storeu_si128((__m128i*)target, value);
 
 
  /// @brief SSE2 specialization writing four float lanes to memory.
  /// The float register is reinterpreted as an integer register (bit pattern unchanged) and stored with
  /// the unaligned integer store, so `target` needs no 16-byte alignment.
  /// @param value Register holding four floats.
  /// @param target Destination receiving four consecutive floats.
  746OCEAN_FORCE_INLINE 
void FrameFilterSeparable::writeSIMD<float, PI_SSE_2>(
const SIMD32x4<float>::Type& value, 
float* target)
 
  748    _mm_storeu_si128((__m128i*)target, _mm_castps_si128(value));
 
 
  753#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
  758    value = vdupq_n_u32(0u);
 
 
  764    value = vdupq_n_f32(0.0f);
 
 
  770    vst1q_u32(target, value);
 
 
  /// @brief NEON specialization writing four float lanes to memory via vst1q_f32.
  /// @param value Register holding four floats.
  /// @param target Destination receiving four consecutive floats.
  774OCEAN_FORCE_INLINE 
void FrameFilterSeparable::writeSIMD<float, PI_NEON>(
const SIMD32x4<float>::Type& value, 
float* target)
 
  776    vst1q_f32(target, value);
 
 
  // Body fragment; parameter names match fillLeftExtraBorder. The signature line is not part of this view.
  // NOTE(review): the destination is called extendedRow here but extendedRowLeft in the declaration.
  784    ocean_assert(source != 
nullptr && extendedRow != 
nullptr);
 
  // Copy pixel by pixel in reversed pixel order; the channel order inside each pixel is kept.
  786    for (
unsigned int n = 0u; n < pixels; ++n)
 
  // Destination pixel n receives source pixel (pixels - n - 1).
  788        memcpy(extendedRow + n * channels, source + (pixels - n - 1u) * channels, 
sizeof(T) * channels);
 
 
  // Body fragment; parameter names match fillRightExtraBorder. The signature line is not part of this view.
  // NOTE(review): the destination is called extendedRow here but extendedRowRight in the declaration.
  795    ocean_assert(sourceEnd != 
nullptr && extendedRow != 
nullptr);
 
  // Walk backwards from sourceEnd, one pixel at a time; the channel order inside each pixel is kept.
  797    for (
unsigned int n = 0u; n < pixels; ++n)
 
  // Destination pixel n receives the pixel starting (n + 1) pixels before sourceEnd.
  // (n + 1u) * int(channels) is evaluated as unsigned int, so the pointer offset is a plain positive subtraction.
  799        memcpy(extendedRow + n * channels, sourceEnd - (n + 1u) * 
int(channels), 
sizeof(T) * channels);
 
 
  /// @brief Definition fragment of filterVerticalCoreRow32BitPerChannelFloat (the signature line is not part of this view).
  /// Filters one row vertically by handing blocks of 16, 8 and (SSE only) 4 elements to the block helpers;
  /// a remainder smaller than the smallest block is handled by one extra block shifted back by `shift` elements,
  /// so that block overlaps elements that precede the current position.
  /// NOTE(review): the statements advancing source/target after each block, and the #else/#endif of the
  /// preprocessor branch, are not visible in this view -- confirm against the full file.
  803template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
  806    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
  807    ocean_assert(channels >= 1u);
 
  808    ocean_assert(filterSize % 2u == 1u);
 
  // Distance between two consecutive source rows, in elements.
  810    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
  // Number of elements of this row which still need a result.
  812    unsigned int remainingElements = width * channels;
 
  // Largest blocks first: 16 elements per call.
  814    while (remainingElements >= 16u)
 
  816        filterVerticalCoreRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, 
filter, filterSize, isSymmetric);
 
  821        remainingElements -= 16u;
 
  // Then blocks of 8 elements.
  824    while (remainingElements >= 8u)
 
  826        filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, 
filter, filterSize, isSymmetric);
 
  831        remainingElements -= 8u;
 
  // With SSE a 4-element block helper exists, so the remainder can be reduced below 4 elements.
  834#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
  836    while (remainingElements >= 4u)
 
  838        filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, 
filter, filterSize, isSymmetric);
 
  843        remainingElements -= 4u;
 
  // The shifted block below reaches back by up to 3 elements, so the row must hold at least 4 elements.
  846    ocean_assert(width * channels >= 4u);
 
  847    ocean_assert(remainingElements < 4u);
 
  849    if (remainingElements != 0u)
 
  // Move the 4-element window back so that it ends exactly at the end of the row.
  851        const unsigned int shift = 4u - remainingElements;
 
  853        filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, 
filter, filterSize, isSymmetric);
 
  // Variant with 8 elements as smallest block: the row must hold at least 8 elements.
  // NOTE(review): presumably the #else branch of the SSE check above -- confirm.
  858    ocean_assert(width * channels >= 8u);
 
  859    ocean_assert(remainingElements < 8u);
 
  861    if (remainingElements != 0u)
 
  // Move the 8-element window back so that it ends exactly at the end of the row.
  863        const unsigned int shift = 8u - remainingElements;
 
  865        filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, 
filter, filterSize, isSymmetric);
 
 
  871#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
  /// @brief SSE2 specialization (unsigned int -> uint8_t) producing 4 vertically filtered elements.
  /// The center row is weighted with filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are
  /// accumulated for i in [1, filterSize / 2]. Two accumulation variants are visible: one adds both rows
  /// as integers and multiplies once with filter[filterSize_2 + i], the other weights each row separately.
  /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
  /// NOTE(review): _mm_cvtepi32_ps treats lanes as signed 32-bit, while source is unsigned int; assumes values below 2^31 -- confirm.
  /// No bounds checks are done here: all rows in [-filterSize / 2, +filterSize / 2] relative to source are read.
  /// @param source Center row, 4 elements are read per row.
  /// @param target Receives 4 bytes.
  /// @param sourceStrideElements Distance between consecutive source rows, in elements.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients, asserted odd and >= 1.
  /// @param isSymmetric Flag describing the kernel.
  874OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
  876    ocean_assert(source != 
nullptr && target != 
nullptr);
 
  877    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
  // Index of the center coefficient and number of rows on each side.
  897    const unsigned int filterSize_2 = filterSize / 2u;
 
  899    const __m128i* sourceBlock = (
const __m128i*)source;
 
  // Broadcast the center coefficient to all four lanes.
  902    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
  // Center row: unaligned load, int32 -> float, multiply.
  905    __m128 source128 = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock));
 
  906    __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
 
  // Accumulate the row pairs above and below the center row.
  909    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
  911        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
  912        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
  // Variant 1: one shared coefficient, rows are added as integers before the conversion.
  917            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
  921            __m128i source128i = _mm_add_epi32(_mm_loadu_si128(sourceMinus), _mm_loadu_si128(sourcePlus));
 
  923            result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128i), filterFactor_32x4));
 
  // Variant 2: individual coefficients for the upper and lower row.
  928            __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
  929            __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
  931            __m128i source128iMinus = _mm_loadu_si128(sourceMinus);
 
  932            __m128i source128iPlus = _mm_loadu_si128(sourcePlus);
 
  934            result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iMinus), filterFactor128Minus));
 
  935            result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iPlus), filterFactor128Plus));
 
  // float -> int32 (current rounding mode), then saturating packs: int32 -> int16 -> uint8.
  941    __m128i source128i = _mm_cvtps_epi32(result128);
 
  942    source128i = _mm_packs_epi32(source128i, source128i);
 
  943    source128i = _mm_packus_epi16(source128i, source128i);
 
  // The lowest 32 bits hold the four result bytes; they are written with one 32-bit store through a cast pointer.
  945    *((
unsigned int*)target) = SSE::value_u32<0u>(source128i);
 
 
  /// @brief SSE2 specialization (float -> float) producing 4 vertically filtered elements.
  /// The center row is weighted with filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are
  /// accumulated for i in [1, filterSize / 2]. Two accumulation variants are visible: one adds both rows
  /// first and multiplies once with filter[filterSize_2 + i], the other weights each row separately.
  /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
  /// Floats are loaded via the unaligned integer load and reinterpreted, so no alignment is required.
  /// @param source Center row, 4 floats are read per row.
  /// @param target Receives 4 floats.
  /// @param sourceStrideElements Distance between consecutive source rows, in elements.
  /// @param filter Float kernel coefficients.
  /// @param filterSize Number of coefficients, asserted odd and >= 1.
  /// @param isSymmetric Flag describing the kernel.
  949OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
  951    ocean_assert(source != 
nullptr && target != 
nullptr);
 
  952    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
  // Index of the center coefficient and number of rows on each side.
  967    const unsigned int filterSize_2 = filterSize / 2u;
 
  969    const __m128i* sourceBlock = (
const __m128i*)source;
 
  // Broadcast the center coefficient to all four lanes.
  972    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
  // Center row: unaligned load, bit-reinterpretation as float, multiply.
  975    __m128 source128 = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
 
  976    __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
 
  // Accumulate the row pairs above and below the center row.
  979    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
  981        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
  982        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
  // Variant 1: one shared coefficient, rows are added before the multiplication.
  987            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
  991            source128 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus)));
 
  993            result128 = _mm_add_ps(result128, _mm_mul_ps(source128, filterFactor_32x4));
 
  // Variant 2: individual coefficients for the upper and lower row.
  998            __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
  999            __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1002            __m128 source128Minus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus));
 
 1003            __m128 source128Plus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus));
 
 1005            result128 = _mm_add_ps(result128, _mm_mul_ps(source128Minus, filterFactor_32x4Minus));
 
 1006            result128 = _mm_add_ps(result128, _mm_mul_ps(source128Plus, filterFactor_32x4Plus));
 
  // Store the four accumulated floats without conversion.
 1010    writeSIMD<float, PI_SSE_2>(result128, target);
 
 
 /// @brief SSE2 specialization (unsigned int -> uint8_t) producing 8 vertically filtered elements.
 /// Works on two 4-lane registers (a: elements 0-3, b: elements 4-7). The center row is weighted with
 /// filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are accumulated for i in [1, filterSize / 2].
 /// Two accumulation variants are visible: a shared-coefficient one (rows added first) and one with individual coefficients.
 /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
 /// NOTE(review): _mm_cvtepi32_ps treats lanes as signed 32-bit, while source is unsigned int; assumes values below 2^31 -- confirm.
 /// @param source Center row, 8 elements are read per row.
 /// @param target Receives 8 bytes.
 /// @param sourceStrideElements Distance between consecutive source rows, in elements.
 /// @param filter Float kernel coefficients.
 /// @param filterSize Number of coefficients, asserted odd and >= 1.
 /// @param isSymmetric Flag describing the kernel.
 1014OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1016    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1017    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 // Index of the center coefficient and number of rows on each side.
 1038    const unsigned int filterSize_2 = filterSize / 2u;
 
 1040    const __m128i* sourceBlock = (
const __m128i*)source;
 
 // Broadcast the center coefficient to all four lanes.
 1043    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
 // Center row, elements 0-3.
 1046    __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1047    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 // Center row, elements 4-7.
 1050    __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1051    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 // Accumulate the row pairs above and below the center row.
 1054    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1056        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
 1057        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
 // Variant 1: one shared coefficient, rows are added as integers before the conversion.
 1062            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1065            __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
 
 1066            __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
 
 1068            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 1069            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
 // Variant 2: individual coefficients for the upper and lower row.
 1074            __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1075            __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1078            __m128i source128aiMinus =_mm_loadu_si128(sourceMinus + 0);
 
 1079            __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
 
 1080            __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
 
 1081            __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
 
 1083            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
 
 1084            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
 
 1086            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
 
 1087            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
 
 // float -> int32 (current rounding mode), then saturating packs: 2 x int32x4 -> int16x8 -> uint8.
 1092    __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
 
 1093    result128 = _mm_packus_epi16(result128, result128);
 
 // Only the lower 64 bits (8 result bytes) are written.
 1095    _mm_storel_epi64((__m128i*)target, result128);
 
 
 /// @brief SSE2 specialization (float -> float) producing 8 vertically filtered elements.
 /// Works on two 4-lane registers (a: elements 0-3, b: elements 4-7). The center row is weighted with
 /// filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are accumulated for i in [1, filterSize / 2].
 /// Two accumulation variants are visible: a shared-coefficient one (rows added first) and one with individual coefficients.
 /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
 /// @param source Center row, 8 floats are read per row.
 /// @param target Receives 8 floats.
 /// @param sourceStrideElements Distance between consecutive source rows, in elements.
 /// @param filter Float kernel coefficients.
 /// @param filterSize Number of coefficients, asserted odd and >= 1.
 /// @param isSymmetric Flag describing the kernel.
 1099OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1101    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1102    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 // Index of the center coefficient and number of rows on each side.
 1117    const unsigned int filterSize_2 = filterSize / 2u;
 
 1119    const __m128i* sourceBlock = (
const __m128i*)source;
 
 // Broadcast the center coefficient to all four lanes.
 1122    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
 // Center row, elements 0-3.
 1125    __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1126    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 // Center row, elements 4-7.
 1129    __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1130    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 // Accumulate the row pairs above and below the center row.
 1133    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1135        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
 1136        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
 // Variant 1: one shared coefficient, rows are added before the multiplication.
 1141            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1145            source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
 
 1146            source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
 
 1148            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
 
 1149            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
 
 // Variant 2: individual coefficients for the upper and lower row.
 1154            __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1155            __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1158            __m128 source128aMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
 
 1159            __m128 source128aPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
 
 1160            __m128 source128bMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
 
 1161            __m128 source128bPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
 
 1163            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aMinus, filterFactor_32x4Minus));
 
 1164            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aPlus, filterFactor_32x4Plus));
 
 1166            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bMinus, filterFactor_32x4Minus));
 
 1167            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bPlus, filterFactor_32x4Plus));
 
 // Store both halves: elements 0-3 and elements 4-7.
 1171    writeSIMD<float, PI_SSE_2>(result_32x4a, target +  0);
 
 1172    writeSIMD<float, PI_SSE_2>(result_32x4b, target +  4);
 
 
 1177#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 /// @brief NEON specialization (unsigned int -> uint8_t) producing 8 vertically filtered elements.
 /// Works on two 4-lane registers (a: elements 0-3, b: elements 4-7). The center row is weighted with
 /// filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are accumulated with vmlaq_f32 for i in [1, filterSize / 2].
 /// Two accumulation variants are visible: a shared-coefficient one (rows added first) and one with individual coefficients.
 /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
 /// NOTE(review): unlike the SSE specializations, no assertions are visible in this fragment.
 /// NOTE(review): vcvtq_u32_f32 rounds toward zero, while the SSE path converts with _mm_cvtps_epi32 (current
 /// rounding mode); unless a rounding offset is applied in lines not visible here, results may differ by one -- confirm.
 /// @param source Center row, 8 elements are read per row.
 /// @param target Receives 8 bytes.
 /// @param sourceStrideElements Distance between consecutive source rows, in elements.
 /// @param filter Float kernel coefficients.
 /// @param filterSize Number of coefficients.
 /// @param isSymmetric Flag describing the kernel.
 1180OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 // Index of the center coefficient and number of rows on each side.
 1182    const unsigned int filterSize_2 = filterSize / 2u;
 
 // Broadcast the center coefficient to all four lanes.
 1185    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
 // Center row, elements 0-3: load, uint32 -> float, multiply.
 1188    float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
 
 1189    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 // Center row, elements 4-7.
 1191    float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
 
 1192    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 // Accumulate the row pairs above and below the center row.
 1195    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1197        const unsigned int* sourceMinus = source - sourceStrideElements * i;
 
 1198        const unsigned int* sourcePlus = source + sourceStrideElements * i;
 
 // Variant 1: one shared coefficient, rows are added as integers before the conversion.
 1203            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1206            uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
 
 1207            uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
 
 1209            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 1210            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
 // Variant 2: individual coefficients for the upper and lower row.
 1216            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 1217            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1219            uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
 
 1220            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
 
 1222            uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
 
 1223            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
 
 1225            uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
 
 1226            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
 
 1228            uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
 
 1229            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
 
 // float -> uint32, saturating narrow to uint16, combine both halves into one 8-lane register.
 1234    uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
 
 // Saturating narrow uint16 -> uint8.
 1236    uint8x8_t result64 = vqmovn_u16(result128ab);
 
 // Write the 8 result bytes.
 1238    vst1_u8(target, result64);
 
 
 /// @brief NEON specialization (float -> float) producing 8 vertically filtered elements.
 /// Works on two 4-lane registers (a: elements 0-3, b: elements 4-7). The center row is weighted with
 /// filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are accumulated with vmlaq_f32 for i in [1, filterSize / 2].
 /// Two accumulation variants are visible: a shared-coefficient one (rows added first) and one with individual coefficients.
 /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
 /// NOTE(review): unlike the SSE specializations, no assertions are visible in this fragment.
 /// @param source Center row, 8 floats are read per row.
 /// @param target Receives 8 floats.
 /// @param sourceStrideElements Distance between consecutive source rows, in elements.
 /// @param filter Float kernel coefficients.
 /// @param filterSize Number of coefficients.
 /// @param isSymmetric Flag describing the kernel.
 1242OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 // Index of the center coefficient and number of rows on each side.
 1244    const unsigned int filterSize_2 = filterSize / 2u;
 
 // Broadcast the center coefficient to all four lanes.
 1247    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
 // Center row, elements 0-3.
 1250    float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
 
 1251    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 // Center row, elements 4-7.
 1253    float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
 
 1254    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 // Accumulate the row pairs above and below the center row.
 1257    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1259        const float* sourceMinus = source - sourceStrideElements * i;
 
 1260        const float* sourcePlus = source + sourceStrideElements * i;
 
 // Variant 1: one shared coefficient, rows are added before the multiply-accumulate.
 1265            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1268            source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
 
 1269            source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
 
 1271            result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
 
 1272            result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
 
 // Variant 2: individual coefficients for the upper and lower row.
 1278            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 1279            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1281            float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
 
 1282            float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
 
 1284            float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
 
 1285            float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
 
 1287            result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
 
 1288            result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
 
 1290            result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
 
 1291            result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
 
 // Store both halves: elements 0-3 and elements 4-7.
 1295    vst1q_f32(target + 0, result_32x4a);
 
 1296    vst1q_f32(target + 4, result_32x4b);
 
 
 1301#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
 /// @brief SSE2 specialization (unsigned int -> uint8_t) producing 16 vertically filtered elements.
 /// Works on four 4-lane registers (a: elements 0-3, b: 4-7, c: 8-11, d: 12-15). The center row is weighted with
 /// filter[filterSize / 2]; rows at -/+ i * sourceStrideElements are accumulated for i in [1, filterSize / 2].
 /// Two accumulation variants are visible: a shared-coefficient one (rows added first) and one with individual coefficients.
 /// NOTE(review): the lines choosing between both variants are not visible here; presumably keyed on isSymmetric -- confirm.
 /// NOTE(review): _mm_cvtepi32_ps treats lanes as signed 32-bit, while source is unsigned int; assumes values below 2^31 -- confirm.
 /// @param source Center row, 16 elements are read per row.
 /// @param target Receives 16 bytes.
 /// @param sourceStrideElements Distance between consecutive source rows, in elements.
 /// @param filter Float kernel coefficients.
 /// @param filterSize Number of coefficients, asserted odd and >= 1.
 /// @param isSymmetric Flag describing the kernel.
 1304OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1306    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1307    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 // Index of the center coefficient and number of rows on each side.
 1348    const unsigned int filterSize_2 = filterSize / 2u;
 
 1350    const __m128i* sourceBlock = (
const __m128i*)source;
 
 // Broadcast the center coefficient to all four lanes.
 1353    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
 // Center row, four registers covering elements 0-15.
 1356    __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1357    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 1360    __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1361    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 1363    __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
 
 1364    __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
 
 1366    __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
 
 1367    __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
 
 // Accumulate the row pairs above and below the center row.
 1370    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1372        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
 1373        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
 // Variant 1: one shared coefficient, rows are added as integers before the conversion.
 1378            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1382            __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
 
 1383            __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
 
 1385            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 1386            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
 // The temporaries source128ai/source128bi are reused for registers c and d (elements 8-15).
 1388            source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 2), _mm_loadu_si128(sourcePlus + 2));
 
 1389            source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 3), _mm_loadu_si128(sourcePlus + 3));
 
 1391            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 1392            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
 // Variant 2: individual coefficients for the upper and lower row.
 1397            __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1398            __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1400            __m128i source128aiMinus = _mm_loadu_si128(sourceMinus + 0);
 
 1401            __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
 
 1403            __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
 
 1404            __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
 
 1406            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
 
 1407            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
 
 1409            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
 
 1410            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
 
 1412            __m128i source128ciMinus = _mm_loadu_si128(sourceMinus + 2);
 
 1413            __m128i source128ciPlus = _mm_loadu_si128(sourcePlus + 2);
 
 1415            __m128i source128diMinus = _mm_loadu_si128(sourceMinus + 3);
 
 1416            __m128i source128diPlus = _mm_loadu_si128(sourcePlus + 3);
 
 1418            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
 
 1419            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
 
 1421            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
 
 1422            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
 
 // float -> int32 (current rounding mode), then saturating packs: 4 x int32x4 -> 2 x int16x8 -> 16 x uint8.
 1427    __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
 
 1428    __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
 
 1429    __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
 
 // Unaligned store of all 16 result bytes.
 1431    _mm_storeu_si128((__m128i*)target, result128);
 
 
 // SSE2 specialization (float source, float target): vertical filter response for 16 consecutive elements.
 // Each output is the weighted sum of the element in the center row and of the elements in the
 // filterSize / 2 rows above and below it (rows are addressed as source -/+ sourceStrideElements * i).
 //   source               - first of the 16 elements in the center row; the rows above/below are read as well
 //   target               - receives the 16 float results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   filter               - filterSize factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors, asserted to be odd and >= 1
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1435OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1437    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1438    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
         // index of the center factor, and number of rows used on each side of the center row
 1473    const unsigned int filterSize_2 = filterSize / 2u;
 
 1475    const __m128i* sourceBlock = (
const __m128i*)source;
 
         // center factor, broadcast to all four lanes
 1478    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
         // center row: four unaligned 128-bit loads (4 floats each, bits reinterpreted, no conversion),
         // each multiplied by the center factor to initialize the four accumulators a..d
 1481    __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1482    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 1485    __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1486    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 1488    __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
 
 1489    __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
 
 1491    __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
 
 1492    __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
 
         // remaining factors: i rows above (sourceMinus) and i rows below (sourcePlus) the center row
 1495    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1497        const __m128i* sourceMinus = (
const __m128i*)(source - sourceStrideElements * i);
 
 1498        const __m128i* sourcePlus = (
const __m128i*)(source + sourceStrideElements * i);
 
             // Variant 1: the upper and lower row are added first and multiplied once by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1503            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1507            source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
 
 1508            source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
 
 1510            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
 
 1511            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
 
 1513            source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2)));
 
 1514            source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3)));
 
 1516            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
 
 1517            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1522            __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1523            __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
             // contribution of the upper row to all four accumulators
 1525            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
 
 1526            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
 
 1528            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
 
 1529            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
 
 1531            source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2));
 
 1532            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
 
 1534            source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3));
 
 1535            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
 
             // contribution of the lower row to all four accumulators
 1537            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
 
 1538            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
 
 1540            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
 
 1541            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
 
 1543            source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2));
 
 1544            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
 
 1546            source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3));
 
 1547            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
 
         // hand the four accumulators to writeSIMD for target[0..15]
         // (writeSIMD is defined elsewhere; presumably a plain store of four floats - confirm)
 1551    writeSIMD<float, PI_SSE_2>(result_32x4a, target +  0);
 
 1552    writeSIMD<float, PI_SSE_2>(result_32x4b, target +  4);
 
 1553    writeSIMD<float, PI_SSE_2>(result_32x4c, target +  8);
 
 1554    writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
 
 
 1559#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 // NEON specialization (unsigned int source, uint8_t target): vertical filter response for 16 consecutive elements.
 // The 32-bit unsigned source values are converted to float, weighted with the float factors, accumulated,
 // converted back to unsigned integers and narrowed with saturation to 16 bytes.
 //   source               - first of the 16 elements in the center row; the rows above/below are read as well
 //   target               - receives the 16 uint8_t results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors; not asserted here, presumably odd as in the sibling specializations
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1562OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
         // index of the center factor, and number of rows used on each side of the center row
 1564    const unsigned int filterSize_2 = filterSize / 2u;
 
         // center factor, broadcast to all four lanes
 1567    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // center row: load 4 x uint32, convert to float, multiply by the center factor (accumulators a..d)
 1570    float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
 
 1571    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 1573    float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
 
 1574    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 1576    float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
 
 1577    float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
 
 1579    float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
 
 1580    float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
 
         // remaining factors: i rows above (sourceMinus) and i rows below (sourcePlus) the center row
 1583    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1585        const unsigned int* sourceMinus = source - sourceStrideElements * i;
 
 1586        const unsigned int* sourcePlus = source + sourceStrideElements * i;
 
             // Variant 1: upper and lower row are added as 32-bit integers (vaddq_u32 wraps on overflow),
             // converted to float once and multiplied by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1591            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1594            uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
 
 1595            uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
 
 1597            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 1598            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
             // the two temporaries are reused for elements 8..15 (accumulators c and d)
 1600            source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
 
 1601            source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
 
 1603            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 1604            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1610            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 1611            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
             // elements 0..7
 1613            uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
 
 1614            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
 
 1616            uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
 
 1617            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
 
 1619            uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
 
 1620            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
 
 1622            uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
 
 1623            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
 
             // elements 8..15
 1625            uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
 
 1626            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
 
 1628            uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
 
 1629            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
 
 1631            uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
 
 1632            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
 
 1634            uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
 
 1635            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
 
         // float -> uint32 (vcvtq_u32_f32 rounds toward zero), then saturating narrowing 32 -> 16 -> 8 bit
 1640    uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
 
 1641    uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
 
 1643    uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
 
         // store all 16 result bytes
 1645    vst1q_u8(target, result128);
 
 
 // NEON specialization (float source, float target): vertical filter response for 16 consecutive elements.
 // Each output is the weighted sum of the element in the center row and of the elements in the
 // filterSize / 2 rows above and below it (rows are addressed as source -/+ sourceStrideElements * i).
 //   source               - first of the 16 elements in the center row; the rows above/below are read as well
 //   target               - receives the 16 float results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors; not asserted here, presumably odd as in the sibling specializations
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1649    OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
         // index of the center factor, and number of rows used on each side of the center row
 1651    const unsigned int filterSize_2 = filterSize / 2u;
 
         // center factor, broadcast to all four lanes
 1654    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // center row: four loads of 4 floats, each multiplied by the center factor (accumulators a..d)
 1657    float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
 
 1658    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 1660    float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
 
 1661    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 1663    float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
 
 1664    float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
 
 1666    float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
 
 1667    float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
 
         // remaining factors: i rows above (sourceMinus) and i rows below (sourcePlus) the center row
 1670    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
 1672        const float* sourceMinus = source - sourceStrideElements * i;
 
 1673        const float* sourcePlus = source + sourceStrideElements * i;
 
             // Variant 1: the upper and lower row are added first and multiplied once by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1678            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1681            source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
 
 1682            source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
 
 1684            result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
 
 1685            result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
 
 1687            source_32x4c = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
 
 1688            source_32x4d = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
 
 1690            result_32x4c = vmlaq_f32(result_32x4c, source_32x4c, filterFactor_32x4);
 
 1691            result_32x4d = vmlaq_f32(result_32x4d, source_32x4d, filterFactor_32x4);
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1697            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 1698            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
             // elements 0..7
 1700            float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
 
 1701            float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
 
 1703            float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
 
 1704            float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
 
 1706            result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
 
 1707            result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
 
 1709            result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
 
 1710            result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
 
             // elements 8..15: the a/b temporaries are reused and feed accumulators c and d
 1712            source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
 
 1713            source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
 
 1715            source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
 
 1716            source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
 
 1718            result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
 
 1719            result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
 
 1721            result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
 
 1722            result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
 
         // store the 16 float results
 1726    vst1q_f32(target +  0, result_32x4a);
 
 1727    vst1q_f32(target +  4, result_32x4b);
 
 1728    vst1q_f32(target +  8, result_32x4c);
 
 1729    vst1q_f32(target + 12, result_32x4d);
 
 
 1734#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
 // SSE2 specialization (unsigned int source, uint8_t target): vertical filter response for 8 consecutive
 // elements of a row lying within filterSize / 2 rows of the top or bottom of the frame (asserted below).
 // The 32-bit source values are converted to float, weighted with the float factors, accumulated,
 // converted back to integers and packed with saturation to 8 bytes.
 //   source               - first of the 8 elements in the current row
 //   target               - receives the 8 uint8_t results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   height               - number of rows; used here only by the border assertion
 //   row                  - index of the current row; used here only by the border assertion
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors, asserted to be odd and >= 1
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1737OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1739    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1740    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
         // index of the center factor, and number of rows used on each side of the current row
 1761    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is meant for border rows only: the filter must reach beyond the first or last row
 1764    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
 1766    const __m128i* sourceBlock = (
const __m128i*)source;
 
         // center factor, broadcast to all four lanes
 1769    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
         // current row: unaligned loads of 4 x 32 bit, converted to float (_mm_cvtepi32_ps interprets the
         // lanes as signed 32-bit integers), multiplied by the center factor (accumulators a and b)
 1772    __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1773    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 1776    __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1777    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 1779    __m128i source128ai, source128bi;
 
         // remaining factors, one upper/lower row pair per iteration
 1782    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
             // NOTE(review): offsetMinus / offsetPlus are defined on lines that are not part of this listing;
             // presumably signed row offsets (relative to `row`) of the border-adjusted upper and lower rows - confirm.
 1789        const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 1790        const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
             // Variant 1: upper and lower row are added as 32-bit integers, converted to float once and
             // multiplied by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1795            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1799            source128ai = _mm_add_epi32(_mm_loadu_si128((
const __m128i*)sourceMinus + 0), _mm_loadu_si128((
const __m128i*)sourcePlus + 0));
 
 1800            source128bi = _mm_add_epi32(_mm_loadu_si128((
const __m128i*)sourceMinus + 1), _mm_loadu_si128((
const __m128i*)sourcePlus + 1));
 
 1802            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 1803            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1809            __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1810            __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1814            source128ai = _mm_loadu_si128((
const __m128i*)sourceMinus + 0);
 
 1815            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Minus));
 
 1817            source128bi = _mm_loadu_si128((
const __m128i*)sourceMinus + 1);
 
 1818            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Minus));
 
 1820            source128ai = _mm_loadu_si128((
const __m128i*)sourcePlus + 0);
 
 1821            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Plus));
 
 1823            source128bi = _mm_loadu_si128((
const __m128i*)sourcePlus + 1);
 
 1824            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Plus));
 
         // float -> int32 (_mm_cvtps_epi32 uses the current rounding mode), then signed saturation to 16 bit
 1829    __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
 
         // unsigned saturation to 8 bit; the eight results end up in the low 64 bits
 1830    result128 = _mm_packus_epi16(result128, result128);
 
         // store the low 8 bytes only
 1832    _mm_storel_epi64((__m128i*)target, result128);
 
 
 // SSE2 specialization (float source, float target): vertical filter response for 8 consecutive elements
 // of a row lying within filterSize / 2 rows of the top or bottom of the frame (asserted below).
 //   source               - first of the 8 elements in the current row
 //   target               - receives the 8 float results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   height               - number of rows; used here only by the border assertion
 //   row                  - index of the current row; used here only by the border assertion
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors, asserted to be odd and >= 1
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1836OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1838    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1839    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
         // index of the center factor, and number of rows used on each side of the current row
 1854    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is meant for border rows only: the filter must reach beyond the first or last row
 1857    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
 1859    const __m128i* sourceBlock = (
const __m128i*)source;
 
         // center factor, broadcast to all four lanes
 1862    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
         // current row: two unaligned 128-bit loads (4 floats each, bits reinterpreted, no conversion),
         // each multiplied by the center factor (accumulators a and b)
 1865    __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
 
 1866    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 1869    __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
 
 1870    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
         // remaining factors, one upper/lower row pair per iteration
 1873    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
             // NOTE(review): offsetMinus / offsetPlus are defined on lines that are not part of this listing;
             // presumably signed row offsets (relative to `row`) of the border-adjusted upper and lower rows - confirm.
 1880        const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 1881        const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
             // Variant 1: the upper and lower row are added first and multiplied once by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1886            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1890            source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0)));
 
 1891            source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1)));
 
 1893            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
 
 1894            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1899            __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 1900            __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 1902            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0));
 
 1903            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
 
 1905            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1));
 
 1906            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
 
 1908            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0));
 
 1909            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
 
 1911            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1));
 
 1912            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
 
         // hand the two accumulators to writeSIMD for target[0..7]
         // (writeSIMD is defined elsewhere; presumably a plain store of four floats - confirm)
 1916    writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
 
 1917    writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
 
 
 1922#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 // NEON specialization (unsigned int source, uint8_t target): vertical filter response for 8 consecutive
 // elements of a row lying within filterSize / 2 rows of the top or bottom of the frame (asserted below).
 // The 32-bit unsigned source values are converted to float, weighted with the float factors, accumulated,
 // converted back to unsigned integers and narrowed with saturation to 8 bytes.
 //   source               - first of the 8 elements in the current row
 //   target               - receives the 8 uint8_t results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   height               - number of rows; used here only by the border assertion
 //   row                  - index of the current row; used here only by the border assertion
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors, asserted to be odd
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1925OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 1927    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 1928    ocean_assert(filterSize % 2u == 1u);
 
         // index of the center factor, and number of rows used on each side of the current row
 1930    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is meant for border rows only: the filter must reach beyond the first or last row
 1933    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
         // center factor, broadcast to all four lanes
 1936    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // current row: load 4 x uint32, convert to float, multiply by the center factor (accumulators a and b)
 1939    float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
 
 1940    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 1942    float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
 
 1943    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
         // remaining factors, one upper/lower row pair per iteration
 1946    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
             // NOTE(review): offsetMinus / offsetPlus are defined on lines that are not part of this listing;
             // presumably signed row offsets (relative to `row`) of the border-adjusted upper and lower rows - confirm.
 1953        const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 1954        const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
             // Variant 1: upper and lower row are added as 32-bit integers (vaddq_u32 wraps on overflow),
             // converted to float once and multiplied by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 1959            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1963            uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
 
 1964            uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
 
 1966            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 1967            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 1973            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 1974            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 1976            uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
 
 1977            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
 
 1979            uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
 
 1980            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
 
 1982            uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
 
 1983            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
 
 1985            uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
 
 1986            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
 
         // float -> uint32 (vcvtq_u32_f32 rounds toward zero), then saturating narrowing 32 -> 16 bit
 1991    uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
 
         // saturating narrowing 16 -> 8 bit
 1993    uint8x8_t result64 = vqmovn_u16(result128ab);
 
         // store the 8 result bytes
 1995    vst1_u8(target, result64);
 
 
 // NEON specialization (float source, float target): vertical filter response for 8 consecutive elements
 // of a row lying within filterSize / 2 rows of the top or bottom of the frame (asserted below).
 //   source               - first of the 8 elements in the current row
 //   target               - receives the 8 float results
 //   sourceStrideElements - distance between two consecutive rows, in elements
 //   height               - number of rows; used here only by the border assertion
 //   row                  - index of the current row; used here only by the border assertion
 //   filter               - float factors, the center factor is filter[filterSize / 2]
 //   filterSize           - number of factors, asserted to be odd
 //   isSymmetric          - not referenced by any line visible here; presumably selects between the two
 //                          accumulation variants inside the loop - TODO confirm against the full source
 1999OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 2001    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 2002    ocean_assert(filterSize % 2u == 1u);
 
         // index of the center factor, and number of rows used on each side of the current row
 2004    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is meant for border rows only: the filter must reach beyond the first or last row
 2007    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
         // center factor, broadcast to all four lanes
 2010    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // current row: two loads of 4 floats, each multiplied by the center factor (accumulators a and b)
 2013    float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
 
 2014    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 2016    float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
 
 2017    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
         // remaining factors, one upper/lower row pair per iteration
 2020    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
             // NOTE(review): offsetMinus / offsetPlus are defined on lines that are not part of this listing;
             // presumably signed row offsets (relative to `row`) of the border-adjusted upper and lower rows - confirm.
 2027        const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 2028        const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
             // Variant 1: the upper and lower row are added first and multiplied once by filter[filterSize_2 + i].
             // NOTE(review): the statement selecting between variant 1 and variant 2 is not part of this
             // listing - presumably `if (isSymmetric)` / `else`; confirm.
 2033            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 2037            source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
 
 2038            source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
 
 2040            result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
 
 2041            result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
 
             // Variant 2: the upper row uses filter[filterSize_2 - i], the lower row uses filter[filterSize_2 + i]
 2047            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 2048            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 2050            float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
 
 2051            float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
 
 2053            float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
 
 2054            float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
 
 2056            result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
 
 2057            result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
 
 2059            result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
 
 2060            result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
 
         // store the 8 float results
 2064    vst1q_f32(target + 0, result_32x4a);
 
 2065    vst1q_f32(target + 4, result_32x4b);
 
 
 2070#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
 // SSE2 specialization (uint8_t source, unsigned int factor): target[k] += source[k] * filterFactor for k = 0..3.
 //   source       - four consecutive bytes, read through a single 32-bit load
 //   filterFactor - the factor applied to all four elements
 //   target       - four 32-bit accumulators, updated in place
 // NOTE(review): the product is formed by _mm_madd_epi16, which multiplies signed 16-bit lanes; only the low
 // 16 bits of filterFactor take part, so the result is only correct for factors up to 0x7FFF. No assertion
 // guards this in the visible lines.
 2073OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* source, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
 
         // the factor in every 32-bit lane
 2088    __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
 
         // the same four source bytes replicated into every 32-bit lane
 2091    __m128i source128 = _mm_set1_epi32(*((
const int*)source));
 
         // bytes -> 16 bit (zero-extended): lanes hold s0 s1 s2 s3 s0 s1 s2 s3
 2094    source128 = _mm_unpacklo_epi8(source128, _mm_setzero_si128());
 
         // 16 bit -> 32 bit (zero-extended): the upper half holds the same s0..s3 as the lower half
 2097    source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
 
         // per 32-bit lane: s * low16(factor) + 0 * high16(factor)
 2100    source128 = _mm_madd_epi16(source128, filterFactor_32x4);
 
         // accumulate
 2103    target = _mm_add_epi32(target, source128);
 
 
 // SSE2 specialization (float source, float factor): target_32x4[k] += source[k] * filterFactor for k = 0..3.
 //   source       - four consecutive floats, read with an unaligned load
 //   filterFactor - the factor applied to all four elements
 //   target_32x4  - four float accumulators, updated in place
 2107OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* source, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
 
         // the factor in every lane
 2123    __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
 
         // unaligned 128-bit load; the bits are reinterpreted as four floats (no conversion)
 2126    __m128 source_32x4 = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source));
 
         // weight the four elements
 2129    source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
 
         // accumulate
 2132    target_32x4 = _mm_add_ps(target_32x4, source_32x4);
 
 
 // SSE2 specialization (uint8_t source, unsigned int factor):
 // target[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor for k = 0..3.
 //   sourceLeft   - four consecutive bytes, read through a single 32-bit load
 //   sourceRight  - four consecutive bytes, read through a single 32-bit load
 //   filterFactor - the factor shared by both sides
 //   target       - four 32-bit accumulators, updated in place
 // NOTE(review): the product is formed by _mm_madd_epi16, which multiplies signed 16-bit lanes; only the low
 // 16 bits of filterFactor take part, so the result is only correct for factors up to 0x7FFF. No assertion
 // guards this in the visible lines.
 2136OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* sourceLeft, 
const uint8_t* sourceRight, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
 
         // the factor in every 32-bit lane
 2151    __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
 
         // both sides: 4 bytes replicated, zero-extended to 16 bit, then added (the sum is at most 510, so it fits)
 2154    __m128i source128 = _mm_add_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(*((
const int*)sourceLeft)), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_set1_epi32(*((
const int*)sourceRight)), _mm_setzero_si128()));
 
         // 16 bit -> 32 bit (zero-extended): the upper half holds the same four sums as the lower half
 2157    source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
 
         // per 32-bit lane: sum * low16(factor) + 0 * high16(factor)
 2160    source128 = _mm_madd_epi16(source128, filterFactor_32x4);
 
         // accumulate
 2163    target = _mm_add_epi32(target, source128);
 
 
 // SSE2 specialization (float source, float factor):
 // target_32x4[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor for k = 0..3.
 //   sourceLeft   - four consecutive floats, read with an unaligned load
 //   sourceRight  - four consecutive floats, read with an unaligned load
 //   filterFactor - the factor shared by both sides
 //   target_32x4  - four float accumulators, updated in place
 2167OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* sourceLeft, 
const float* sourceRight, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
 
         // the factor in every lane
 2183    __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
 
         // left + right, element-wise (both loaded unaligned, bits reinterpreted as floats)
 2186    __m128 source_32x4 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight)));
 
         // weight the four sums
 2189    source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
 
         // accumulate
 2192    target_32x4 = _mm_add_ps(target_32x4, source_32x4);
 
 
 // SSE2 specialization (uint8_t source, unsigned int factor): accumulates source[k] * filterFactor for
 // k = 0..7; elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 //   source       - eight consecutive bytes, read with one 64-bit load
 //   filterFactor - the factor applied to all eight elements
 //   target_32x4a - 32-bit accumulators for elements 0..3, updated in place
 //   target_32x4b - 32-bit accumulators for elements 4..7, updated in place
 // NOTE(review): the product is formed by _mm_madd_epi16, which multiplies signed 16-bit lanes; only the low
 // 16 bits of filterFactor take part, so the result is only correct for factors up to 0x7FFF. No assertion
 // guards this in the visible lines.
 2196OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* source, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
 
         // the factor in every 32-bit lane
 2213    __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
 
         // eight source bytes in the low 64 bits, upper 64 bits zero
 2216    __m128i source_32x4a = _mm_loadl_epi64((
const __m128i*)source);
 
         // bytes -> 16 bit (zero-extended)
 2219    source_32x4a = _mm_unpacklo_epi8(source_32x4a, _mm_setzero_si128());
 
         // 16 bit -> 32 bit (zero-extended): upper four elements first, because the next line overwrites source_32x4a
 2222    __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
 
 2223    source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
 
         // per 32-bit lane: s * low16(factor) + 0 * high16(factor)
 2226    source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
 
 2227    source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
 
         // accumulate
 2230    target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
 
 2231    target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
 
 
 // SSE2 specialization (float source, float factor): accumulates source[k] * filterFactor for k = 0..7;
 // elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 //   source       - eight consecutive floats, read with two unaligned loads
 //   filterFactor - the factor applied to all eight elements
 //   target_32x4a - float accumulators for elements 0..3, updated in place
 //   target_32x4b - float accumulators for elements 4..7, updated in place
 2235OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* source, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
 
         // the factor in every lane
 2251    __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
 
         // two unaligned 128-bit loads; the bits are reinterpreted as floats (no conversion)
 2254    __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source + 0));
 
 2255    __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)source + 1));
 
         // weight the eight elements
 2258    source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 2259    source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
         // accumulate
 2262    target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
 
 2263    target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
 
 
 // SSE2 specialization (uint8_t source, unsigned int factor): accumulates
 // (sourceLeft[k] + sourceRight[k]) * filterFactor for k = 0..7; elements 0..3 go to target_32x4a,
 // elements 4..7 to target_32x4b.
 //   sourceLeft   - eight consecutive bytes, read with one 64-bit load
 //   sourceRight  - eight consecutive bytes, read with one 64-bit load
 //   filterFactor - the factor shared by both sides
 //   target_32x4a - 32-bit accumulators for elements 0..3, updated in place
 //   target_32x4b - 32-bit accumulators for elements 4..7, updated in place
 // NOTE(review): the product is formed by _mm_madd_epi16, which multiplies signed 16-bit lanes; only the low
 // 16 bits of filterFactor take part, so the result is only correct for factors up to 0x7FFF. No assertion
 // guards this in the visible lines.
 2267OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(
const uint8_t* sourceLeft, 
const uint8_t* sourceRight, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
 
         // the factor in every 32-bit lane
 2284    __m128i filterFactor_32x4 = _mm_set1_epi32(
int(filterFactor));
 
         // both sides: 8 bytes zero-extended to 16 bit, then added (the sum is at most 510, so it fits)
 2287    __m128i source_32x4a = _mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((
const __m128i*)sourceLeft), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_loadl_epi64((
const __m128i*)sourceRight), _mm_setzero_si128()));
 
         // 16 bit -> 32 bit (zero-extended): upper four sums first, because the next line overwrites source_32x4a
 2290    __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
 
 2291    source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
 
         // per 32-bit lane: sum * low16(factor) + 0 * high16(factor)
 2294    source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
 
 2295    source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
 
         // accumulate
 2298    target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
 
 2299    target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
 
 
 // SSE2 specialization (float source, float factor): accumulates
 // (sourceLeft[k] + sourceRight[k]) * filterFactor for k = 0..7; elements 0..3 go to target_32x4a,
 // elements 4..7 to target_32x4b.
 //   sourceLeft   - eight consecutive floats, read with two unaligned loads
 //   sourceRight  - eight consecutive floats, read with two unaligned loads
 //   filterFactor - the factor shared by both sides
 //   target_32x4a - float accumulators for elements 0..3, updated in place
 //   target_32x4b - float accumulators for elements 4..7, updated in place
 2303OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(
const float* sourceLeft, 
const float* sourceRight, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
 
         // the factor in every lane
 2319    __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
 
         // left + right, element-wise (all loads unaligned, bits reinterpreted as floats)
 2322    __m128 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight + 0)));
 
 2323    __m128 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceLeft + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceRight + 1)));
 
         // weight the eight sums
 2326    source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 2327    source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
         // accumulate
 2330    target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
 
 2331    target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
 
 
 2336#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 // NEON specialization (uint8_t source, unsigned int factor): target_32x4[k] += source[k] * filterFactor for k = 0..3.
 //   source       - four consecutive bytes
 //   filterFactor - the factor applied to all four elements; asserted to fit into 16 bits
 //   target_32x4  - four 32-bit accumulators, updated in place
 2339OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* source, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
 
         // the factor is narrowed to 16 bits below, so it must not exceed 0xFFFF
 2341    ocean_assert(filterFactor <= 0xFFFFu);
 
         // the factor in each of the four 16-bit lanes
 2344    const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
 
 2346#if defined(__aarch64__) 
         // 64-bit ARM: read the four bytes through one 32-bit load, duplicate them and widen to 16 bit
         // (lanes hold s0 s1 s2 s3 s0 s1 s2 s3)
 2349    const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)source))));
 
         // NOTE(review): the matching #else / #endif lines are not part of this listing. The statements below
         // assemble the same 32-bit value byte by byte and presumably form the non-aarch64 branch - confirm.
 2353    uint32_t sourceValue;
 
 2354    ((uint8_t*)&sourceValue)[0] = source[0];
 
 2355    ((uint8_t*)&sourceValue)[1] = source[1];
 
 2356    ((uint8_t*)&sourceValue)[2] = source[2];
 
 2357    ((uint8_t*)&sourceValue)[3] = source[3];
 
 2359    const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValue)));
 
         // widening multiply-accumulate: 16 bit x 16 bit added into the 32-bit accumulators (low four lanes hold s0..s3)
 2364    target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
 
 
 // NEON specialization for float source / float filter values.
 // Accumulates one filter factor for 4 consecutive elements:
 //   target_32x4[k] += source[k] * filterFactor, k = 0..3
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2368OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* source, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
 
         // broadcast the filter factor to all four float lanes
 2371    const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
 
         // load four consecutive floats
 2374    const float32x4_t source128 = vld1q_f32(source);
 
         // multiply-accumulate: target += source * factor
 2377    target_32x4 = vmlaq_f32(target_32x4, source128, filterFactor_32x4);
 
 
 // NEON specialization for uint8_t source / unsigned int filter values.
 // Accumulates one symmetric filter factor for 4 consecutive elements:
 //   target_32x4[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor, k = 0..3
 // NOTE(review): this listing omits the braces and the `#else` / `#endif` lines; the two
 // definitions of `source16_8` below are presumably the two alternatives of the `#if`.
 2381OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* sourceLeft, 
const uint8_t* sourceRight, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
 
         // the factor is narrowed to 16 bit below, so it must fit into 16 bit
 2383    ocean_assert(filterFactor <= 0xFFFFu);
 
         // broadcast the 16 bit filter factor to four lanes
 2386    const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
 
 2388#if defined(__aarch64__) 
         // one 32 bit load per side through a casted pointer; vaddl_u8 adds both sides while widening 8 bit -> 16 bit
 2391    const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)sourceLeft))), vreinterpret_u8_u32(vdup_n_u32(*((
const uint32_t*)sourceRight))));
 
         // presumably the non-aarch64 alternative: both 32 bit values are assembled byte by byte
 2395    uint32_t sourceValueLeft;
 
 2396    ((uint8_t*)&sourceValueLeft)[0] = sourceLeft[0];
 
 2397    ((uint8_t*)&sourceValueLeft)[1] = sourceLeft[1];
 
 2398    ((uint8_t*)&sourceValueLeft)[2] = sourceLeft[2];
 
 2399    ((uint8_t*)&sourceValueLeft)[3] = sourceLeft[3];
 
 2401    uint32_t sourceValueRight;
 
 2402    ((uint8_t*)&sourceValueRight)[0] = sourceRight[0];
 
 2403    ((uint8_t*)&sourceValueRight)[1] = sourceRight[1];
 
 2404    ((uint8_t*)&sourceValueRight)[2] = sourceRight[2];
 
 2405    ((uint8_t*)&sourceValueRight)[3] = sourceRight[3];
 
 2408    const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValueLeft)), vreinterpret_u8_u32(vdup_n_u32(sourceValueRight)));
 
         // widening multiply-accumulate (16 bit x 16 bit -> 32 bit) using only the lower four summed elements
 2413    target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
 
 
 // NEON specialization for float source / float filter values.
 // Accumulates one symmetric filter factor for 4 consecutive elements:
 //   target_32x4[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor, k = 0..3
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2417OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* sourceLeft, 
const float* sourceRight, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
 
         // broadcast the filter factor to all four float lanes
 2420    const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
 
         // sum the two mirrored source locations before multiplying (both share the same factor)
 2423    const float32x4_t source_32x4 = vaddq_f32(vld1q_f32(sourceLeft), vld1q_f32(sourceRight));
 
         // multiply-accumulate: target += (left + right) * factor
 2426    target_32x4 = vmlaq_f32(target_32x4, source_32x4, filterFactor_32x4);
 
 
 // NEON specialization for uint8_t source / unsigned int filter values.
 // Accumulates one filter factor for 8 consecutive elements:
 //   target[k] += source[k] * filterFactor, k = 0..7
 // Elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2430OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* source, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
 
         // the factor is narrowed to 16 bit below, so it must fit into 16 bit
 2432    ocean_assert(filterFactor <= 0xFFFFu);
 
         // broadcast the 16 bit filter factor to four lanes
 2435    const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
 
         // load eight bytes and widen them 8 bit -> 16 bit
 2438    const uint16x8_t source16_8 = vmovl_u8(vld1_u8(source));
 
         // widening multiply-accumulate (16 bit x 16 bit -> 32 bit) for the lower and the upper four elements
 2441    target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
 
 2442    target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
 
 
 // NEON specialization for float source / float filter values.
 // Accumulates one filter factor for 8 consecutive elements:
 //   target[k] += source[k] * filterFactor, k = 0..7
 // Elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2446OCEAN_FORCE_INLINE 
void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* source, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
 
         // broadcast the filter factor to all four float lanes
 2449    const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
 
         // load elements 0..3 and 4..7
 2452    const float32x4_t source_32x4a = vld1q_f32(source + 0);
 
 2453    const float32x4_t source_32x4b = vld1q_f32(source + 4);
 
         // multiply-accumulate: target += source * factor
 2456    target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
 
 2457    target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
 
 
 // NEON specialization for uint8_t source / unsigned int filter values.
 // Accumulates one symmetric filter factor for 8 consecutive elements:
 //   target[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor, k = 0..7
 // Elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2461OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(
const uint8_t* sourceLeft, 
const uint8_t* sourceRight, 
const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
 
         // the factor is narrowed to 16 bit below, so it must fit into 16 bit
 2463    ocean_assert(filterFactor <= 0xFFFFu);
 
         // broadcast the 16 bit filter factor to four lanes
 2466    const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
 
         // load eight bytes per side; vaddl_u8 adds both sides while widening 8 bit -> 16 bit
 2469    const uint16x8_t source16_8 = vaddl_u8(vld1_u8(sourceLeft), vld1_u8(sourceRight));
 
         // widening multiply-accumulate (16 bit x 16 bit -> 32 bit) for the lower and the upper four sums
 2472    target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
 
 2473    target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
 
 
 // NEON specialization for float source / float filter values.
 // Accumulates one symmetric filter factor for 8 consecutive elements:
 //   target[k] += (sourceLeft[k] + sourceRight[k]) * filterFactor, k = 0..7
 // Elements 0..3 go to target_32x4a, elements 4..7 to target_32x4b.
 // NOTE(review): this listing omits the braces (and presumably a leading `template <>`) of the definition.
 2477OCEAN_FORCE_INLINE 
void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(
const float* sourceLeft, 
const float* sourceRight, 
const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
 
         // broadcast the filter factor to all four float lanes
 2480    const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
 
         // sum the two mirrored source locations for elements 0..3 and 4..7
 2483    const float32x4_t source_32x4a = vaddq_f32(vld1q_f32(sourceLeft + 0), vld1q_f32(sourceRight + 0));
 
 2484    const float32x4_t source_32x4b = vaddq_f32(vld1q_f32(sourceLeft + 4), vld1q_f32(sourceRight + 4));
 
         // multiply-accumulate: target += (left + right) * factor
 2487    target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
 
 2488    target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
 
 
 // Applies a 1D horizontal filter to one block of four consecutive elements and stores the
 // four accumulated results to `target` via writeSIMD().
 // NOTE(review): the signature line of this definition is not visible in this listing (the embedded
 // numbering jumps from 2493 to 2522); presumably this is filterHorizontalRowOneBlockWith4Elements().
 // The braces, the declaration of `target_32x4` and the if/else selecting between the two
 // accumulation strategies below (presumably on an `isSymmetric` flag) are not visible either.
 2493template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
 2522    ocean_assert(source != 
nullptr && 
filter != 
nullptr);
 
 2523    ocean_assert(channels >= 1u);
 
         // the filter must have an odd number of factors (one center factor)
 2524    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
         // start with a zeroed accumulator
 2528    setSIMDZero<TFilter, tProcessorInstructions>(target_32x4);
 
         // first strategy: factor n is shared by tap n and its mirrored tap (filterSize - n - 1);
         // taps are `channels` elements apart
 2532        const unsigned int filterSize_2 = filterSize / 2u;
 
 2535        for (
unsigned int n = 0u; n < filterSize_2; ++n)
 
 2537            symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, 
filter[n], target_32x4);
 
         // the center tap has no mirrored partner and is accumulated on its own
 2541        asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, 
filter[filterSize_2], target_32x4);
 
         // second strategy: every tap is accumulated individually with its own factor
 2546        for (
unsigned int n = 0u; n < filterSize; ++n)
 
 2548            asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, 
filter[n], target_32x4);
 
         // hand the accumulator over to the target memory
 2552    writeSIMD<TFilter, tProcessorInstructions>(target_32x4, target);
 
 
 // Applies a 1D horizontal filter to one block of eight consecutive elements and stores the
 // results to `target + 0` and `target + 4` via writeSIMD().
 // NOTE(review): the signature line of this definition is not visible in this listing (the embedded
 // numbering jumps from 2555 to 2589); presumably this is filterHorizontalRowOneBlockWith8Elements().
 // The braces, the declarations of `target_32x4a` / `target_32x4b` and the if/else selecting between
 // the two accumulation strategies below (presumably on an `isSymmetric` flag) are not visible either.
 2555template <
typename TSource, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
 2589    ocean_assert(source != 
nullptr && 
filter != 
nullptr);
 
 2590    ocean_assert(channels >= 1u);
 
         // the filter must have an odd number of factors (one center factor)
 2591    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
         // start with two zeroed accumulators (elements 0..3 and 4..7)
 2595    setSIMDZero<TFilter, tProcessorInstructions>(target_32x4a);
 
 2596    setSIMDZero<TFilter, tProcessorInstructions>(target_32x4b);
 
         // first strategy: factor n is shared by tap n and its mirrored tap (filterSize - n - 1);
         // taps are `channels` elements apart
 2600        const unsigned int filterSize_2 = filterSize / 2u;
 
 2603        for (
unsigned int n = 0u; n < filterSize_2; ++n)
 
 2605            symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, 
filter[n], target_32x4a, target_32x4b);
 
         // the center tap has no mirrored partner and is accumulated on its own
 2609        asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, 
filter[filterSize_2], target_32x4a, target_32x4b);
 
         // second strategy: every tap is accumulated individually with its own factor
 2614        for (
unsigned int n = 0u; n < filterSize; ++n)
 
 2616            asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, 
filter[n], target_32x4a, target_32x4b);
 
         // hand both accumulators over to the target memory
 2620    writeSIMD<TFilter, tProcessorInstructions>(target_32x4a, target + 0);
 
 2621    writeSIMD<TFilter, tProcessorInstructions>(target_32x4b, target + 4);
 
 
 2624#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
 // SSE2 specialization (unsigned int source rows, uint8_t target, float filter factors).
 // Vertically filters 16 consecutive elements of a row for which the filter reaches beyond the
 // first or last frame row (see the assert on `row`). The sums are built in float and finally
 // converted to 8 bit with saturation.
 // NOTE(review): this listing omits the braces, the declarations of `offsetMinus` / `offsetPlus`
 // (the embedded numbering jumps from 2695 to 2702) and the if/else that presumably selects between
 // the two accumulation strategies inside the loop based on `isSymmetric`.
 2627OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 2629    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 2630    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 2668    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is only meant for rows whose filter window crosses the top or bottom frame border
 2671    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
 2673    const __m128i* sourceBlock = (
const __m128i*)source;
 
         // the center row is weighted with the center filter factor
 2676    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
         // 4 x 4 elements of the center row; _mm_cvtepi32_ps interprets the lanes as signed 32 bit integers
         // NOTE(review): assumes the unsigned source values stay below 2^31 - confirm
 2679    __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
 
 2680    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 2683    __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
 
 2684    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 2686    __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
 
 2687    __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
 
 2689    __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
 
 2690    __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
 
 2692    __m128i source128ai, source128bi;
 
         // walk outwards from the center row, one pair of rows (minus/plus) per iteration
 2695    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
         // offsetMinus / offsetPlus are signed row offsets relative to `source`, applied in units of one row stride
 2702        const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 2703        const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
         // first strategy (presumably the symmetric case): both rows share filter[filterSize_2 + i],
         // so the rows are summed as integers first and converted/multiplied only once
 2708            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 2712            source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 0), _mm_loadu_si128((__m128i*)sourcePlus + 0));
 
 2713            source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 1), _mm_loadu_si128((__m128i*)sourcePlus + 1));
 
 2715            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 2716            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
 2718            source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 2), _mm_loadu_si128((__m128i*)sourcePlus + 2));
 
 2719            source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 3), _mm_loadu_si128((__m128i*)sourcePlus + 3));
 
 2721            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
 
 2722            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
 
         // second strategy (presumably the asymmetric case): each row uses its own filter factor
 2727            __m128 filterFactor128Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 2728            __m128 filterFactor128Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 2730            __m128i source128aiMinus = _mm_loadu_si128((__m128i*)sourceMinus + 0);
 
 2731            __m128i source128aiPlus = _mm_loadu_si128((__m128i*)sourcePlus + 0);
 
 2733            __m128i source128biMinus = _mm_loadu_si128((__m128i*)sourceMinus + 1);
 
 2734            __m128i source128biPlus = _mm_loadu_si128((__m128i*)sourcePlus + 1);
 
 2736            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
 
 2737            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
 
 2739            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
 
 2740            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
 
 2742            __m128i source128ciMinus = _mm_loadu_si128((__m128i*)sourceMinus + 2);
 
 2743            __m128i source128ciPlus = _mm_loadu_si128((__m128i*)sourcePlus + 2);
 
 2745            __m128i source128diMinus = _mm_loadu_si128((__m128i*)sourceMinus + 3);
 
 2746            __m128i source128diPlus = _mm_loadu_si128((__m128i*)sourcePlus + 3);
 
 2748            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
 
 2749            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
 
 2751            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
 
 2752            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
 
         // float -> int32 (_mm_cvtps_epi32 rounds with the current MXCSR rounding mode, round-to-nearest by default),
         // then saturating packs: int32 -> int16 (signed) and int16 -> uint8 (unsigned)
 2757    __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
 
 2758    __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
 
 2759    __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
 
         // unaligned store of the 16 filtered bytes
 2761    _mm_storeu_si128((__m128i*)target, result128);
 
 
 // SSE2 specialization (float source rows, float target, float filter factors).
 // Vertically filters 16 consecutive elements of a row for which the filter reaches beyond the
 // first or last frame row (see the assert on `row`) and writes 16 float results to `target`.
 // NOTE(review): this listing omits the braces, the declarations of `offsetMinus` / `offsetPlus`
 // (the embedded numbering jumps from 2825 to 2832) and the if/else that presumably selects between
 // the two accumulation strategies inside the loop based on `isSymmetric`.
 2765OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 2767    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 2768    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 2800    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is only meant for rows whose filter window crosses the top or bottom frame border
 2803    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
 2805    const __m128i* sourceBlock = (
const __m128i*)source;
 
         // the center row is weighted with the center filter factor
 2808    __m128 filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2]);
 
         // 4 x 4 floats of the center row (unaligned integer loads, only re-interpreted as floats)
 2811    __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
 
 2812    __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
 
 2815    __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
 
 2816    __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
 
 2818    __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
 
 2819    __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
 
 2821    __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
 
 2822    __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
 
         // walk outwards from the center row, one pair of rows (minus/plus) per iteration
 2825    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
         // offsetMinus / offsetPlus are signed row offsets relative to `source`, applied in units of one row stride
 2832        const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 2833        const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
         // first strategy (presumably the symmetric case): both rows share filter[filterSize_2 + i],
         // so the rows are summed first and multiplied only once
 2838            filterFactor_32x4 = _mm_set_ps1(
filter[filterSize_2 + i]);
 
 2842            source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0)));
 
 2843            source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1)));
 
 2845            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
 
 2846            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
 
 2848            source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 2)));
 
 2849            source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 3)));
 
 2851            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
 
 2852            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
 
         // second strategy (presumably the asymmetric case): each row uses its own filter factor
 2857            __m128 filterFactor_32x4Minus = _mm_set_ps1(
filter[filterSize_2 - i]);
 
 2858            __m128 filterFactor_32x4Plus = _mm_set_ps1(
filter[filterSize_2 + i]);
 
         // contribution of the minus row
 2860            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 0));
 
 2861            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
 
 2863            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 1));
 
 2864            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
 
 2866            source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 2));
 
 2867            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
 
 2869            source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourceMinus + 3));
 
 2870            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
 
         // contribution of the plus row
 2872            source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 0));
 
 2873            result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
 
 2875            source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 1));
 
 2876            result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
 
 2878            source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 2));
 
 2879            result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
 
 2881            source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((
const __m128i*)sourcePlus + 3));
 
 2882            result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
 
         // hand the four accumulators (16 floats) over to the target memory
 2886    writeSIMD<float, PI_SSE_2>(result_32x4a, target +  0);
 
 2887    writeSIMD<float, PI_SSE_2>(result_32x4b, target +  4);
 
 2888    writeSIMD<float, PI_SSE_2>(result_32x4c, target +  8);
 
 2889    writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
 
 
 2894#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 // NEON specialization (unsigned int source rows, uint8_t target, float filter factors).
 // Vertically filters 16 consecutive elements of a row for which the filter reaches beyond the
 // first or last frame row (see the assert on `row`). The sums are built in float and finally
 // narrowed to 8 bit with saturation.
 // NOTE(review): this listing omits the braces, the declarations of `offsetMinus` / `offsetPlus`
 // (the embedded numbering jumps from 2941 to 2948) and the if/else that presumably selects between
 // the two accumulation strategies inside the loop based on `isSymmetric`.
 2897OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(
const unsigned int* source, uint8_t* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 2899    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 2900    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 2919    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is only meant for rows whose filter window crosses the top or bottom frame border
 2922    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
         // the center row is weighted with the center filter factor
 2925    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // 4 x 4 elements of the center row, converted from unsigned 32 bit integers to float
 2928    float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
 
 2929    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 2931    float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
 
 2932    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 2934    float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
 
 2935    float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
 
 2937    float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
 
 2938    float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
 
         // walk outwards from the center row, one pair of rows (minus/plus) per iteration
 2941    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
         // offsetMinus / offsetPlus are signed row offsets relative to `source`, applied in units of one row stride
 2948        const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 2949        const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
         // first strategy (presumably the symmetric case): both rows share filter[filterSize_2 + i],
         // so the rows are summed as integers first and converted/multiplied only once
 2954            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 2958            uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
 
 2959            uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
 
 2961            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 2962            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
 2964            source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
 
 2965            source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
 
 2967            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
 
 2968            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
 
         // second strategy (presumably the asymmetric case): each row uses its own filter factor
 2974            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 2975            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 2977            uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
 
 2978            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
 
 2980            uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
 
 2981            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
 
 2983            uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
 
 2984            result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
 
 2986            uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
 
 2987            result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
 
 2989            uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
 
 2990            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
 
 2992            uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
 
 2993            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
 
 2995            uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
 
 2996            result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
 
 2998            uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
 
 2999            result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
 
         // float -> uint32 (vcvtq_u32_f32 truncates toward zero), then saturating narrowing 32 bit -> 16 bit -> 8 bit
 3004    uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
 
 3005    uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
 
 3007    uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
 
         // store the 16 filtered bytes
 3009    vst1q_u8(target, result128);
 
 
 // NEON specialization (float source rows, float target, float filter factors).
 // Vertically filters 16 consecutive elements of a row for which the filter reaches beyond the
 // first or last frame row (see the assert on `row`) and stores 16 float results to `target`.
 // NOTE(review): this listing omits the braces, the declarations of `offsetMinus` / `offsetPlus`
 // (the embedded numbering jumps from 3057 to 3064) and the if/else that presumably selects between
 // the two accumulation strategies inside the loop based on `isSymmetric`.
 3013OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(
const float* source, 
float* target, 
const unsigned int sourceStrideElements, 
const unsigned int height, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric)
 
 3015    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 3016    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 3035    const unsigned int filterSize_2 = filterSize / 2u;
 
         // this function is only meant for rows whose filter window crosses the top or bottom frame border
 3038    ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
 
         // the center row is weighted with the center filter factor
 3041    float32x4_t filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2]);
 
         // 4 x 4 floats of the center row
 3044    float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
 
 3045    float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
 
 3047    float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
 
 3048    float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
 
 3050    float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
 
 3051    float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
 
 3053    float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
 
 3054    float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
 
         // walk outwards from the center row, one pair of rows (minus/plus) per iteration
 3057    for (
unsigned int i = 1u; i <= filterSize_2; ++i)
 
         // offsetMinus / offsetPlus are signed row offsets relative to `source`, applied in units of one row stride
 3064        const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
 
 3065        const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
 
         // first strategy (presumably the symmetric case): both rows share filter[filterSize_2 + i],
         // so the rows are summed first and multiplied only once
 3070            filterFactor_32x4 = vdupq_n_f32(
filter[filterSize_2 + i]);
 
 3074            source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
 
 3075            source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
 
 3077            result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
 
 3078            result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
 
         // source_32x4a / source_32x4b are re-used as temporaries for elements 8..15
 3080            source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
 
 3081            source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
 
 3083            result_32x4c = vmlaq_f32(result_32x4c, source_32x4a, filterFactor_32x4);
 
 3084            result_32x4d = vmlaq_f32(result_32x4d, source_32x4b, filterFactor_32x4);
 
         // second strategy (presumably the asymmetric case): each row uses its own filter factor
 3090            float32x4_t filterFactor128Minus = vdupq_n_f32(
filter[filterSize_2 - i]);
 
 3091            float32x4_t filterFactor128Plus = vdupq_n_f32(
filter[filterSize_2 + i]);
 
         // elements 0..7 of the minus and plus row
 3093            float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
 
 3094            float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
 
 3096            float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
 
 3097            float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
 
 3099            result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
 
 3100            result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
 
 3102            result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
 
 3103            result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
 
         // elements 8..15 of the minus and plus row (the temporaries are re-used)
 3105            source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
 
 3106            source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
 
 3108            source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
 
 3109            source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
 
 3111            result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
 
 3112            result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
 
 3114            result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
 
 3115            result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
 
         // store the 16 filtered floats
 3119    vst1q_f32(target +  0, result_32x4a);
 
 3120    vst1q_f32(target +  4, result_32x4b);
 
 3121    vst1q_f32(target +  8, result_32x4c);
 
 3122    vst1q_f32(target + 12, result_32x4d);
 
 
 // Vertically filters one complete border row by splitting its `width * channels` elements into
 // blocks of 16 elements, then blocks of 8 elements, and finally one overlapping block of 8 elements
 // for the remainder.
 // NOTE(review): this listing omits the braces and the statements advancing `source` / `target`
 // between the block calls (the embedded numbering jumps from 3141 to 3146 and from 3151 to 3156);
 // presumably both pointers are moved forward by the block size after each call - confirm.
 3127template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
 3128OCEAN_FORCE_INLINE 
void FrameFilterSeparable::filterVerticalBorderRow32BitPerChannelFloat(
const TSource* source, TTarget* target, 
const unsigned int width, 
const unsigned height, 
const unsigned int channels, 
const unsigned int row, 
const float* filter, 
const unsigned int filterSize, 
const bool isSymmetric, 
const unsigned int sourcePaddingElements)
 
 3130    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 3131    ocean_assert(channels >= 1u);
 
         // the filter must not be taller than the frame and must have an odd number of factors
 3132    ocean_assert(filterSize <= height);
 
 3133    ocean_assert(filterSize % 2u == 1u);
 
         // number of elements between the start of two consecutive source rows
 3135    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
 3137    unsigned int remainingElements = width * channels;
 
         // as many blocks of 16 elements as possible
 3139    while (remainingElements >= 16u)
 
 3141        filterVerticalBorderRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, 
filter, filterSize, isSymmetric);
 
 3146        remainingElements -= 16u;
 
         // then blocks of 8 elements
 3149    while (remainingElements >= 8u)
 
 3151        filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, 
filter, filterSize, isSymmetric);
 
 3156        remainingElements -= 8u;
 
         // the row must hold at least 8 elements so that the backwards shift below stays inside the row
 3159    ocean_assert(width * channels >= 8u);
 
 3160    ocean_assert(remainingElements < 8u);
 
         // fewer than 8 elements are left: step back by `shift` elements and process one full block of 8 elements,
         // which re-processes `shift` elements in front of the remainder
 3162    if (remainingElements != 0u)
 
 3164        const unsigned int shift = 8u - remainingElements;
 
 3166        filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, height, row, 
filter, filterSize, isSymmetric);
 
 
 // Applies the horizontal 1D filter to the rows [firstRow, firstRow + numberRows) of a frame.
 // Each source row is copied into a temporary row that is extended by filterSize / 2 border
 // pixels on both sides; the extended row is then filtered in blocks of 8 and 4 elements.
 // source / target: first element of the source / target frame
 // width, height: frame dimensions in pixels (height is only used by the range assert)
 // channels: number of interleaved channels, asserted to be in [1, 8]
 // filter, filterSize: the 1D kernel and its number of elements (asserted odd, >= 1)
 // sourcePaddingElements / targetPaddingElements: extra elements at the end of each row
 // firstRow, numberRows: the subset of rows handled by this call
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces and the
 // definitions of `isSymmetric`, `instructions` and `mirroredXX` are not visible here.
 3170template <
typename TSource, 
typename TFilter, const ProcessorInstructions tProcessorInstructions>
 
 3171void FrameFilterSeparable::filterHorizontalSubset(
const TSource* source, TFilter* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const TFilter* filter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows)
 
 3173    ocean_assert(source != 
nullptr && target != 
nullptr && 
filter != 
nullptr);
 
 3174    ocean_assert(width >= filterSize + 1u);
 
 3176    ocean_assert(channels >= 1u && channels <= 8u);
 
 3177    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 3179    ocean_assert_and_suppress_unused(firstRow + numberRows <= height, height);
 
 // row strides in elements (payload plus padding)
 3181    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
 3182    const unsigned int targetStrideElements = width * channels + targetPaddingElements;
 
 // number of kernel taps on each side of the kernel center
 3186    const unsigned int filterSize_2 = filterSize / 2u;
 
 // total number of border pixels added to a row (left plus right)
 3187    const unsigned int extraPixels = filterSize_2 * 2u;
 
 3189    const unsigned int extendedElements = (width + extraPixels) * channels;
 
 // one temporary row (source row plus both borders), reused for every row of the subset
 3191    Memory extendedRowMemory = Memory::create<TSource>(extendedElements);
 
 3192    TSource* 
const extendedRow = extendedRowMemory.
data<TSource>();
 
 3193    ocean_assert(extendedRow != 
nullptr);
 
 // move both pointers to the first row of the subset
 3195    source += firstRow * sourceStrideElements;
 
 3196    target += firstRow * targetStrideElements;
 
 3198    for (
unsigned int rowsProcessed = 0u; rowsProcessed < numberRows; ++rowsProcessed)
 
 // build the extended row: left border, copy of the source row, right border
 3201        fillLeftExtraBorder<TSource>(source, channels, filterSize_2, extendedRow);
 
 3202        memcpy(extendedRow + filterSize_2 * channels, source, width * channels * 
sizeof(TSource));
 
 3203        fillRightExtraBorder<TSource>(source + width * channels, channels, filterSize_2, extendedRow + (width + filterSize_2) * channels);
 
 3205        const TSource* extendedSource = extendedRow;
 
 3207        unsigned int remainingElements = width * channels;
 
 // the block-wise filtering below is only compiled when NEON (>= 10) or SSE (>= 20) is available
 3209#if (defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10) || (defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20) 
 3211#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10 
 3213#elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20 
 // blocks of 8 elements first
 3219        while (remainingElements >= 8u)
 
 3221            filterHorizontalRowOneBlockWith8Elements<TSource, TFilter, instructions>(extendedSource, target, channels, 
filter, filterSize, isSymmetric);
 
 3223            extendedSource += 8;
 
 3226            remainingElements -= 8u;
 
 // then blocks of 4 elements
 3231        while (remainingElements >= 4u)
 
 3233            filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, 
filter, filterSize, isSymmetric);
 
 3235            extendedSource += 4;
 
 3238            remainingElements -= 4u;
 
 // fewer than 4 elements left: step the extended source back by `shift` so that one more
 // 4-element block covers the end of the row
 // NOTE(review): the matching adjustment of `target` is not visible in this listing - confirm
 3243        if (remainingElements != 0u)
 
 3245            const unsigned int shift = 4u - remainingElements;
 
 3247            extendedSource -= shift;
 
 3250            filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, 
filter, filterSize, isSymmetric);
 
 3258        OCEAN_SUPPRESS_UNUSED_WARNING(extendedSource);
 
 3259        OCEAN_SUPPRESS_UNUSED_WARNING(remainingElements);
 
 3260        OCEAN_SUPPRESS_UNUSED_WARNING(isSymmetric);
 
 // intensive-debug verification: recompute every response with a plain float loop and
 // compare it against the value found in the target row
 3264#ifdef OCEAN_INTENSIVE_DEBUG 
 3266            const TFilter* 
const debugTarget = target - width * channels;
 
 3268            for (
unsigned int x = 0u; x < width; ++x)
 
 3270                for (
unsigned int n = 0u; n < channels; ++n)
 
 3272                    float result = 0.0f;
 
 3274                    for (
int xx = -
int(filterSize_2); xx <= int(filterSize_2); ++xx)
 
 3277                        result += float(*(source + mirroredXX * channels + 
int(n))) * 
filter[xx + int(filterSize_2)];
 
 3280                    const TFilter targetValue = debugTarget[x * channels + n];
 
 3282                    if (std::is_same<float, TFilter>::value)
 
 // non-float case: accept either the truncated or the rounded-up reference value
 3288                        const TFilter result8_converted = (TFilter)(result);
 
 3289                        const TFilter result8_rounded = (TFilter)(result + 0.51f);
 
 3290                        ocean_assert(result8_converted == targetValue || result8_rounded == targetValue);
 
 // next row: the source advances by a full stride, the target only by its padding
 // (presumably `target` was already advanced across the row on lines not shown - confirm)
 3297        source += sourceStrideElements;
 
 3298        target += targetPaddingElements;
 
 
 // Applies the vertical 1D (float) filter to the rows [firstRow, firstRow + numberRows) of a frame.
 // The rows are handled in three phases: rows near the top border (row < filterSize / 2),
 // core rows (row < height - filterSize / 2), and rows near the bottom border.
 // source / target: first element of the source / target frame
 // width, height, channels: frame layout; width * channels is asserted to be >= 16
 // filter, filterSize: the float kernel and its number of elements (asserted odd, >= 1)
 // sourcePaddingElements / targetPaddingElements: extra elements at the end of each row
 // firstRow, numberRows: the subset of rows handled by this call
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces, the definitions
 // of `isSymmetric` and `mirroredY`, and the increment of `row` are not visible here.
 3302template <
typename TSource, 
typename TTarget, ProcessorInstructions tProcessorInstructions>
 
 3303void FrameFilterSeparable::filterVerticalSubset(
const TSource* source, TTarget* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const float* filter, 
const unsigned int filterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows)
 
 3305    ocean_assert(source != 
nullptr && target != 
nullptr);
 
 3306    ocean_assert(
filter != 
nullptr);
 
 3307    ocean_assert(height >= filterSize / 2u + 1u);
 
 3308    ocean_assert(channels >= 1u && channels <= 8u);
 
 3310    ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
 
 3312    ocean_assert(firstRow + numberRows <= height);
 
 3313    ocean_assert(width * channels >= 8u * 2u);
 
 // row strides in elements (payload plus padding)
 3315    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
 3316    const unsigned int targetStrideElements = width * channels + targetPaddingElements;
 
 // number of kernel taps on each side of the kernel center
 3320    const unsigned int filterSize_2 = filterSize / 2u;
 
 // the debug checks below need the unshifted start of the source frame
 3322#ifdef OCEAN_INTENSIVE_DEBUG 
 3323    const TSource* 
const debugSource = source;
 
 // move both pointers to the first row of the subset
 3326    source += firstRow * sourceStrideElements;
 
 3327    target += firstRow * targetStrideElements;
 
 3329    unsigned int row = firstRow;
 
 // phase 1: rows closer than filterSize_2 to the top of the frame
 3333    while (row < min(firstRow + numberRows, filterSize_2))
 
 3335        filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, 
filter, filterSize, isSymmetric, sourcePaddingElements);
 
 // intensive-debug verification against a plain float reference loop
 3337#ifdef OCEAN_INTENSIVE_DEBUG 
 3339            for (
unsigned int x = 0u; x < width * channels; ++x)
 
 3341                float result = 0.0f;
 
 3343                for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
 
 3346                    result += float(*(debugSource + mirroredY * 
int(sourceStrideElements) + 
int(x))) * 
filter[y + int(filterSize_2)];
 
 3349                const TTarget targetValue = target[x];
 
 3351                if (std::is_same<float, TTarget>::value)
 
 3363        source += sourceStrideElements;
 
 3364        target += targetStrideElements;
 
 // phase 2: core rows for which all kernel taps lie inside the frame
 3371    while (row < min(firstRow + numberRows, height - filterSize_2))
 
 3373        filterVerticalCoreRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, channels, 
filter, filterSize, isSymmetric, sourcePaddingElements);
 
 // intensive-debug verification; core rows read the source rows row - filterSize_2 .. row + filterSize_2 directly
 3375#ifdef OCEAN_INTENSIVE_DEBUG 
 3377            for (
unsigned int x = 0u; x < width * channels; ++x)
 
 3379                float result = 0.0f;
 
 3381                for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
 
 3382                    result += 
float(*(debugSource + (
int(row) + y) * 
int(sourceStrideElements) + 
int(x))) * 
filter[y + 
int(filterSize_2)];
 
 3384                const TTarget targetValue = target[x];
 
 3386                ocean_assert(result >= 0.0f && result < 256.0f);
 
 3388                if (std::is_same<float, TTarget>::value)
 
 3400        source += sourceStrideElements;
 
 3401        target += targetStrideElements;
 
 // phase 3: remaining rows closer than filterSize_2 to the bottom of the frame
 3408    while (row < firstRow + numberRows)
 
 3410        ocean_assert(row + filterSize_2 >= height);
 
 3412        filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, 
filter, filterSize, isSymmetric, sourcePaddingElements);
 
 // intensive-debug verification against a plain float reference loop
 3414#ifdef OCEAN_INTENSIVE_DEBUG 
 3417            for (
unsigned int x = 0u; x < width * channels; ++x)
 
 3419                float result = 0.0f;
 
 3421                for (
int y = -
int(filterSize_2); y <= int(filterSize_2); ++y)
 
 3424                    result += float(*(debugSource + mirroredY * 
int(sourceStrideElements) + 
int(x))) * 
filter[y + int(filterSize_2)];
 
 3427                const TTarget targetValue = target[x];
 
 3429                ocean_assert(result >= 0.0f && result < 256.0f);
 
 3431                if (std::is_same<float, TTarget>::value)
 
 3443        source += sourceStrideElements;
 
 3444        target += targetStrideElements;
 
 
 // Two-pass separable filtering for a fixed instruction set (tProcessorInstructions):
 // a horizontal pass writes into an intermediate frame with element type TFilter, then a
 // vertical pass with a float kernel writes the final result into `target`.
 // source / target: first element of the source / target frame
 // width, height, channels, sourcePaddingElements, targetPaddingElements: frame layout
 // horizontalFilter / verticalFilter (+ sizes): the two 1D kernels
 // reusableMemory: optional; when given, its filterFactors_ buffer is used for the float kernel
 // worker: optional; both passes are handed to worker->executeFunction() in one visible branch
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces, the `else`
 // branches and the setup of the intermediate frame are not visible here.
 3450template <
typename T, 
typename TFilter, ProcessorInstructions tProcessorInstructions>
 
 3451inline void FrameFilterSeparable::filter(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const TFilter* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const TFilter* verticalFilter, 
const unsigned int verticalFilterSize, 
ReusableMemory* reusableMemory, 
Worker* worker)
 
 // by default the intermediate frame is a local object
 3453    Frame localIntermediateFrame;
 
 3454    Frame* intermediateFrame = &localIntermediateFrame;
 
 // presumably switches to reusableMemory->intermediateFrame_ when available - body not shown, confirm
 3456    if (reusableMemory != 
nullptr)
 
 // horizontal pass, distributed: the trailing `0u, 0u` are placeholders for firstRow/numberRows,
 // presumably replaced by the worker for each sub-range of [0, height) - confirm
 3467        worker->
executeFunction(
Worker::Function::createStatic(&filterHorizontalSubset<T, TFilter, tProcessorInstructions>, source, intermediateFrame->
data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->
paddingElements(), 0u, 0u), 0u, height);
 
 // horizontal pass, direct call covering all rows
 3471        filterHorizontalSubset<T, TFilter, tProcessorInstructions>(source, intermediateFrame->
data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->
paddingElements(), 0u, height);
 
 // the vertical pass always takes a float kernel
 3477    std::vector<float> localFloatFilters;
 
 3478    const float* verticalFloatFilter = 
nullptr;
 
 // float kernels are used as they are
 3480    if (std::is_same<TFilter, float>::value)
 
 3482        verticalFloatFilter = (
const float*)(verticalFilter);
 
 // otherwise the kernel must be `unsigned int` and is converted to a scaled float kernel
 3486        ocean_assert((std::is_same<TFilter, unsigned int>::value));
 
 3488        const TFilter sumHorizontalFilterValues = 
sumFilterValues(horizontalFilter, horizontalFilterSize);
 
 3489        const TFilter sumVerticalFilterValues = 
sumFilterValues(verticalFilter, verticalFilterSize);
 
 // the float kernel divides by the product of both kernel sums, presumably because the integer
 // horizontal pass leaves its responses un-normalized - confirm
 3491        const unsigned int normalizationFactor = (
unsigned int)(sumHorizontalFilterValues) * (
unsigned int)(sumVerticalFilterValues);
 
 3492        ocean_assert(normalizationFactor != 0u);
 
 3494        const float invNormalizationFactor = 1.0f / float(normalizationFactor);
 
 // store the float kernel in the reusable buffer when one is provided, otherwise in the local vector
 3496        std::vector<float>& floatFilterBufferToUse = reusableMemory != 
nullptr ? reusableMemory->
filterFactors_ : localFloatFilters;
 
 3498        floatFilterBufferToUse.resize(verticalFilterSize);
 
 3500        for (
unsigned int n = 0u; n < verticalFilterSize; ++n)
 
 3502            floatFilterBufferToUse[n] = float(verticalFilter[n]) * invNormalizationFactor;
 
 3505        verticalFloatFilter = floatFilterBufferToUse.data();
 
 // vertical pass, distributed (same placeholder convention as for the horizontal pass)
 3510        worker->
executeFunction(
Worker::Function::createStatic(&filterVerticalSubset<TFilter, T, tProcessorInstructions>, intermediateFrame->
constdata<TFilter>(), target, width, height, channels, (
const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->
paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
 
 // vertical pass, direct call covering all rows
 3514        filterVerticalSubset<TFilter, T, tProcessorInstructions>(intermediateFrame->
constdata<TFilter>(), target, width, height, channels, (
const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->
paddingElements(), targetPaddingElements, 0u, height);
 
 
 // Entry point for separable filtering of raw frame memory.
 // Rows with at least 16 elements that are wider than the horizontal kernel are dispatched to
 // the SSE or NEON implementation selected from `processorInstructions`; the remaining code
 // forwards to filterUniversal(), converting `unsigned int` kernels to normalized float kernels first.
 // source / target: first element of the source / target frame
 // width, height, channels, sourcePaddingElements, targetPaddingElements: frame layout
 // horizontalFilter / verticalFilter (+ sizes): the two 1D kernels
 // worker, reusableMemory: optional helpers forwarded to the implementation
 // processorInstructions: instruction sets that may be used
 // Returns a bool; the return statements of several branches are not visible in this listing.
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces and the
 // `case` / `else` / `return` lines of several branches are not visible here.
 3518template <
typename T, 
typename TFilter>
 
 3519bool FrameFilterSeparable::filter(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const TFilter* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const TFilter* verticalFilter, 
const unsigned int verticalFilterSize, 
Worker* worker, 
ReusableMemory* reusableMemory, 
const ProcessorInstructions processorInstructions)
 
 3521    ocean_assert(source != 
nullptr && target != 
nullptr);
 
 3522    ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
 
 3523    ocean_assert(channels >= 1u);
 
 // the same conditions are also checked at runtime (the body of this check is not shown)
 3525    if (source == 
nullptr || target == 
nullptr || width < horizontalFilterSize || height < verticalFilterSize || channels == 0u)
 
 3530    OCEAN_SUPPRESS_UNUSED_WARNING(reusableMemory);
 
 // SIMD path: needs at least 16 elements per row and a row wider than the horizontal kernel
 // (these match the asserts in filterVerticalSubset() and filterHorizontalSubset())
 3532    if (width * channels >= 16u && width >= horizontalFilterSize + 1u)
 
 3534        switch (Processor::bestInstructionGroup<false>(processorInstructions))
 
 3541                OCEAN_APPLY_IF_SSE((filter<T, TFilter, PI_SSE_2>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
 
 3545                OCEAN_APPLY_IF_NEON((filter<T, TFilter, PI_GROUP_NEON>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
 
 3552                ocean_assert(
false && 
"Invalid instructions!");
 
 // generic path: float kernels are passed to filterUniversal() directly
 3556    if constexpr (std::is_same<float, TFilter>::value)
 
 3558        filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, (
const float*)(horizontalFilter), horizontalFilterSize, (
const float*)(verticalFilter), verticalFilterSize, worker);
 
 // `unsigned int` kernels: divide every factor by the kernel's sum to get float kernels
 3563        if constexpr (std::is_same<unsigned int, TFilter>::value)
 
 3565            const TFilter horizontalNormalization = 
sumFilterValues(horizontalFilter, horizontalFilterSize);
 
 3566            ocean_assert(horizontalNormalization != TFilter(0));
 
 3568            std::vector<float> horizontalFloatFilter(horizontalFilterSize);
 
 3569            for (
size_t n = 0; n < horizontalFloatFilter.size(); ++n)
 
 3571                horizontalFloatFilter[n] = float(horizontalFilter[n]) / float(horizontalNormalization);
 
 3574            const TFilter verticalNormalization = 
sumFilterValues(verticalFilter, verticalFilterSize);
 
 3575            ocean_assert(verticalNormalization != TFilter(0));
 
 3577            std::vector<float> verticalFloatFilter(verticalFilterSize);
 
 3578            for (
size_t n = 0; n < verticalFloatFilter.size(); ++n)
 
 3580                verticalFloatFilter[n] = float(verticalFilter[n]) / float(verticalNormalization);
 
 3583            return filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFloatFilter.data(), (
unsigned int)horizontalFloatFilter.size(), verticalFloatFilter.data(), (
unsigned int)verticalFloatFilter.size(), worker);
 
 // reached only for a kernel type that is neither float nor unsigned int
 3587    ocean_assert(
false && 
"Invalid combination of parameters!");
 
 
 // Generic (non-SIMD) separable filtering with float kernels: a horizontal pass writes into an
 // intermediate frame with element type TIntermediate, a vertical pass writes into `target`.
 // source / target: first element of the source / target frame
 // width, height, channels, sourcePaddingElements, targetPaddingElements: frame layout
 // horizontalFilter / verticalFilter (+ sizes): float kernels; sizes must be odd and must not
 //   exceed the frame width / height
 // worker: optional; both passes are handed to worker->executeFunction() in one visible branch
 // Returns a bool; the return statements are not visible in this listing.
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces, the body of the
 // parameter check, and the definitions of `TIntermediate` and `intermediateFrame` are not visible here.
 3591template <
typename T>
 
 3592bool FrameFilterSeparable::filterUniversal(
const T* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const float* horizontalFilter, 
const unsigned int horizontalFilterSize, 
const float* verticalFilter, 
const unsigned int verticalFilterSize, 
Worker* worker)
 
 3594    ocean_assert(source != 
nullptr && target != 
nullptr);
 
 3595    ocean_assert(width >= 1u && height >= 1u);
 
 3596    ocean_assert(channels != 0u);
 
 3598    ocean_assert(horizontalFilter != 
nullptr && verticalFilter != 
nullptr);
 
 3599    ocean_assert(horizontalFilterSize % 2u == 1u);
 
 3600    ocean_assert(verticalFilterSize % 2u == 1u);
 
 // runtime check for null pointers, kernels larger than the frame, and even kernel sizes
 3602    if (source == 
nullptr || target == 
nullptr 
 3603        || verticalFilter == 
nullptr || horizontalFilter == 
nullptr 
 3604        || horizontalFilterSize > width || verticalFilterSize > height
 
 3605        || horizontalFilterSize % 2u != 1u || verticalFilterSize % 2u != 1u)
 
 // distributed execution: the trailing `0u, 0u` are placeholders for firstRow/numberRows,
 // presumably replaced by the worker for each sub-range of [0, height) - confirm
 3616        worker->
executeFunction(
Worker::Function::createStatic(&filterUniversalHorizontalSubset<T, TIntermediate>, source, intermediateFrame.
data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.
paddingElements(), 0u, 0u), 0u, height);
 
 3617        worker->
executeFunction(
Worker::Function::createStatic(&filterUniversalVerticalSubset<T, TIntermediate>, intermediateFrame.
constdata<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.
paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
 
 // direct execution: both passes cover all rows
 3621        filterUniversalHorizontalSubset<T, TIntermediate>(source, intermediateFrame.
data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.
paddingElements(), 0u, height);
 
 3622        filterUniversalVerticalSubset<T, TIntermediate>(intermediateFrame.
data<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.
paddingElements(), targetPaddingElements, 0u, height);
 
 
 // Generic horizontal filter pass for the rows [firstRow, firstRow + numberRows).
 // Each row is handled in three parts: the first filterSize / 2 pixels (left border),
 // the center pixels, and the last filterSize / 2 pixels (right border).
 // source: first element of the source frame; target: first element of the intermediate frame
 // width, channels: row layout; horizontalFilter / horizontalFilterSize: float kernel (odd size, <= width)
 // sourcePaddingElements / targetPaddingElements: extra elements at the end of each row
 // firstRow, numberRows: the subset of rows handled by this call
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces, the bodies of
 // both border loops (including the definition of `response`), and the per-pixel pointer
 // increments are not visible here.
 3628template <
typename T, 
typename TIntermediate>
 
 3629void FrameFilterSeparable::filterUniversalHorizontalSubset(
const T* source, TIntermediate* target, 
const unsigned int width, 
unsigned int channels, 
const float* horizontalFilter, 
unsigned int horizontalFilterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows)
 
 3631    ocean_assert(source != 
nullptr && target != 
nullptr);
 
 3632    ocean_assert(width >= 1u);
 
 3633    ocean_assert(channels != 0u);
 
 3635    ocean_assert(horizontalFilterSize <= 
size_t(width));
 
 3636    ocean_assert(horizontalFilterSize % 2u == 1u);
 
 // row strides in elements (payload plus padding)
 3638    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
 3639    const unsigned int targetStrideElements = width * channels + targetPaddingElements;
 
 3641    const unsigned int filterSize = horizontalFilterSize;
 
 // number of kernel taps on each side of the kernel center
 3642    const unsigned int filterSize_2 = filterSize / 2u;
 
 3643    ocean_assert(filterSize_2 * 2u <= width);
 
 // when TIntermediate is not float, the kernel is converted once into TIntermediate values
 3645    std::vector<TIntermediate> filterCopy;
 
 3647    if (!std::is_same<TIntermediate, float>::value)
 
 3649        filterCopy.resize(horizontalFilterSize);
 
 3650        for (
size_t n = 0; n < filterCopy.size(); ++n)
 
 3652            filterCopy[n] = TIntermediate(horizontalFilter[n]);
 
 // use the converted copy if one was created, otherwise the float kernel itself
 3656    const TIntermediate* 
const filter = filterCopy.empty() ? (
const TIntermediate*)horizontalFilter : filterCopy.data();
 
 // move both pointers to the first row of the subset
 3658    source += firstRow * sourceStrideElements;
 
 3659    target += firstRow * targetStrideElements;
 
 3661    TIntermediate* 
const targetEnd = target + numberRows * targetStrideElements;
 
 // one iteration per row
 3663    while (target != targetEnd)
 
 3665        ocean_assert(target < targetEnd);
 
 // left border pixels (loop body not shown in this listing)
 3669        for (
unsigned int x = 0u; x < filterSize_2; ++x)
 
 3671            for (
unsigned int n = 0u; n < channels; ++n)
 
 3675                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3680                target[n] = response;
 
 // center pixels: `source` points at the left-most kernel tap, taps are `channels` elements apart
 3689        for (
unsigned int x = filterSize_2; x < width - filterSize_2; ++x)
 
 3691            for (
unsigned int n = 0u; n < channels; ++n)
 
 3693                TIntermediate response = TIntermediate(source[channels * 0u + n]) * 
filter[0];
 
 3695                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3697                    response += TIntermediate(source[channels * s + n]) * 
filter[s];
 
 3700                target[n] = response;
 
 // right border pixels (loop body not shown in this listing)
 3709        for (
unsigned int x = 0u; x < filterSize_2; ++x)
 
 3711            for (
unsigned int n = 0u; n < channels; ++n)
 
 3715                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3720                target[n] = response;
 
 // advance to the next row: the source skips 2 * filterSize_2 pixels plus its padding
 // (presumably it advanced only during the center loop - confirm), the target skips its padding
 3727        source += filterSize_2 * 2u * channels + sourcePaddingElements;
 
 3728        target += targetPaddingElements;
 
 
 // Generic vertical filter pass for the rows [firstRow, firstRow + numberRows).
 // The rows are handled in three phases: rows above filterSize / 2 (top border), center rows,
 // and rows from height - filterSize / 2 onwards (bottom border).
 // source: first element of the intermediate frame; target: first element of the target frame
 // width, height, channels: frame layout; verticalFilter / verticalFilterSize: float kernel (odd size, <= height)
 // sourcePaddingElements / targetPaddingElements: extra elements at the end of each row
 // firstRow, numberRows: the subset of rows handled by this call
 // NOTE(review): this listing has gaps (see the embedded line numbers); braces, the bodies of
 // both border loops (including the definition of `response`), the per-pixel pointer
 // increments and the increments of `y` are not visible here.
 3732template <
typename T, 
typename TIntermediate>
 
 3733void FrameFilterSeparable::filterUniversalVerticalSubset(
const TIntermediate* source, T* target, 
const unsigned int width, 
const unsigned int height, 
const unsigned int channels, 
const float* verticalFilter, 
const unsigned int verticalFilterSize, 
const unsigned int sourcePaddingElements, 
const unsigned int targetPaddingElements, 
const unsigned int firstRow, 
const unsigned int numberRows)
 
 3735    ocean_assert(source != 
nullptr && target != 
nullptr);
 
 3736    ocean_assert(width >= 1u && height >= 1u);
 
 3737    ocean_assert(channels != 0u);
 
 3739    ocean_assert(verticalFilterSize <= height);
 
 3740    ocean_assert(verticalFilterSize % 2u == 1u);
 
 // row strides in elements (payload plus padding)
 3742    const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
 
 3743    const unsigned int targetStrideElements = width * channels + targetPaddingElements;
 
 // remember the start of the source frame; the border phases address rows relative to it
 3745    const TIntermediate* 
const sourceStart = source;
 
 3747    const unsigned int filterSize = verticalFilterSize;
 
 // number of kernel taps on each side of the kernel center
 3748    const unsigned int filterSize_2 = filterSize / 2u;
 
 3749    ocean_assert(filterSize_2 * 2u <= height);
 
 // when TIntermediate is not float, the kernel is converted once into TIntermediate values
 3751    std::vector<TIntermediate> filterCopy;
 
 3753    if (!std::is_same<TIntermediate, float>::value)
 
 3755        filterCopy.resize(verticalFilterSize);
 
 3757        for (
size_t n = 0; n < filterCopy.size(); ++n)
 
 3759            filterCopy[n] = TIntermediate(verticalFilter[n]);
 
 // use the converted copy if one was created, otherwise the float kernel itself
 3763    const TIntermediate* 
const filter = filterCopy.empty() ? (
const TIntermediate*)verticalFilter : filterCopy.data();
 
 // the source starts filterSize_2 rows above firstRow (clamped to row 0), i.e. at the top-most kernel tap
 3765    source += max(0, 
int(firstRow) - 
int(filterSize_2)) * sourceStrideElements;
 
 3766    target += firstRow * targetStrideElements;
 
 3768    unsigned int y = firstRow;
 
 // phase 1: top border rows; the source pointer is restored after each row so it stays at the frame start
 3772    while (y < min(filterSize_2, firstRow + numberRows))
 
 3774        ocean_assert(source == sourceStart);
 
 3775        const TIntermediate* sourceCopy = source;
 
 3777        for (
unsigned int x = 0u; x < width; ++x)
 
 3779            for (
unsigned int n = 0u; n < channels; ++n)
 
 3783                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3788                target[n] = T(response);
 
 3795        target += targetPaddingElements;
 
 3798        source = sourceCopy;
 
 // phase 2: number of center rows of this subset (zero if the subset has none)
 3804    const unsigned int centerRows = (
unsigned int)max(0, 
int(min(firstRow + numberRows, height - filterSize_2)) - int(y));
 
 3806    for (
unsigned int yCenter = 0u; yCenter < centerRows; ++yCenter)
 
 3808        for (
unsigned int x = 0u; x < width; ++x)
 
 3810            for (
unsigned int c = 0u; c < channels; ++c)
 
 // `source` points at the top-most kernel tap; successive taps are one source stride apart
 3812                TIntermediate response = TIntermediate(source[channels * 0u + c]) * 
filter[0];
 
 3814                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3816                    response += TIntermediate(source[sourceStrideElements * s + c]) * 
filter[s];
 
 3819                target[c] = T(response);
 
 // skip the row padding of both frames
 3826        source += sourcePaddingElements;
 
 3827        target += targetPaddingElements;
 
 // phase 3: bottom border rows
 3834    while (y < firstRow + numberRows)
 
 3836        ocean_assert(y >= height - filterSize_2 && y < height);
 
 // every bottom border row reads from the block that starts filterSize_2 * 2 rows above the frame's end
 3837        source = sourceStart + (height - filterSize_2 * 2u) * sourceStrideElements;
 
 // index of this row within the bottom border
 3839        const unsigned int yy = y - (height - filterSize_2);
 
 3840        ocean_assert(yy < filterSize_2);
 
 3842        for (
unsigned int x = 0u; x < width; ++x)
 
 3844            for (
unsigned int n = 0u; n < channels; ++n)
 
 3848                for (
unsigned int s = 1u; s < filterSize; ++s)
 
 3853                target[n] = T(response);
 
 3860        target += targetPaddingElements;
 
 
 3883    ocean_assert(value < 2u * size);
 
 3894        ocean_assert(size * 2u - value - 1u < size);
 
 3895        return size * 2u - value - 1u;
 
 
This class holds re-usable memory for the filtering process.
Definition FrameFilterSeparable.h:40
 
ReusableMemory()=default
Default constructor.
 
std::vector< float > filterFactors_
Float-based filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:56
 
std::vector< float > normalizedVerticalFilter_
Normalized vertical filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:62
 
Frame intermediateFrame_
An intermediate frame which can be re-used during filtering.
Definition FrameFilterSeparable.h:53
 
std::vector< float > normalizedHorizontalFilter_
Normalized horizontal filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:59
 
This class implements a separable filter.
Definition FrameFilterSeparable.h:33
 
static void filterVerticalSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows)
Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames...
Definition FrameFilterSeparable.h:3303
 
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame el...
 
static bool filterUniversal(const T *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float *horizontalFilter, const unsigned int horizontalFilterSize, const float *verticalFilter, const unsigned int verticalFilterSize, Worker *worker=nullptr)
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
Definition FrameFilterSeparable.h:3592
 
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame el...
 
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame ...
 
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements...
Definition FrameFilterSeparable.h:2556
 
static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4< T >::Type &value, T *target)
Writes a SIMD with four 32 bit values to (not aligned) memory.
 
static void filterUniversalHorizontalSubset(const T *source, TIntermediate *target, const unsigned int width, const unsigned int channels, const float *horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a horizontal filter to a subset of an image with almost arbitrary data type.
Definition FrameFilterSeparable.h:3629
 
static void filterUniversalVerticalSubset(const TIntermediate *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a vertical filter to a subset of an image with almost arbitrary data type.
Definition FrameFilterSeparable.h:3733
 
static void filterHorizontalSubset(const TSource *source, TFilter *target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames ...
Definition FrameFilterSeparable.h:3171
 
static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int channels, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses for the inner core of a frame for one row.
Definition FrameFilterSeparable.h:804
 
static T sumFilterValues(const T *filterValues, const size_t size)
Determines the sum of all elements of a given 1D filter.
Definition FrameFilterSeparable.h:706
 
static bool isFilterSymmetric(const T *filterValues, const size_t size)
Returns whether a given 1D filter is symmetric.
Definition FrameFilterSeparable.h:689
 
static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
 
static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
 
static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
 
static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
 
static void fillLeftExtraBorder(const T *source, const unsigned int channels, const unsigned int pixels, T *extendedRowLeft)
Fills the left border area of an extended row with mirrored pixel information (from the left image re...
Definition FrameFilterSeparable.h:782
 
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame ...
 
static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses near the (vertical) border of a frame for one row.
 
static void fillRightExtraBorder(const T *sourceEnd, const unsigned int channels, const unsigned int pixels, T *extendedRowRight)
Fills the right border area of an extended row with mirrored pixel information (from the right image ...
Definition FrameFilterSeparable.h:793
 
static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
 
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements...
Definition FrameFilterSeparable.h:2494
 
static bool filter(const Frame &source, Frame &target, const std::vector< unsigned int > &horizontalFilter, const std::vector< unsigned int > &verticalFilter, Worker *worker=nullptr, ReusableMemory *reusableMemory=nullptr, const ProcessorInstructions processorInstructions=Processor::get().instructions())
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
 
static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4< T >::Type &value)
Sets a given SIMD value to zero.
 
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition Caller.h:2877
 
This class implements Ocean's image class.
Definition Frame.h:1808
 
const T * constdata(const unsigned int planeIndex=0u) const
Returns a pointer to the read-only pixel data of a specific plane.
Definition Frame.h:4251
 
T * data(const unsigned int planeIndex=0u)
Returns a pointer to the pixel data of a specific plane.
Definition Frame.h:4242
 
bool set(const FrameType &frameType, const bool forceOwner, const bool forceWritable=false, const Indices32 &planePaddingElements=Indices32(), const Timestamp &timestamp=Timestamp(false), bool *reallocated=nullptr)
Sets a new frame type for this frame.
 
unsigned int paddingElements(const unsigned int planeIndex=0u) const
Returns the optional number of padding elements at the end of each row for a specific plane.
Definition Frame.h:4125
 
Definition of a frame type composed of the frame dimension, pixel format and pixel origin.
Definition Frame.h:30
 
@ ORIGIN_UPPER_LEFT
The first pixel lies in the upper left corner, the last pixel in the lower right corner.
Definition Frame.h:1050
 
This class implements an object able to allocate memory.
Definition base/Memory.h:22
 
void * data()
Returns the pointer to the writable memory which is allocated by this object.
Definition base/Memory.h:303
 
This class provides basic numeric functionalities.
Definition Numeric.h:57
 
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
 
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
 
ProcessorInstructions
Definition of individual processor instruction types.
Definition base/Processor.h:22
 
static unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
Mirrors a given value at the right border if necessary.
Definition FrameFilterSeparable.h:3881
 
static unsigned int mirroredBorderLocationLeft(const int value)
Mirrors a given value at the left border if necessary.
Definition FrameFilterSeparable.h:3866
 
@ PI_NONE
Unknown processor instruction set.
Definition base/Processor.h:24
 
@ PI_GROUP_AVX_2_SSE_2
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition base/Processor.h:64
 
@ PI_GROUP_SSE_4_1
All SSE instructions between (including) SSE and SSE4.1.
Definition base/Processor.h:60
 
@ PI_SSE_2
SSE2 instructions.
Definition base/Processor.h:28
 
@ PI_NEON
NEON instructions.
Definition base/Processor.h:50
 
@ PI_GROUP_AVX_2_SSE_4_1
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition base/Processor.h:68
 
@ PI_GROUP_SSE_2
All SSE instructions between (including) SSE and SSE2.
Definition base/Processor.h:58
 
@ PI_GROUP_NEON
All NEON instructions (which is currently NEON only).
Definition base/Processor.h:66
 
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
 
__m128 Type
Definition FrameFilterSeparable.h:663
 
__m128i Type
Definition FrameFilterSeparable.h:654
 
Definition of a 128 bit SIMD data type holding four 32 bit values.
Definition FrameFilterSeparable.h:72
 
Default definition of a type with tBytes bytes.
Definition DataType.h:32
 
float Type
The 32 bit floating point data type for any data type T but 'double'.
Definition DataType.h:373