// NOTE(review): this listing looks like an extraction artifact -- each statement carries a fused
// source line number, and braces, `else` branches and `return` statements are not visible.
// The comments below describe only the statements that are actually shown.
//
// FrameFilterGaussian::filter
// Applies a Gaussian filter with independent horizontal and vertical filter sizes to a frame.
// A specialized single-channel 8-bit 3x3 routine is called when the NEON/SSE conditions below
// hold; otherwise the call is forwarded to FrameFilterSeparable::filter<T, TFilter>().
//
// source / target            input and output frame data, asserted to be non-null
// width / height             frame dimensions, asserted to be >= the corresponding filter size
// channels                   number of channels; the SIMD fast path requires exactly 1
// sourcePaddingElements      forwarded unchanged to the selected implementation
// targetPaddingElements      forwarded unchanged to the selected implementation
// horizontalFilterSize       must be odd and >= 1
// verticalFilterSize         must be odd and >= 1
// sigma                      the SIMD fast path is only taken for sigma <= 0
//                            (presumably "derive sigma from the filter size" -- confirm)
// worker                     forwarded to FrameFilterSeparable::filter()
// reusableMemory             optional (compared against nullptr before use)
// processorInstructions      forwarded to FrameFilterSeparable::filter()
//
// Returns the result of FrameFilterSeparable::filter() on the general path; the return values
// of the other paths are not visible in this listing.
418bool FrameFilterGaussian::filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int horizontalFilterSize,
const unsigned int verticalFilterSize,
const float sigma,
Worker* worker,
ReusableMemory* reusableMemory,
const ProcessorInstructions processorInstructions)
420 ocean_assert(source !=
nullptr && target !=
nullptr);
421 ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
// Both filter sizes must be odd and non-zero.
423 ocean_assert(horizontalFilterSize >= 1u && horizontalFilterSize % 2u == 1u);
424 ocean_assert(verticalFilterSize >= 1u && verticalFilterSize % 2u == 1u);
// Runtime version of the two asserts above. The action taken when the check fails is not
// visible here (presumably an early `return false` -- confirm).
425 if (horizontalFilterSize == 0u || horizontalFilterSize % 2u != 1u || verticalFilterSize == 0u || verticalFilterSize % 2u != 1u)
// NEON fast path: T == uint8_t, TFilter == unsigned int, one channel, 3x3 filter, sigma <= 0
// and at least 18 pixels per row.
430#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
434 if (std::is_same<T, uint8_t>::value && std::is_same<TFilter, unsigned int>::value)
436 if (width >= 18u && channels == 1u && horizontalFilterSize == 3u && verticalFilterSize == 3u && sigma <= 0.0f)
438 filter1Channel8Bit121NEON((
const uint8_t*)(source), (uint8_t*)(target), width, height, sourcePaddingElements, targetPaddingElements, reusableMemory);
// SSE fast path with the same conditions as the NEON path above.
445#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
449 if (std::is_same<T, uint8_t>::value && std::is_same<TFilter, unsigned int>::value)
451 if (width >= 18u && channels == 1u && horizontalFilterSize == 3u && verticalFilterSize == 3u && sigma <= 0.0f)
453 filter1Channel8Bit121SSE((
const uint8_t*)(source), (uint8_t*)(target), width, height, sourcePaddingElements, targetPaddingElements, reusableMemory);
// General path: obtain storage for the horizontal filter factors.
// The branch taken when reusableMemory is non-null is not visible; the visible statements
// use a local vector sized to the filter.
462 std::vector<TFilter> localHorizontalFilter;
463 TFilter* horizontalFilter =
nullptr;
465 if (reusableMemory !=
nullptr)
476 localHorizontalFilter.resize(horizontalFilterSize);
477 horizontalFilter = localHorizontalFilter.data();
// Equal filter sizes: the horizontal filter is passed for both directions.
// NOTE(review): `separableReusableMemory` is not declared in the visible lines.
489 if (horizontalFilterSize == verticalFilterSize)
491 return FrameFilterSeparable::filter<T, TFilter>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, horizontalFilter, horizontalFilterSize, worker, separableReusableMemory, processorInstructions);
// Different filter sizes: a separate vertical filter is set up the same way as the horizontal one.
495 std::vector<TFilter> localVerticalFilter;
496 TFilter* verticalFilter =
nullptr;
498 if (reusableMemory !=
nullptr)
509 localVerticalFilter.resize(verticalFilterSize);
510 verticalFilter = localVerticalFilter.data();
522 return FrameFilterSeparable::filter<T, TFilter>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, worker, separableReusableMemory, processorInstructions);
// NOTE(review): presumably the body of filter1Channel8Bit121SSE() -- the signature, braces,
// `else` branches and several assignments are not visible in this listing. Comments describe
// the visible statements only.
//
// SSE implementation of a 3x3 filter for one 8-bit channel, built from a horizontal 1-2-1 pass
// stored as 16-bit row responses and a vertical 1-2-1 pass over three such rows, followed by
// a rounded division by 16.
536 ocean_assert(source !=
nullptr);
537 ocean_assert(target !=
nullptr);
// 16 inner pixels plus the two border pixels are the minimum for one full vector iteration.
538 ocean_assert(width >= 18u);
539 ocean_assert(height >= 1u);
// 16-bit lane constants: factor 2 for the center tap, 8 as rounding offset before the shift by 4.
541 const __m128i constant_2_u_16x8 = _mm_set1_epi16(2);
542 const __m128i constant_8_u_16x8 = _mm_set1_epi16(8);
// Zero vector, interleaved with 8-bit data to widen it to 16 bit.
543 const __m128i zero_128 = _mm_setzero_si128();
545 const unsigned int sourceStrideElements = width * 1u + sourcePaddingElements;
546 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
// All pixels of a row except the first and the last one; those two are handled separately.
548 const unsigned int innerPixels = width - 2u;
550 Memory memoryResponseRows;
551 uint16_t* responseRows =
nullptr;
// Four chunks of `width` uint16_t elements: offsets width * 0/1/2 hold the three response rows,
// offset width * 3 is used below as a byte buffer for a copy of one source row.
553 const unsigned int reusableMemoryNecessaryElements = width * 4u;
// Either (re)use the caller-provided memory or allocate locally; the conditions and the
// assignment of `responseRows` for the reusable case are not visible here.
555 if (reusableMemory !=
nullptr)
559 reusableMemory->
responseRowsMemory_ = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
566 memoryResponseRows = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
567 responseRows = memoryResponseRows.
data<uint16_t>();
570 ocean_assert(responseRows !=
nullptr);
572 uint16_t* responseTopRow = responseRows + width * 0u;
// Horizontal response of the first source row.
// Left border: 3 * s[0] + s[1], i.e. 1-2-1 with the missing left neighbor replaced by s[0].
576 responseTopRow[0] = source[0] * 3u + source[1];
// Inner pixels, 16 per iteration. When the last block would run past the inner pixels, the
// asserts below describe stepping back to innerPixels - 16 so the final block overlaps the
// previous one (the assignment `n = newN` itself is not visible -- confirm).
578 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
580 if (n + 16u > innerPixels)
582 ocean_assert(n >= 16u && innerPixels > 16u);
583 const unsigned int newN = innerPixels - 16u;
585 const unsigned int offset = n - newN;
586 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
588 ocean_assert(n > newN);
593 ocean_assert(n + 16u == innerPixels);
594 ocean_assert(!(n + 16u < innerPixels));
// Three unaligned loads shifted by one pixel each: left neighbor, center, right neighbor.
597 const __m128i source_0_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 0u));
598 const __m128i source_1_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 1u));
599 const __m128i source_2_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 2u));
// Widen left and right neighbors to 16 bit and add them.
602 const __m128i source_0_low_u_16x8 = _mm_unpacklo_epi8(source_0_u_8x16, zero_128);
603 const __m128i source_0_high_u_16x8 = _mm_unpackhi_epi8(source_0_u_8x16, zero_128);
604 const __m128i source_2_low_u_16x8 = _mm_unpacklo_epi8(source_2_u_8x16, zero_128);
605 const __m128i source_2_high_u_16x8 = _mm_unpackhi_epi8(source_2_u_8x16, zero_128);
607 __m128i low_u_16x8 = _mm_add_epi16(source_0_low_u_16x8, source_2_low_u_16x8);
608 __m128i high_u_16x8 = _mm_add_epi16(source_0_high_u_16x8, source_2_high_u_16x8);
// Add the center pixel with weight 2.
611 const __m128i source_1_low_u_16x8 = _mm_unpacklo_epi8(source_1_u_8x16, zero_128);
612 const __m128i source_1_high_u_16x8 = _mm_unpackhi_epi8(source_1_u_8x16, zero_128);
614 low_u_16x8 = _mm_add_epi16(low_u_16x8, _mm_mullo_epi16(source_1_low_u_16x8, constant_2_u_16x8));
615 high_u_16x8 = _mm_add_epi16(high_u_16x8, _mm_mullo_epi16(source_1_high_u_16x8, constant_2_u_16x8));
// Responses are stored with an offset of 1 because index 0 holds the left border response.
617 _mm_storeu_si128((__m128i*)(responseTopRow + 1u + n + 0u), low_u_16x8);
618 _mm_storeu_si128((__m128i*)(responseTopRow + 1u + n + 8u), high_u_16x8);
// Right border: s[width - 2] + 3 * s[width - 1].
621 responseTopRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
// For the first output row the center row aliases the top row.
624 uint16_t* responseCenterRow = responseTopRow;
625 uint16_t* responseBottomRow = responseRows + width * 2u;
// The fourth chunk of the response memory is reinterpreted as a byte buffer for one source row.
626 uint8_t*
const sourceExtraCopy = (uint8_t*)(responseRows + width * 3u);
// From here on `source` addresses the row below the one being written.
628 source += sourceStrideElements;
630 for (
unsigned int y = 0u; y < height; ++y)
// At y == height - 2 `source` addresses the last image row; it is copied into the extra buffer
// (presumably so it can still be read while the last target row is written in place -- confirm).
632 if (y == height - 2u)
635 memcpy(sourceExtraCopy, source, width *
sizeof(uint8_t));
// Horizontal response of the bottom row, left border.
640 responseBottomRow[0u] = source[0] * 3u + source[1];
// Vertical 1-2-1 over the three responses with rounding: (top + 2 * center + bottom + 8) / 16.
// NOTE(review): ScopedValueT presumably assigns the value to *target when it leaves scope,
// so the first target pixel is written after the vector loop -- confirm against ScopedValueT.
648 const ScopedValueT<uint8_t> firstPixelValue(*target, uint8_t((responseTopRow[0] + responseCenterRow[0] * 2u + responseBottomRow[0] + 8u) / 16u));
// Inner pixels of the current row, same 16-pixel blocking and tail handling as above.
650 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
652 if (n + 16u > innerPixels)
654 ocean_assert(n >= 16u && innerPixels > 16u);
655 const unsigned int newN = innerPixels - 16u;
657 const unsigned int offset = n - newN;
658 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
660 ocean_assert(n > newN);
665 ocean_assert(n + 16u == innerPixels);
666 ocean_assert(!(n + 16u < innerPixels));
// Horizontal 1-2-1 response of the bottom source row (left + right + 2 * center, in 16 bit).
669 const __m128i sourceBottom_0_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 0u));
670 const __m128i sourceBottom_1_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 1u));
671 const __m128i sourceBottom_2_u_8x16 = _mm_loadu_si128((
const __m128i*)(source + n + 2u));
674 const __m128i bottomSource_0_low_u_16x8 = _mm_unpacklo_epi8(sourceBottom_0_u_8x16, zero_128);
675 const __m128i bottomSource_0_high_u_16x8 = _mm_unpackhi_epi8(sourceBottom_0_u_8x16, zero_128);
676 const __m128i bottomSource_2_low_u_16x8 = _mm_unpacklo_epi8(sourceBottom_2_u_8x16, zero_128);
677 const __m128i bottomSource_2_high_u_16x8 = _mm_unpackhi_epi8(sourceBottom_2_u_8x16, zero_128);
679 __m128i bottomLow_u_16x8 = _mm_add_epi16(bottomSource_0_low_u_16x8, bottomSource_2_low_u_16x8);
680 __m128i bottomHigh_u_16x8 = _mm_add_epi16(bottomSource_0_high_u_16x8, bottomSource_2_high_u_16x8);
683 const __m128i bottomSource_1_low_u_16x8 = _mm_unpacklo_epi8(sourceBottom_1_u_8x16, zero_128);
684 const __m128i bottomSource_1_high_u_16x8 = _mm_unpackhi_epi8(sourceBottom_1_u_8x16, zero_128);
686 bottomLow_u_16x8 = _mm_add_epi16(bottomLow_u_16x8, _mm_mullo_epi16(bottomSource_1_low_u_16x8, constant_2_u_16x8));
687 bottomHigh_u_16x8 = _mm_add_epi16(bottomHigh_u_16x8, _mm_mullo_epi16(bottomSource_1_high_u_16x8, constant_2_u_16x8));
// Load the previously stored responses of the top and center rows.
691 const __m128i topLow_u_16x8 = _mm_loadu_si128((
const __m128i*)(responseTopRow + 1u + n + 0u));
692 const __m128i topHigh_u_16x8 = _mm_loadu_si128((
const __m128i*)(responseTopRow + 1u + n + 8u));
695 const __m128i centerLow_u_16x8 = _mm_loadu_si128((
const __m128i*)(responseCenterRow + 1u + n + 0u));
696 const __m128i centerHigh_u_16x8 = _mm_loadu_si128((
const __m128i*)(responseCenterRow + 1u + n + 8u));
// Vertical pass: result = top + bottom + 2 * center.
699 __m128i resultLow_u_16x8 = _mm_add_epi16(topLow_u_16x8, bottomLow_u_16x8);
700 __m128i resultHigh_u_16x8 = _mm_add_epi16(topHigh_u_16x8, bottomHigh_u_16x8);
703 resultLow_u_16x8 = _mm_add_epi16(resultLow_u_16x8, _mm_mullo_epi16(centerLow_u_16x8, constant_2_u_16x8));
704 resultHigh_u_16x8 = _mm_add_epi16(resultHigh_u_16x8, _mm_mullo_epi16(centerHigh_u_16x8, constant_2_u_16x8));
// Keep the bottom response; it is reused as center/top response for the following rows.
708 _mm_storeu_si128((__m128i*)(responseBottomRow + 1u + n + 0u), bottomLow_u_16x8);
709 _mm_storeu_si128((__m128i*)(responseBottomRow + 1u + n + 8u), bottomHigh_u_16x8);
// Rounded division by 16: (result + 8) >> 4, then pack both halves back to 8 bit.
712 resultLow_u_16x8 = _mm_srli_epi16(_mm_add_epi16(resultLow_u_16x8, constant_8_u_16x8), 4);
713 resultHigh_u_16x8 = _mm_srli_epi16(_mm_add_epi16(resultHigh_u_16x8, constant_8_u_16x8), 4);
715 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
// Target offset of 1 because the first pixel of the row is handled separately.
717 _mm_storeu_si128((__m128i*)(target + 1u + n), result_u_8x16);
// Right border: horizontal response of the bottom row, then the last target pixel of the row.
720 responseBottomRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
727 target[width - 1u] = uint8_t((responseTopRow[width - 1u] + responseCenterRow[width - 1u] * 2u + responseBottomRow[width - 1u] + 8u) / 16u);
// Advance to the next row and rotate the response rows.
729 source += sourceStrideElements;
730 target += targetStrideElements;
732 std::swap(responseTopRow, responseCenterRow);
// Gives the center row its own chunk; the guarding condition is not visible (presumably the
// first row only, where center aliased top -- confirm).
738 responseCenterRow = responseRows + width * 1u;
// After the second-to-last row, `source` is redirected to the copied last row, so the final
// iteration reads the copy instead of memory behind the frame.
740 else if (y == height - 2u)
745 source = sourceExtraCopy;
748 std::swap(responseCenterRow, responseBottomRow);
// NOTE(review): presumably the body of filter1Channel8Bit121NEON() -- the signature, braces,
// `else` branches and several assignments are not visible in this listing, and the block may
// continue past the last visible line. Comments describe the visible statements only.
//
// NEON implementation of a 3x3 filter for one 8-bit channel, built from a horizontal 1-2-1 pass
// stored as 16-bit row responses and a vertical 1-2-1 pass over three such rows, followed by
// a rounded division by 16.
758 ocean_assert(source !=
nullptr);
759 ocean_assert(target !=
nullptr);
// 16 inner pixels plus the two border pixels are the minimum for one full vector iteration.
760 ocean_assert(width >= 18u);
761 ocean_assert(height >= 1u);
// Factor 2 for the center tap, once as 8-bit lanes (widening multiply) and once as 16-bit lanes.
764 const uint8x8_t constant_2_u_8x8 = vdup_n_u8(2u);
765 const uint16x8_t constant_2_u_16x8 = vdupq_n_u16(2u);
767 const unsigned int sourceStrideElements = width * 1u + sourcePaddingElements;
768 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
// All pixels of a row except the first and the last one; those two are handled separately.
770 const unsigned int innerPixels = width - 2u;
772 Memory memoryResponseRows;
773 uint16_t* responseRows =
nullptr;
// Four chunks of `width` uint16_t elements: offsets width * 0/1/2 hold the three response rows,
// offset width * 3 is used below as a byte buffer for a copy of one source row.
775 const unsigned int reusableMemoryNecessaryElements = width * 4u;
// Either (re)use the caller-provided memory or allocate locally; the conditions and the
// assignment of `responseRows` for the reusable case are not visible here.
777 if (reusableMemory !=
nullptr)
781 reusableMemory->
responseRowsMemory_ = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
788 memoryResponseRows = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
789 responseRows = memoryResponseRows.
data<uint16_t>();
792 ocean_assert(responseRows !=
nullptr);
794 uint16_t* responseTopRow = responseRows + width * 0u;
// Horizontal response of the first source row.
// Left border: 3 * s[0] + s[1], i.e. 1-2-1 with the missing left neighbor replaced by s[0].
798 responseTopRow[0] = source[0] * 3u + source[1];
// Inner pixels, 16 per iteration. When the last block would run past the inner pixels, the
// asserts below describe stepping back to innerPixels - 16 so the final block overlaps the
// previous one (the assignment `n = newN` itself is not visible -- confirm).
800 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
802 if (n + 16u > innerPixels)
804 ocean_assert(n >= 16u && innerPixels > 16u);
805 const unsigned int newN = innerPixels - 16u;
807 const unsigned int offset = n - newN;
808 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
810 ocean_assert(n > newN);
815 ocean_assert(n + 16u == innerPixels);
816 ocean_assert(!(n + 16u < innerPixels));
// Three loads shifted by one pixel each: left neighbor, center, right neighbor.
819 const uint8x16_t source_0_u_8x16 = vld1q_u8(source + n + 0u);
820 const uint8x16_t source_1_u_8x16 = vld1q_u8(source + n + 1u);
821 const uint8x16_t source_2_u_8x16 = vld1q_u8(source + n + 2u);
// Widening add of left and right neighbors (8 bit -> 16 bit).
824 uint16x8_t low_u_16x8 = vaddl_u8(vget_low_u8(source_0_u_8x16), vget_low_u8(source_2_u_8x16));
825 uint16x8_t high_u_16x8 = vaddl_u8(vget_high_u8(source_0_u_8x16), vget_high_u8(source_2_u_8x16));
// Widening multiply-accumulate: add the center pixel with weight 2.
828 low_u_16x8 = vmlal_u8(low_u_16x8, vget_low_u8(source_1_u_8x16), constant_2_u_8x8);
829 high_u_16x8 = vmlal_u8(high_u_16x8, vget_high_u8(source_1_u_8x16), constant_2_u_8x8);
// Responses are stored with an offset of 1 because index 0 holds the left border response.
831 vst1q_u16(responseTopRow + 1u + n + 0u, low_u_16x8);
832 vst1q_u16(responseTopRow + 1u + n + 8u, high_u_16x8);
// Right border: s[width - 2] + 3 * s[width - 1].
835 responseTopRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
// For the first output row the center row aliases the top row.
838 uint16_t* responseCenterRow = responseTopRow;
839 uint16_t* responseBottomRow = responseRows + width * 2u;
// The fourth chunk of the response memory is reinterpreted as a byte buffer for one source row.
840 uint8_t*
const sourceExtraCopy = (uint8_t*)(responseRows + width * 3u);
// From here on `source` addresses the row below the one being written.
842 source += sourceStrideElements;
844 for (
unsigned int y = 0u; y < height; ++y)
// At y == height - 2 `source` addresses the last image row; it is copied into the extra buffer
// (presumably so it can still be read while the last target row is written in place -- confirm).
846 if (y == height - 2u)
849 memcpy(sourceExtraCopy, source, width *
sizeof(uint8_t));
// Horizontal response of the bottom row, left border.
854 responseBottomRow[0u] = source[0] * 3u + source[1];
// Vertical 1-2-1 over the three responses with rounding: (top + 2 * center + bottom + 8) / 16.
// NOTE(review): ScopedValueT presumably assigns the value to *target when it leaves scope,
// so the first target pixel is written after the vector loop -- confirm against ScopedValueT.
862 const ScopedValueT<uint8_t> firstPixelValue(*target, uint8_t((responseTopRow[0] + responseCenterRow[0] * 2u + responseBottomRow[0] + 8u) / 16u));
// Inner pixels of the current row, same 16-pixel blocking and tail handling as above.
864 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
866 if (n + 16u > innerPixels)
868 ocean_assert(n >= 16u && innerPixels > 16u);
869 const unsigned int newN = innerPixels - 16u;
871 const unsigned int offset = n - newN;
872 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
874 ocean_assert(n > newN);
879 ocean_assert(n + 16u == innerPixels);
880 ocean_assert(!(n + 16u < innerPixels));
// Horizontal 1-2-1 response of the bottom source row (left + right + 2 * center, in 16 bit).
883 const uint8x16_t sourceBottom_0_u_8x16 = vld1q_u8(source + n + 0u);
884 const uint8x16_t sourceBottom_1_u_8x16 = vld1q_u8(source + n + 1u);
885 const uint8x16_t sourceBottom_2_u_8x16 = vld1q_u8(source + n + 2u);
888 uint16x8_t bottomLow_u_16x8 = vaddl_u8(vget_low_u8(sourceBottom_0_u_8x16), vget_low_u8(sourceBottom_2_u_8x16));
889 uint16x8_t bottomHigh_u_16x8 = vaddl_u8(vget_high_u8(sourceBottom_0_u_8x16), vget_high_u8(sourceBottom_2_u_8x16));
892 bottomLow_u_16x8 = vmlal_u8(bottomLow_u_16x8, vget_low_u8(sourceBottom_1_u_8x16), constant_2_u_8x8);
893 bottomHigh_u_16x8 = vmlal_u8(bottomHigh_u_16x8, vget_high_u8(sourceBottom_1_u_8x16), constant_2_u_8x8);
// Load the previously stored responses of the top and center rows.
897 const uint16x8_t topLow_u_16x8 = vld1q_u16(responseTopRow + 1u + n + 0u);
898 const uint16x8_t topHigh_u_16x8 = vld1q_u16(responseTopRow + 1u + n + 8u);
901 const uint16x8_t centerLow_u_16x8 = vld1q_u16(responseCenterRow + 1u + n + 0u);
902 const uint16x8_t centerHigh_u_16x8 = vld1q_u16(responseCenterRow + 1u + n + 8u);
// Vertical pass: result = top + bottom + 2 * center.
905 uint16x8_t resultLow_u_16x8 = vaddq_u16(topLow_u_16x8, bottomLow_u_16x8);
906 uint16x8_t resultHigh_u_16x8 = vaddq_u16(topHigh_u_16x8, bottomHigh_u_16x8);
909 resultLow_u_16x8 = vmlaq_u16(resultLow_u_16x8, centerLow_u_16x8, constant_2_u_16x8);
910 resultHigh_u_16x8 = vmlaq_u16(resultHigh_u_16x8, centerHigh_u_16x8, constant_2_u_16x8);
// Keep the bottom response; it is reused as center/top response for the following rows.
914 vst1q_u16(responseBottomRow + 1u + n + 0u, bottomLow_u_16x8);
915 vst1q_u16(responseBottomRow + 1u + n + 8u, bottomHigh_u_16x8);
// Rounding shift right by 4 with narrowing to 8 bit, i.e. (result + 8) / 16 per lane.
918 const uint8x16_t result_u_8x16 = vcombine_u8(vrshrn_n_u16(resultLow_u_16x8, 4), vrshrn_n_u16(resultHigh_u_16x8, 4));
// Target offset of 1 because the first pixel of the row is handled separately.
920 vst1q_u8(target + 1u + n, result_u_8x16);
// Right border: horizontal response of the bottom row, then the last target pixel of the row.
923 responseBottomRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
930 target[width - 1u] = uint8_t((responseTopRow[width - 1u] + responseCenterRow[width - 1u] * 2u + responseBottomRow[width - 1u] + 8u) / 16u);
// Advance to the next row and rotate the response rows.
932 source += sourceStrideElements;
933 target += targetStrideElements;
935 std::swap(responseTopRow, responseCenterRow);
// Gives the center row its own chunk; the guarding condition is not visible (presumably the
// first row only, where center aliased top -- confirm).
941 responseCenterRow = responseRows + width * 1u;
// After the second-to-last row, `source` is redirected to the copied last row, so the final
// iteration reads the copy instead of memory behind the frame.
943 else if (y == height - 2u)
948 source = sourceExtraCopy;
951 std::swap(responseCenterRow, responseBottomRow);