// Specialization for `unsigned int` filter factors.
// Writes `filterSize` integer weights to `filter`. For filter sizes up to 7 the weights and the
// denominator come from static lookup tables; the remaining visible statements derive a sigma from
// the filter size and delegate to determineFilterFactorsWithExplicitSigma<unsigned int>().
// NOTE(review): original line numbers are fused into this text and several lines (braces, most table
// rows, any `else`/`return`) are not visible here; the comments below describe only the visible statements.
296inline void FrameFilterGaussian::determineFilterFactors<unsigned int>(
const unsigned int filterSize,
unsigned int* filter,
unsigned int* denominator)
// Preconditions (debug asserts only): the filter size is odd and an output buffer is provided.
298 ocean_assert(filterSize % 2u == 1u);
299 ocean_assert(
filter !=
nullptr);
// Small filters are served from precomputed tables.
301 if (filterSize <= 7u)
// All predefined filters are stored back to back in one array of 16 entries.
303 static constexpr std::array<unsigned int, 16> predefinedFilters =
// Only this seven-entry row is visible (its entries sum to 33) - presumably the row for filterSize == 7.
308 1u, 4u, 7u, 9u, 7u, 4u, 1u
// One denominator per predefined filter, indexed by filterSize / 2 (values not visible here).
311 static constexpr std::array<unsigned int, 4> predefinedDenominators =
// Start index of each predefined filter inside predefinedFilters, indexed by filterSize / 2 (values not visible here).
319 static constexpr std::array<unsigned int, 4> offsets =
327 ocean_assert(filterSize / 2u < offsets.size());
328 const unsigned int filterOffset = offsets[filterSize / 2u];
// Copy the filterSize weights of the selected filter into the caller's buffer.
330 for (
unsigned int n = 0u; n < filterSize; ++n)
332 ocean_assert(filterOffset + n < predefinedFilters.size());
333 filter[n] = predefinedFilters[filterOffset + n];
// The denominator is optional; it is only written when the caller supplied a pointer.
337 if (denominator !=
nullptr)
339 ocean_assert(filterSize / 2u < predefinedDenominators.size());
340 *denominator = predefinedDenominators[filterSize / 2u];
// Derive sigma from the filter size and let the explicit-sigma helper fill `filter` and `denominator`.
// NOTE(review): presumably reached only for filterSize > 7 - the branch structure is not visible; confirm.
346 const float sigma = filterSize2sigma<float>(filterSize);
348 determineFilterFactorsWithExplicitSigma<unsigned int>(filterSize, sigma,
filter, denominator);
// Body of a filter-factor function templated on `T` (the signature is not visible in this excerpt;
// presumably the generic counterpart of the `unsigned int` specialization - confirm).
// For filter sizes up to 7 the weights are read from a static float table and converted to `T`;
// the remaining visible statements derive a sigma and delegate to determineFilterFactorsWithExplicitSigma<T>().
// Preconditions (debug asserts only): the filter size is odd and an output buffer is provided.
354 ocean_assert(filterSize % 2u == 1u);
355 ocean_assert(
filter !=
nullptr);
// Small filters are served from a precomputed table.
357 if (filterSize <= 7u)
// All predefined filters are stored back to back in one array of 16 float entries.
359 static constexpr std::array<float, 16> predefinedFilters =
// Visible rows: a five-entry and a seven-entry filter; each row sums to 1.0. Other rows are not visible here.
363 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f,
364 0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f,
// Start index of each predefined filter inside predefinedFilters, indexed by filterSize / 2 (values not visible here).
368 static constexpr std::array<unsigned int, 4> offsets =
376 ocean_assert(filterSize / 2u < offsets.size());
377 const unsigned int filterOffset = offsets[filterSize / 2u];
// Copy the filterSize weights of the selected filter, converting each float entry to T.
379 for (
unsigned int n = 0u; n < filterSize; ++n)
381 ocean_assert(filterOffset + n < predefinedFilters.size());
382 filter[n] = T(predefinedFilters[filterOffset + n]);
// The denominator is optional.
// NOTE(review): the statement executed when it is non-null is not visible in this excerpt.
385 if (denominator !=
nullptr)
// Derive sigma from the filter size and let the explicit-sigma helper fill `filter` and `denominator`.
// NOTE(review): presumably reached only for filterSize > 7 - the branch structure is not visible; confirm.
393 const float sigma = filterSize2sigma<float>(filterSize);
395 determineFilterFactorsWithExplicitSigma<T>(filterSize, sigma,
filter, denominator);
// Applies a Gaussian filter to `source` and writes the result to `target`.
// The visible statements validate the parameters, optionally dispatch to a NEON 3x3 fast path, prepare the
// 1D filter factors, and forward to FrameFilterSeparable::filter<T, TFilter>().
// Both filter sizes must be odd and >= 1; the image must be at least as large as the filter in each direction.
// NOTE(review): the template header (T, TFilter), all braces, the filter-factor computation and the
// declaration of `separableReusableMemory` are not visible in this excerpt.
399bool FrameFilterGaussian::filter(
const T* source, T* target,
const unsigned int width,
const unsigned int height,
const unsigned int channels,
const unsigned int sourcePaddingElements,
const unsigned int targetPaddingElements,
const unsigned int horizontalFilterSize,
const unsigned int verticalFilterSize,
const float sigma,
Worker* worker,
ReusableMemory* reusableMemory,
const ProcessorInstructions processorInstructions)
// Debug-only parameter checks.
401 ocean_assert(source !=
nullptr && target !=
nullptr);
402 ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
404 ocean_assert(horizontalFilterSize >= 1u && horizontalFilterSize % 2u == 1u);
405 ocean_assert(verticalFilterSize >= 1u && verticalFilterSize % 2u == 1u);
// Runtime version of the two filter-size asserts above.
// NOTE(review): the statement executed on failure is not visible - presumably an early `return false`; confirm.
406 if (horizontalFilterSize == 0u || horizontalFilterSize % 2u != 1u || verticalFilterSize == 0u || verticalFilterSize % 2u != 1u)
// NEON fast path, compiled only when NEON version 10 or newer is available.
411#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
// The fast path handles 8-bit data with integer filter factors only.
415 if (std::is_same<T, uint8_t>::value && std::is_same<TFilter, unsigned int>::value)
// ... and only single-channel images at least 18 pixels wide, a 3x3 filter, and a non-positive sigma.
417 if (width >= 18u && channels == 1u && horizontalFilterSize == 3u && verticalFilterSize == 3u && sigma <= 0.0f)
// NOTE(review): `worker` and `processorInstructions` are not passed to the NEON path; what follows this
// call (presumably a `return`) and the matching `#endif` are not visible here.
419 filter1Channel8Bit121NEON((
const uint8_t*)(source), (uint8_t*)(target), width, height, sourcePaddingElements, targetPaddingElements, reusableMemory);
// Storage for the horizontal filter factors: a local vector, used through the raw pointer `horizontalFilter`.
428 std::vector<TFilter> localHorizontalFilter;
429 TFilter* horizontalFilter =
nullptr;
// NOTE(review): the handling for a provided `reusableMemory` is not visible; the local vector below is
// presumably the fallback when none is provided - confirm.
431 if (reusableMemory !=
nullptr)
442 localHorizontalFilter.resize(horizontalFilterSize);
443 horizontalFilter = localHorizontalFilter.data();
// Identical filter sizes: the horizontal factors are passed for both the horizontal and the vertical filter.
455 if (horizontalFilterSize == verticalFilterSize)
457 return FrameFilterSeparable::filter<T, TFilter>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, horizontalFilter, horizontalFilterSize, worker, separableReusableMemory, processorInstructions);
// Different filter sizes: a separate set of vertical filter factors is prepared the same way.
461 std::vector<TFilter> localVerticalFilter;
462 TFilter* verticalFilter =
nullptr;
// NOTE(review): as above, the branch for a provided `reusableMemory` is not visible.
464 if (reusableMemory !=
nullptr)
475 localVerticalFilter.resize(verticalFilterSize);
476 verticalFilter = localVerticalFilter.data();
488 return FrameFilterSeparable::filter<T, TFilter>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, worker, separableReusableMemory, processorInstructions);
// Body of the NEON 3x3 path for single-channel 8-bit images (the signature is not visible in this excerpt;
// presumably filter1Channel8Bit121NEON - confirm).
// Each source row is first filtered horizontally with the weights 1-2-1 into a 16-bit "response" row;
// three response rows (top, center, bottom) are then combined vertically with 1-2-1 and divided by 16 with rounding.
// NOTE(review): braces and several statements are not visible; comments describe only the visible lines.
502 ocean_assert(source !=
nullptr);
503 ocean_assert(target !=
nullptr);
// The vector loops process 16 inner pixels at a time, and innerPixels == width - 2, hence width >= 18.
504 ocean_assert(width >= 18u);
// NOTE(review): in the visible lines `source` is advanced by one row before the row loop and then read, so a
// second source row is accessed; the caller visible in this file only takes this path for a 3x3 filter and
// asserts height >= verticalFilterSize. In addition, for height == 2u the `else if (y == height - 2u)` near
// the end cannot be taken because y == 0u is handled first. Confirm the intended minimum height (>= 3?).
505 ocean_assert(height >= 1u);
// Constant multiplier 2 for the center weight, in 8-bit and 16-bit lanes.
508 const uint8x8_t constant_2_u_8x8 = vdup_n_u8(2u);
509 const uint16x8_t constant_2_u_16x8 = vdupq_n_u16(2u);
// Row strides in elements: one channel per pixel plus the optional padding.
511 const unsigned int sourceStrideElements = width * 1u + sourcePaddingElements;
512 const unsigned int targetStrideElements = width * 1u + targetPaddingElements;
// All pixels except the first and the last one of a row.
514 const unsigned int innerPixels = width - 2u;
516 Memory memoryResponseRows;
517 uint16_t* responseRows =
nullptr;
// Intermediate buffer of 4 * width uint16_t elements: the offsets width * 0u/1u/2u hold the response rows,
// the region at width * 3u is used as a byte buffer for a copy of a source row (see sourceExtraCopy).
519 const unsigned int reusableMemoryNecessaryElements = width * 4u;
// Use the caller-provided reusable memory if available, otherwise a local allocation.
// NOTE(review): the condition guarding the (re-)allocation and the assignment of `responseRows` in the
// reusable-memory branch are not visible here.
521 if (reusableMemory !=
nullptr)
525 reusableMemory->
responseRowsMemory_ = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
532 memoryResponseRows = Memory::create<uint16_t>(reusableMemoryNecessaryElements);
533 responseRows = memoryResponseRows.
data<uint16_t>();
536 ocean_assert(responseRows !=
nullptr);
// --- Horizontal 1-2-1 response of the first source row ---
538 uint16_t* responseTopRow = responseRows + width * 0u;
// Left border pixel: the missing left neighbor is replaced by the pixel itself (1 + 2 = 3 times source[0]).
542 responseTopRow[0] = source[0] * 3u + source[1];
// Inner pixels, 16 per iteration.
544 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
// Last, partial block: the block start is moved back to innerPixels - 16 so that a full 16-pixel block
// still fits (some pixels are processed twice).
// NOTE(review): the assignment that applies `newN` to `n` is not visible; the asserts below imply it.
546 if (n + 16u > innerPixels)
548 ocean_assert(n >= 16u && innerPixels > 16u);
549 const unsigned int newN = innerPixels - 16u;
551 const unsigned int offset = n - newN;
552 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
554 ocean_assert(n > newN);
559 ocean_assert(n + 16u == innerPixels);
560 ocean_assert(!(n + 16u < innerPixels));
// Load left neighbor, center, and right neighbor for the 16 output pixels n + 1 .. n + 16.
563 const uint8x16_t source_0_u_8x16 = vld1q_u8(source + n + 0u);
564 const uint8x16_t source_1_u_8x16 = vld1q_u8(source + n + 1u);
565 const uint8x16_t source_2_u_8x16 = vld1q_u8(source + n + 2u);
// left + right, widened to 16 bit
568 uint16x8_t low_u_16x8 = vaddl_u8(vget_low_u8(source_0_u_8x16), vget_low_u8(source_2_u_8x16));
569 uint16x8_t high_u_16x8 = vaddl_u8(vget_high_u8(source_0_u_8x16), vget_high_u8(source_2_u_8x16));
// + 2 * center
572 low_u_16x8 = vmlal_u8(low_u_16x8, vget_low_u8(source_1_u_8x16), constant_2_u_8x8);
573 high_u_16x8 = vmlal_u8(high_u_16x8, vget_high_u8(source_1_u_8x16), constant_2_u_8x8);
// Store the responses; index 0 is the border pixel, so inner pixel n lives at index 1 + n.
575 vst1q_u16(responseTopRow + 1u + n + 0u, low_u_16x8);
576 vst1q_u16(responseTopRow + 1u + n + 8u, high_u_16x8);
// Right border pixel: the missing right neighbor is replaced by the pixel itself.
579 responseTopRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
// --- Row loop ---
// For the first output row the center response initially aliases the top response (no row above the image).
582 uint16_t* responseCenterRow = responseTopRow;
583 uint16_t* responseBottomRow = responseRows + width * 2u;
// Byte view onto the fourth quarter of the response buffer, used for a copy of a source row.
584 uint8_t*
const sourceExtraCopy = (uint8_t*)(responseRows + width * 3u);
// From here on `source` points to the row below the output row currently being computed.
586 source += sourceStrideElements;
588 for (
unsigned int y = 0u; y < height; ++y)
// At the second-to-last output row, `source` points to the last source row: keep a copy of it.
// Further below, `source` is redirected to this copy for the final iteration.
590 if (y == height - 2u)
593 memcpy(sourceExtraCopy, source, width *
sizeof(uint8_t));
// Horizontal response of the bottom row, left border pixel.
598 responseBottomRow[0u] = source[0] * 3u + source[1];
// Result for the first pixel of the output row: (top + 2 * center + bottom + 8) / 16.
// NOTE(review): ScopedValueT is declared elsewhere; presumably it assigns the value to *target when it goes
// out of scope - confirm.
606 const ScopedValueT<uint8_t> firstPixelValue(*target, uint8_t((responseTopRow[0] + responseCenterRow[0] * 2u + responseBottomRow[0] + 8u) / 16u));
// Inner pixels, 16 per iteration (same block handling as for the first row above).
608 for (
unsigned int n = 0u; n < innerPixels; n += 16u)
// NOTE(review): as above, the assignment that applies `newN` to `n` is not visible.
610 if (n + 16u > innerPixels)
612 ocean_assert(n >= 16u && innerPixels > 16u);
613 const unsigned int newN = innerPixels - 16u;
615 const unsigned int offset = n - newN;
616 ocean_assert_and_suppress_unused(offset < innerPixels, offset);
618 ocean_assert(n > newN);
623 ocean_assert(n + 16u == innerPixels);
624 ocean_assert(!(n + 16u < innerPixels));
// Horizontal 1-2-1 response of the bottom row: load left, center, right ...
627 const uint8x16_t sourceBottom_0_u_8x16 = vld1q_u8(source + n + 0u);
628 const uint8x16_t sourceBottom_1_u_8x16 = vld1q_u8(source + n + 1u);
629 const uint8x16_t sourceBottom_2_u_8x16 = vld1q_u8(source + n + 2u);
// ... left + right, widened to 16 bit ...
632 uint16x8_t bottomLow_u_16x8 = vaddl_u8(vget_low_u8(sourceBottom_0_u_8x16), vget_low_u8(sourceBottom_2_u_8x16));
633 uint16x8_t bottomHigh_u_16x8 = vaddl_u8(vget_high_u8(sourceBottom_0_u_8x16), vget_high_u8(sourceBottom_2_u_8x16));
// ... + 2 * center.
636 bottomLow_u_16x8 = vmlal_u8(bottomLow_u_16x8, vget_low_u8(sourceBottom_1_u_8x16), constant_2_u_8x8);
637 bottomHigh_u_16x8 = vmlal_u8(bottomHigh_u_16x8, vget_high_u8(sourceBottom_1_u_8x16), constant_2_u_8x8);
// Previously stored responses of the top row ...
641 const uint16x8_t topLow_u_16x8 = vld1q_u16(responseTopRow + 1u + n + 0u);
642 const uint16x8_t topHigh_u_16x8 = vld1q_u16(responseTopRow + 1u + n + 8u);
// ... and of the center row.
645 const uint16x8_t centerLow_u_16x8 = vld1q_u16(responseCenterRow + 1u + n + 0u);
646 const uint16x8_t centerHigh_u_16x8 = vld1q_u16(responseCenterRow + 1u + n + 8u);
// Vertical 1-2-1: top + bottom ...
649 uint16x8_t resultLow_u_16x8 = vaddq_u16(topLow_u_16x8, bottomLow_u_16x8);
650 uint16x8_t resultHigh_u_16x8 = vaddq_u16(topHigh_u_16x8, bottomHigh_u_16x8);
// ... + 2 * center.
653 resultLow_u_16x8 = vmlaq_u16(resultLow_u_16x8, centerLow_u_16x8, constant_2_u_16x8);
654 resultHigh_u_16x8 = vmlaq_u16(resultHigh_u_16x8, centerHigh_u_16x8, constant_2_u_16x8);
// Keep the bottom responses; they serve as center/top responses for the following rows.
658 vst1q_u16(responseBottomRow + 1u + n + 0u, bottomLow_u_16x8);
659 vst1q_u16(responseBottomRow + 1u + n + 8u, bottomHigh_u_16x8);
// Rounding right shift by 4 with narrowing to 8 bit - the vector counterpart of the scalar (x + 8) / 16 used for the border pixels.
662 const uint8x16_t result_u_8x16 = vcombine_u8(vrshrn_n_u16(resultLow_u_16x8, 4), vrshrn_n_u16(resultHigh_u_16x8, 4));
// Inner pixel n is written at target index 1 + n.
664 vst1q_u8(target + 1u + n, result_u_8x16);
// Horizontal response of the bottom row, right border pixel.
667 responseBottomRow[width - 1u] = source[width - 2u] + source[width - 1u] * 3u;
// Result for the last pixel of the output row.
674 target[width - 1u] = uint8_t((responseTopRow[width - 1u] + responseCenterRow[width - 1u] * 2u + responseBottomRow[width - 1u] + 8u) / 16u);
// Advance to the next source and target row.
676 source += sourceStrideElements;
677 target += targetStrideElements;
// Rotate the response rows for the next iteration: first exchange top and center ...
679 std::swap(responseTopRow, responseCenterRow);
// Give the center row its own storage (top and center aliased each other until now).
// NOTE(review): the `if` guarding this statement is not visible - presumably `y == 0u`, given the initial
// aliasing and the `else if` that follows; confirm.
685 responseCenterRow = responseRows + width * 1u;
// After the second-to-last row: read the next "bottom" row from the copy of the last source row made above.
687 else if (y == height - 2u)
692 source = sourceExtraCopy;
// ... then exchange center and bottom.
695 std::swap(responseCenterRow, responseBottomRow);