82    ocean_assert(tPatchSize >= 5u);
 
   84    ocean_assert(imageTopLeft != 
nullptr && buffer != 
nullptr);
 
   85    ocean_assert(imageStrideElements >= 1u * tPatchSize);
 
   87    ocean_assert(factorRight <= 128u && factorBottom <= 128u);
 
   89    const unsigned int factorLeft = 128u - factorRight;
 
   90    const unsigned int factorTop = 128u - factorBottom;
 
   92    constexpr unsigned int blocks15 = tPatchSize / 15u;
 
   93    constexpr unsigned int remainingAfterBlocks15 = tPatchSize % 15u;
 
   95    constexpr bool partialBlock15 = remainingAfterBlocks15 > 10u;
 
   96    constexpr unsigned int remainingAfterPartialBlock15 = partialBlock15 ? 0u : remainingAfterBlocks15;
 
   98    constexpr bool block7 = remainingAfterPartialBlock15 >= 7u;
 
   99    constexpr unsigned int remainingAfterBlock7 = remainingAfterPartialBlock15 % 7u;
 
  101    constexpr bool partialBlock7 = remainingAfterBlock7 >= 3u;
 
  102    constexpr unsigned int remainingAfterPartialBlock7 = partialBlock7 ? 0u : remainingAfterBlock7;
 
  104    constexpr unsigned int blocks1 = remainingAfterPartialBlock7;
 
  106    const unsigned int factorTopLeft = factorTop * factorLeft;
 
  107    const unsigned int factorTopRight = factorTop * factorRight;
 
  110    const __m128i factorsTop_u_16x8 = _mm_set1_epi32(
int(factorTopLeft) | 
int(factorTopRight) << 16);
 
  112    const unsigned int factorBottomLeft = factorBottom * factorLeft;
 
  113    const unsigned int factorBottomRight = factorBottom * factorRight;
 
  116    const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(
int(factorBottomLeft) | 
int(factorBottomRight) << 16);
 
  118    for (
unsigned int y = 0u; y < tPatchSize; ++y)
 
  123        for (
unsigned int x = 0u; x < blocks15; ++x)
 
  125            const __m128i top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
 
  126            const __m128i bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  130            const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
 
  131            const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  134            const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  135            const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  139            const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
 
  140            const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  143            const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  144            const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  148            const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
 
  149            const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
 
  151            const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
 
  152            const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
 
  155            const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
 
  156            const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
 
  158            const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
 
  159            const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
 
  162            const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  163            const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  165            const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
 
  168            const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  169            const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  171            const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
 
  176            const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
 
  179            const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks15) && (!block7 && !partialBlock7 && blocks1 == 0u);
 
  183                memcpy(buffer, &result_u_8x16, 15);
 
  187                _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
 
  194        if constexpr (partialBlock15)
 
  196            ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
 
  199            __m128i bottom_u_8x16;
 
  201            if (y < tPatchSize - 1u)
 
  203                top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
 
  204                bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  208                memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlocks15 + 1u);
 
  209                memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlocks15 + 1u);
 
  214            const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
 
  215            const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  218            const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  219            const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  223            const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
 
  224            const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  227            const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  228            const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  232            const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
 
  233            const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
 
  235            const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
 
  236            const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
 
  239            const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
 
  240            const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
 
  242            const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
 
  243            const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
 
  246            const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  247            const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  249            const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
 
  252            const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  253            const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  255            const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
 
  260            const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
 
  262            ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
 
  263            const bool isLastBlock = y + 1u == tPatchSize;
 
  267                memcpy(buffer, &result_u_8x16, remainingAfterBlocks15);
 
  271                _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
 
  274            imageTopLeft += remainingAfterBlocks15;
 
  275            buffer += remainingAfterBlocks15;
 
  278        if constexpr (block7)
 
  280            const __m128i top_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft));
 
  281            const __m128i bottom_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  285            const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
 
  286            const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  289            const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  290            const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  294            const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
 
  295            const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
 
  297            const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
 
  298            const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
 
  301            const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  302            const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  304            const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
 
  307            const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
 
  309            const bool isLastBlock = (y + 1u == tPatchSize) && (!partialBlock7 && blocks1 == 0u);
 
  313                memcpy(buffer, &result_u_8x16, 7);
 
  317                _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
 
  324        if constexpr (partialBlock7)
 
  326            ocean_assert(blocks1 == 0u);
 
  329            __m128i bottom_u_8x16;
 
  331            if (y < tPatchSize - 1u)
 
  333                top_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft));
 
  334                bottom_u_8x16 = _mm_loadl_epi64((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  338                memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlock7 + 1u);
 
  339                memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlock7 + 1u);
 
  344            const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
 
  345            const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
 
  348            const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
 
  349            const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
 
  353            const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
 
  354            const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
 
  356            const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
 
  357            const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
 
  360            const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
 
  361            const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
 
  363            const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
 
  366            const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
 
  368            ocean_assert(blocks1 == 0u);
 
  369            const bool isLastBlock = y + 1u == tPatchSize;
 
  373                memcpy(buffer, &result_u_8x16, remainingAfterBlock7);
 
  377                _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
 
  380            imageTopLeft += remainingAfterBlock7;
 
  381            buffer += remainingAfterBlock7;
 
  384        if constexpr (blocks1 != 0u)
 
  386            const uint8_t* 
const imageBottomLeft = imageTopLeft + imageStrideElements;
 
  388            for (
unsigned int n = 0u; n < blocks1; ++n)
 
  390                buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[1u + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[1u + n] * factorBottomRight + 8192u) / 16384u);
 
  393            imageTopLeft += blocks1;
 
  397        imageTopLeft += imageStrideElements - tPatchSize;
 
 
  405    ocean_assert(imageTopLeft != 
nullptr && buffer != 
nullptr);
 
  406    ocean_assert(imageStrideElements >= 1u * tPatchSize);
 
  408    ocean_assert(factorRight <= 128u && factorBottom <= 128u);
 
  410    const unsigned int factorLeft = 128u - factorRight;
 
  411    const unsigned int factorTop = 128u - factorBottom;
 
  413    constexpr unsigned int blocks4 = tPatchSize / 4u;
 
  414    constexpr unsigned int remainingAfterBlocks4 = tPatchSize % 4u;
 
  416    constexpr bool partialBlock4 = remainingAfterBlocks4 >= 2u;
 
  417    constexpr unsigned int remainingAfterPartialBlock4 = partialBlock4 ? 0u : remainingAfterBlocks4;
 
  419    constexpr unsigned int blocks1 = remainingAfterPartialBlock4;
 
  421    const unsigned int factorTopLeft = factorTop * factorLeft;
 
  422    const unsigned int factorTopRight = factorTop * factorRight;
 
  425    const __m128i factorsTop_u_16x8 = _mm_set1_epi32(
int(factorTopLeft) | 
int(factorTopRight) << 16);
 
  427    const unsigned int factorBottomLeft = factorBottom * factorLeft;
 
  428    const unsigned int factorBottomRight = factorBottom * factorRight;
 
  431    const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(
int(factorBottomLeft) | 
int(factorBottomRight) << 16);
 
  433    for (
unsigned int y = 0u; y < tPatchSize; ++y)
 
  438        for (
unsigned int x = 0u; x < blocks4; ++x)
 
  440            const bool canReadLastElements = y < tPatchSize - 1u || x < blocks4 - 1u;
 
  443            __m128i bottom_u_8x16;
 
  445            if (canReadLastElements)
 
  447                top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
 
  448                bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  452                top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft - 1)), 1);
 
  453                bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements - 1u)), 1);
 
  461            const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
 
  462            const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
 
  465            const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
 
  466            const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
 
  469            const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
 
  470            const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
 
  474            const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
 
  475            const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
 
  476            const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
 
  478            const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
 
  479            const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
 
  480            const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
 
  483            const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
 
  484            const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
 
  485            const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
 
  490            const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, 
SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
 
  491            const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, 
SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
 
  492            const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, 
SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
 
  495            const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
 
  497            const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks4) && (!partialBlock4 && blocks1 <= 1u);
 
  501                uint8_t tempBuffer[16];
 
  502                _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
 
  504                memcpy(buffer, tempBuffer, 12);
 
  508                _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
 
  517            const bool canReadLastElements = y < tPatchSize - 1u;
 
  520            __m128i bottom_u_8x16;
 
  522            if (canReadLastElements)
 
  524                top_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft));
 
  525                bottom_u_8x16 = _mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements));
 
  529                constexpr unsigned int overlappingElements = 16u - (remainingAfterBlocks4 * 3u + 3u);
 
  530                ocean_assert(overlappingElements < 16u);
 
  532                top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft - overlappingElements)), overlappingElements);
 
  533                bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((
const __m128i*)(imageTopLeft + imageStrideElements - overlappingElements)), overlappingElements);
 
  541            const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
 
  542            const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
 
  545            const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
 
  546            const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
 
  549            const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, 
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
 
  550            const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, 
SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
 
  554            const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
 
  555            const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
 
  556            const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
 
  558            const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
 
  559            const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
 
  560            const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
 
  563            const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
 
  564            const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
 
  565            const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
 
  570            const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, 
SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
 
  571            const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, 
SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
 
  572            const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, 
SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
 
  575            const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
 
  577            ocean_assert(blocks1 == 0u);
 
  579            const bool isLastBlock = y + 1u == tPatchSize;
 
  583                uint8_t tempBuffer[16];
 
  584                _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
 
  586                memcpy(buffer, tempBuffer, remainingAfterBlocks4 * 3u);
 
  590                _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
 
  593            imageTopLeft += remainingAfterBlocks4 * 3u;
 
  594            buffer += remainingAfterBlocks4 * 3u;
 
  597        if constexpr (blocks1 != 0u)
 
  599            const uint8_t* 
const imageBottomLeft = imageTopLeft + imageStrideElements;
 
  601            for (
unsigned int n = 0u; n < blocks1; ++n)
 
  603                for (
unsigned int c = 0u; c < 3u; ++c)
 
  605                    buffer[n * 3u + c] = uint8_t((imageTopLeft[n * 3u + c] * factorTopLeft + imageTopLeft[n * 3u + 3u + c] * factorTopRight + imageBottomLeft[n * 3u + c] * factorBottomLeft + imageBottomLeft[n * 3u + 3u + c] * factorBottomRight + 8192u) / 16384u);
 
  609            imageTopLeft += blocks1 * 3u;
 
  610            buffer += blocks1 * 3u;
 
  613        imageTopLeft += imageStrideElements - tPatchSize * 3u;