	static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");
	ocean_assert(source != nullptr && target != nullptr);
	ocean_assert(size >= 4 && size % 4 == 0);
	ocean_assert(parameters != nullptr);
	const int* coefficients = reinterpret_cast<const int*>(parameters);
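
	// the coefficients describe a three-segment piecewise linear approximation
	// y = m * x + c, with the slopes m0/m1/m2 stored as fixed-point values scaled
	// by 256 and the intercepts c1/c2 stored unscaled (segment 0 passes through
	// the origin, so it needs no intercept)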
	const int32_t m0_256 = coefficients[0];
	const int32_t m1_256 = coefficients[1];
	const int32_t m2_256 = coefficients[2];

	const int32_t c1 = coefficients[3];
	const int32_t c2 = coefficients[4];

	size_t blocks4 = size / size_t(4);

#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10

	const size_t blocks16 = size / size_t(16);

	const int16x4_t m0_256_s_16x4 = vdup_n_s16(int16_t(m0_256));
	const int16x4_t m1_256_s_16x4 = vdup_n_s16(int16_t(m1_256));
	const int16x4_t m2_256_s_16x4 = vdup_n_s16(int16_t(m2_256));

	const int16x8_t c1_s_16x8 = vdupq_n_s16(int16_t(c1));
	const int16x8_t c2_s_16x8 = vdupq_n_s16(int16_t(c2));
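
	// each NEON iteration converts 16 pixels at once: 20 packed source bytes in, 16 Y8 bytes out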
	for (size_t n = 0; n < blocks16; ++n)
	{
		convert16PixelY10_PackedToY8ApproximatedNEON<tStep01, tStep12>(source, m0_256_s_16x4, m1_256_s_16x4, m2_256_s_16x4, c1_s_16x8, c2_s_16x8, target);

		source += 20;
		target += 16;
	}

	blocks4 = (size - blocks16 * size_t(16)) / size_t(4);
	ocean_assert(blocks4 <= size / size_t(4));

#endif // OCEAN_HARDWARE_NEON_VERSION >= 10

	// intercepts scaled by 256 to match the fixed-point slope products below
	const int32_t c1_256 = c1 * 256;
	const int32_t c2_256 = c2 * 256;
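
	// scalar path for the remaining groups of 4 pixels; 4 pixels Y10_Packed occupy
	// 5 bytes [h0 h1 h2 h3 L]: byte i holds the upper 8 bits of pixel i, while the
	// two-bit pairs of L hold the lower 2 bits of pixels 0..3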
	for (size_t n = 0; n < blocks4; ++n)
	{
		const int32_t x[4] =
		{
			int32_t(uint16_t(source[0]) << uint16_t(2) | (uint16_t(source[4]) & uint16_t(0b00000011))),
			int32_t(uint16_t(source[1]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00001100)) >> uint16_t(2))),
			int32_t(uint16_t(source[2]) << uint16_t(2) | ((uint16_t(source[4]) & uint16_t(0b00110000)) >> uint16_t(4))),
			int32_t(uint16_t(source[3]) << uint16_t(2) | (uint16_t(source[4]) >> uint16_t(6)))
		};

		for (unsigned int i = 0u; i < 4u; ++i)
		{
			const int32_t& xx = x[i];

			int32_t result256;

			if (xx <= int32_t(tStep01))
			{
				result256 = m0_256 * xx;
			}
			else if (xx <= int32_t(tStep12))
			{
				result256 = m1_256 * xx + c1_256;
			}
			else
			{
				result256 = m2_256 * xx + c2_256;
			}

			ocean_assert(0 <= result256 && result256 <= 255 * 256);

			// normalize from the 1/256 fixed-point domain with rounding; the target is Y8, so uint8_t (not int8_t)
			target[i] = uint8_t((uint32_t(result256) + 128u) >> 8u);
		}

		source += 5;
		target += 4;
	}
}
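
// The following 16-pixel kernel keeps only the upper 8 bits of each 10-bit pixel
// (a plain truncating Y10_Packed -> Y8 conversion). Its signature is not part of
// this excerpt; the declaration below is a reconstruction, with a hypothetical
// name chosen to match the convert16Pixel... naming scheme used above
inline void convert16PixelY10_PackedToY8LinearNEON(const uint8_t* const source, uint8_t* const target)
{
	// 20 packed source bytes hold 16 pixels in four groups of [h0 h1 h2 h3 L];
	// for Y8 we keep the h* bytes 0-3, 5-8, 10-13, and 15-18

#ifdef __aarch64__
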
	// two overlapping loads cover the 20 packed bytes
	const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
	const uint8x8_t packedB_u_8x8 = vld1_u8(source + 12);

	// gather bytes 0-3, 5-8, 10-13 into lanes 4-15 (index 16 is out of range and yields zero)
	constexpr uint8x16_t shuffle_u_8x16 = NEON::create_uint8x16(16u, 16u, 16u, 16u, 0u, 1u, 2u, 3u, 5u, 6u, 7u, 8u, 10u, 11u, 12u, 13u);
	const uint8x16_t intermediateA_u_8x16 = vqtbl1q_u8(packedA_u_8x16, shuffle_u_8x16);

	// rotate so that bytes 15-18 sit in the first four lanes
	const uint8x8_t intermediateB_u_8x8 = vext_u8(packedB_u_8x8, packedB_u_8x8, 3);

	const uint8x16_t target_u_8x16 = vextq_u8(intermediateA_u_8x16, vcombine_u8(intermediateB_u_8x8, intermediateB_u_8x8), 4);

#else

	constexpr uint8x16_t mask_u_8x16 = NEON::create_uint8x16(0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0xFFu, 0xFFu, 0xFFu);

	const uint8x16_t packedA_u_8x16 = vld1q_u8(source);
	const uint8x8_t packedB_u_8x8 = vld1_u8(source + 11);

	const uint8x8_t packedAA_u_8x8 = vget_low_u8(packedA_u_8x16);
	const uint8x8_t packedAB_u_8x8 = vget_high_u8(packedA_u_8x16);

	// the shuffle constants are not part of this excerpt; the values below are
	// reconstructed so that bytes 0-3, 5-8, 10-13, 15 land in lanes 0-12 after the
	// vextq (assuming a NEON::create_uint8x8 helper analogous to create_uint8x16)
	constexpr uint8x8_t shuffleA_u_8x8 = NEON::create_uint8x8(255u, 0u, 1u, 2u, 3u, 5u, 6u, 7u);
	constexpr uint8x8_t shuffleB_u_8x8 = NEON::create_uint8x8(0u, 2u, 3u, 4u, 5u, 7u, 255u, 255u);

	const uint8x16_t intermediateA_u_8x16 = vextq_u8(vcombine_u8(vtbl1_u8(packedAA_u_8x8, shuffleA_u_8x8), vtbl1_u8(packedAB_u_8x8, shuffleB_u_8x8)), mask_u_8x16, 1);

	// bytes 16-18 (lanes 13-15) come from the second load
	const uint8x16_t intermediateB_u_8x16 = vcombine_u8(vget_low_u8(mask_u_8x16), vand_u8(packedB_u_8x8, vget_high_u8(mask_u_8x16)));

	const uint8x16_t target_u_8x16 = vorrq_u8(intermediateA_u_8x16, intermediateB_u_8x16);

#endif // __aarch64__

	vst1q_u8(target, target_u_8x16);
}
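
// The 16-pixel approximation kernel below also lacks its signature in this excerpt;
// the declaration is reconstructed from the call site in the row function above,
// with parameter names chosen to match their uses in the body
template <unsigned int tStep01, unsigned int tStep12>
inline void convert16PixelY10_PackedToY8ApproximatedNEON(const uint8_t* const source, const int16x4_t& m0_s_16x4, const int16x4_t& m1_s_16x4, const int16x4_t& m2_s_16x4, const int16x8_t& c1_s_16x8, const int16x8_t& c2_s_16x8, uint8_t* const target)
{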
	static_assert(0u < tStep01 && tStep01 < tStep12 && tStep12 < 1023u, "Invalid steps");

	// per-byte left shifts: each group's low-bits byte L is shifted so that the two
	// bits belonging to its pixel become the top bits of the byte (6, 4, 2, 0 for
	// pixels 0..3); the high bytes are left untouched
	constexpr int8x16_t leftShifts_s_8x16 = NEON::create_int8x16(6, 0, 4, 0, 2, 0, 0, 0, 6, 0, 4, 0, 2, 0, 0, 0);
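
	// unpacking strategy: interleave each pixel's low-bits byte with its high byte
	// into a 16-bit lane [L, h] (little endian: L | h << 8), shift L left per byte,
	// then shift the whole lane right by 6 to obtain the 10-bit value h << 2 | low2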
#ifdef __aarch64__

	// groups A+B live in bytes 0-9, groups C+D in bytes 10-19 (loaded as bytes 4-19)
	const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
	const uint8x16_t packedCD_u_8x16 = vld1q_u8(source + 4);

	// interleave [L, h0, L, h1, L, h2, L, h3] for groups A (L = byte 4) and B (L = byte 9)
	constexpr uint8x16_t shuffleAB_u_8x16 = NEON::create_uint8x16(4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u, 9u, 5u, 9u, 6u, 9u, 7u, 9u, 8u);
	const uint8x16_t intermediateAB_u_8x16 = vqtbl1q_u8(packedAB_u_8x16, shuffleAB_u_8x16);

	constexpr uint8x16_t shuffleCD_u_8x16 = NEON::create_uint8x16(10u, 6u, 10u, 7u, 10u, 8u, 10u, 9u, 15u, 11u, 15u, 12u, 15u, 13u, 15u, 14u);
	const uint8x16_t intermediateCD_u_8x16 = vqtbl1q_u8(packedCD_u_8x16, shuffleCD_u_8x16);

#else

	// shuffle constants reconstructed to produce the same [L, h] interleaving with
	// 8-byte vtbl lookups (assuming a NEON::create_uint8x8 helper as above)
	constexpr uint8x8_t shuffleAB_u_8x8 = NEON::create_uint8x8(4u, 0u, 4u, 1u, 4u, 2u, 4u, 3u);
	constexpr uint8x8_t shuffleC_u_8x8 = NEON::create_uint8x8(6u, 2u, 6u, 3u, 6u, 4u, 6u, 5u);
	constexpr uint8x8_t shuffleD_u_8x8 = NEON::create_uint8x8(7u, 3u, 7u, 4u, 7u, 5u, 7u, 6u);

	const uint8x16_t packedAB_u_8x16 = vld1q_u8(source);
	const uint8x8_t packedForD_u_8x8 = vld1_u8(source + 12);

	const uint8x8_t packedForA_u_8x8 = vget_low_u8(packedAB_u_8x16); // bytes 0-7
	const uint8x8_t packedForB_u_8x8 = vget_low_u8(vextq_u8(packedAB_u_8x16, packedAB_u_8x16, 5)); // bytes 5-12
	const uint8x8_t packedForC_u_8x8 = vget_high_u8(packedAB_u_8x16); // bytes 8-15

	const uint8x16_t intermediateAB_u_8x16 = vcombine_u8(vtbl1_u8(packedForA_u_8x8, shuffleAB_u_8x8), vtbl1_u8(packedForB_u_8x8, shuffleAB_u_8x8));
	const uint8x16_t intermediateCD_u_8x16 = vcombine_u8(vtbl1_u8(packedForC_u_8x8, shuffleC_u_8x8), vtbl1_u8(packedForD_u_8x8, shuffleD_u_8x8));

#endif // __aarch64__

	// move each pixel's low bits to the top of its L byte
	const uint16x8_t intermediateAB_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateAB_u_8x16, leftShifts_s_8x16));
	const uint16x8_t intermediateCD_u_16x8 = vreinterpretq_u16_u8(vshlq_u8(intermediateCD_u_8x16, leftShifts_s_8x16));

	// the right-shift vector is not part of this excerpt; shifting every 16-bit lane
	// right by 6 yields the 10-bit pixel values (vshlq with a negative count shifts right)
	constexpr int16x8_t rightShifts_s_16x8 = NEON::create_int16x8(-6, -6, -6, -6, -6, -6, -6, -6);

	const uint16x8_t unpackedAB_u_16x8 = vshlq_u16(intermediateAB_u_16x8, rightShifts_s_16x8);
	const uint16x8_t unpackedCD_u_16x8 = vshlq_u16(intermediateCD_u_16x8, rightShifts_s_16x8);

	// classify every pixel into one of the three linear segments
	constexpr int16x8_t step01_s_16x8 = NEON::create_int16x8(int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01), int16_t(tStep01));
	constexpr int16x8_t step12_s_16x8 = NEON::create_int16x8(int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12), int16_t(tStep12));

	const uint16x8_t isWithin0AB_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step01_s_16x8); // x <= tStep01
	const uint16x8_t isWithin0CD_u_16x8 = vcleq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step01_s_16x8);
	const uint8x16_t isWithin0_u_8x16 = vcombine_u8(vmovn_u16(isWithin0AB_u_16x8), vmovn_u16(isWithin0CD_u_16x8));

	const uint16x8_t isWithin2AB_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedAB_u_16x8), step12_s_16x8); // x > tStep12
	const uint16x8_t isWithin2CD_u_16x8 = vcgtq_s16(vreinterpretq_s16_u16(unpackedCD_u_16x8), step12_s_16x8);
	const uint8x16_t isWithin2_u_8x16 = vcombine_u8(vmovn_u16(isWithin2AB_u_16x8), vmovn_u16(isWithin2CD_u_16x8));

	const uint8x16_t isWithin1_u_8x16 = vmvnq_u8(vorrq_u8(isWithin0_u_8x16, isWithin2_u_8x16)); // tStep01 < x <= tStep12

	// split the 16-bit lanes into 4-lane halves, reinterpreted as signed
	const int16x4_t unpackedA_s_16x4 = vreinterpret_s16_u16(vget_low_u16(unpackedAB_u_16x8));
	const int16x4_t unpackedB_s_16x4 = vreinterpret_s16_u16(vget_high_u16(unpackedAB_u_16x8));
	const int16x4_t unpackedC_s_16x4 = vreinterpret_s16_u16(vget_low_u16(unpackedCD_u_16x8));
	const int16x4_t unpackedD_s_16x4 = vreinterpret_s16_u16(vget_high_u16(unpackedCD_u_16x8));

	// segment 0: y = (m0 * x) / 256, with rounding and unsigned saturation
	const uint16x8_t resultAB0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedA_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedB_s_16x4), 8));
	const uint16x8_t resultCD0_u_16x8 = vcombine_u16(vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedC_s_16x4), 8), vqrshrun_n_s32(vmull_s16(m0_s_16x4, unpackedD_s_16x4), 8));

	// segment 1: y = (m1 * x) / 256 + c1
	const int16x8_t resultAB1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedB_s_16x4), 8)));
	const int16x8_t resultCD1_s_16x8 = vaddq_s16(c1_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m1_s_16x4, unpackedD_s_16x4), 8)));

	// segment 2: y = (m2 * x) / 256 + c2
	const int16x8_t resultAB2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedA_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedB_s_16x4), 8)));
	const int16x8_t resultCD2_s_16x8 = vaddq_s16(c2_s_16x8, vcombine_s16(vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedC_s_16x4), 8), vrshrn_n_s32(vmull_s16(m2_s_16x4, unpackedD_s_16x4), 8)));

	const uint8x16_t result0_u_8x16 = vcombine_u8(vqmovn_u16(resultAB0_u_16x8), vqmovn_u16(resultCD0_u_16x8));
	const uint8x16_t result1_u_8x16 = vcombine_u8(vqmovun_s16(resultAB1_s_16x8), vqmovun_s16(resultCD1_s_16x8));
	const uint8x16_t result2_u_8x16 = vcombine_u8(vqmovun_s16(resultAB2_s_16x8), vqmovun_s16(resultCD2_s_16x8));

	// select each pixel's result from the segment it falls into and store 16 Y8 pixels
	const uint8x16_t result_u_8x16 = vorrq_u8(vorrq_u8(vandq_u8(result0_u_8x16, isWithin0_u_8x16), vandq_u8(result1_u_8x16, isWithin1_u_8x16)), vandq_u8(result2_u_8x16, isWithin2_u_8x16));

	vst1q_u8(target, result_u_8x16);
}