8#ifndef META_OCEAN_CV_SSE_H
9#define META_OCEAN_CV_SSE_H
17#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
44#if !defined(OCEAN_COMPILER_MSC)
/// Compile-time size check: the build fails with "Invalid data type!" unless M128i occupies exactly 16 bytes.
64 static_assert(
sizeof(
M128i) == 16,
"Invalid data type!");
/// Compile-time size check: the build fails with "Invalid data type!" unless M128 occupies exactly 16 bytes.
75 static_assert(
sizeof(
M128) == 16,
"Invalid data type!");
/// Compile-time size check: the build fails with "Invalid data type!" unless M128d occupies exactly 16 bytes.
86 static_assert(
sizeof(
M128d) == 16,
"Invalid data type!");
/**
 * Declaration of a helper that receives a read-only pointer and returns nothing.
 * NOTE(review): presumably issues a cache prefetch with the T0 hint (an `_mm_prefetch(..., _MM_HINT_T0)` statement using `data` exists elsewhere in this file) - confirm against the definition.
 * @param data Pointer handed to the helper; never written through by this interface
 */
96 static inline void prefetchT0(
const void*
const data);
/**
 * Declaration of a helper that receives a read-only pointer and returns nothing.
 * NOTE(review): presumably issues a cache prefetch with the T1 hint (an `_mm_prefetch(..., _MM_HINT_T1)` statement using `data` exists elsewhere in this file) - confirm against the definition.
 * @param data Pointer handed to the helper; never written through by this interface
 */
102 static inline void prefetchT1(
const void*
const data);
/**
 * Declaration of a helper that receives a read-only pointer and returns nothing.
 * NOTE(review): presumably issues a cache prefetch with the T2 hint (an `_mm_prefetch(..., _MM_HINT_T2)` statement using `data` exists elsewhere in this file) - confirm against the definition.
 * @param data Pointer handed to the helper; never written through by this interface
 */
108 static inline void prefetchT2(
const void*
const data);
/**
 * Declaration of a helper that receives a read-only pointer and returns nothing.
 * NOTE(review): presumably issues a cache prefetch with the NTA hint (an `_mm_prefetch(..., _MM_HINT_NTA)` statement using `data` exists elsewhere in this file) - confirm against the definition.
 * @param data Pointer handed to the helper; never written through by this interface
 */
114 static inline void prefetchNTA(
const void*
const data);
/**
 * Declaration of an accessor that yields one uint8_t from a __m128i, selected by a compile-time index.
 * NOTE(review): the valid index range is presumably [0, 15] (a `static_assert(tIndex <= 15u, ...)` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit register to read from; passed by const reference
 * @return The selected 8 bit value
 * @tparam tIndex Compile-time element index
 */
122 template <
unsigned int tIndex>
123 static inline uint8_t
value_u8(
const __m128i& value);
/**
 * Declaration of an accessor that yields one uint8_t from a __m128i, selected by a runtime index.
 * NOTE(review): the valid index range is presumably [0, 15] (an `ocean_assert(index <= 15u)` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit register to read from; passed by const reference
 * @param index Runtime element index
 * @return The selected 8 bit value
 */
131 static inline uint8_t
value_u8(
const __m128i& value,
const unsigned int index);
/**
 * Declaration of an accessor that yields one uint16_t from a __m128i, selected by a compile-time index.
 * NOTE(review): the valid index range is presumably [0, 7] (a `static_assert(tIndex <= 7u, ...)` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit register to read from; passed by const reference
 * @return The selected 16 bit value
 * @tparam tIndex Compile-time element index
 */
139 template <
unsigned int tIndex>
140 static inline uint16_t
value_u16(
const __m128i& value);
/**
 * Declaration of an accessor that yields one unsigned int from a __m128i, selected by a compile-time index.
 * NOTE(review): the valid index range is presumably [0, 3] (a `static_assert(tIndex <= 3u, ...)` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit register to read from; passed by const reference
 * @return The selected value as unsigned int
 * @tparam tIndex Compile-time element index
 */
148 template <
unsigned int tIndex>
149 static inline unsigned int value_u32(
const __m128i& value);
/**
 * Declaration of a reduction that maps a __m128i to a single unsigned int.
 * NOTE(review): presumably the sum of the four 32 bit elements (an expression adding `m128i_u32[0]` through `m128i_u32[3]` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit register to reduce; passed by const reference
 * @return The reduced value
 */
156 static OCEAN_FORCE_INLINE
unsigned int sum_u32_4(
const __m128i& value);
/**
 * Declaration of a reduction that maps a __m128 to a single float.
 * NOTE(review): presumably the sum of the four float elements (an expression adding `m128_f32[0]` through `m128_f32[3]` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit float register to reduce; passed by const reference
 * @return The reduced value
 */
177 static OCEAN_FORCE_INLINE
float sum_f32_4(
const __m128& value);
/**
 * Declaration of a reduction that maps a __m128d to a single double.
 * NOTE(review): presumably the sum of the two double elements (an expression adding `m128d_f64[0]` and `m128d_f64[1]` exists elsewhere in this file) - confirm against the definition.
 * @param value The 128 bit double register to reduce; passed by const reference
 * @return The reduced value
 */
184 static OCEAN_FORCE_INLINE
double sum_f64_2(
const __m128d& value);
222 template <
bool tBufferHas16Bytes>
243 template <
bool tBufferHas16Bytes>
/**
 * Declaration of a routine taking three read-only byte buffers and one writable byte buffer; returns nothing.
 * NOTE(review): the name suggests a 3x3 averaging over three rows of 30 single-channel 8 bit elements; element counts and the required buffer sizes are not visible here - confirm against the definition.
 * @param image0 First read-only input buffer
 * @param image1 Second read-only input buffer
 * @param image2 Third read-only input buffer
 * @param result Writable output buffer
 */
425 static inline void average30Elements1Channel8Bit3x3(
const uint8_t*
const image0,
const uint8_t*
const image1,
const uint8_t*
const image2, uint8_t*
const result);
534 template <
bool tBufferHas16Bytes>
545 template <
bool tBufferHas16Bytes>
/**
 * Declaration of a routine combining two value registers with four factor registers into one __m128i.
 * All six arguments are read-only references.
 * NOTE(review): the factor names suggest bilinear interpolation weights (fx_fy_, fxfy_ for values0; fx_fy, fxfy for values1); layout and value range of the factors are not visible here - confirm against the definition.
 * @param values0 First register of source values
 * @param values1 Second register of source values
 * @param fx_fy_ First factor register
 * @param fxfy_ Second factor register
 * @param fx_fy Third factor register
 * @param fxfy Fourth factor register
 * @return The resulting 128 bit register
 */
563 static inline __m128i
interpolation1Channel8Bit8Elements(
const __m128i& values0,
const __m128i& values1,
const __m128i& fx_fy_,
const __m128i& fxfy_,
const __m128i& fx_fy,
const __m128i& fxfy);
/**
 * Declaration of a routine combining two value registers with four factor registers into one __m128i.
 * All six arguments are read-only references.
 * NOTE(review): the name suggests bilinear interpolation of 2-channel (16 bit per pixel) data; layout and value range of the factors are not visible here - confirm against the definition.
 * @param values0 First register of source values
 * @param values1 Second register of source values
 * @param fx_fy_ First factor register
 * @param fxfy_ Second factor register
 * @param fx_fy Third factor register
 * @param fxfy Fourth factor register
 * @return The resulting 128 bit register
 */
580 static inline __m128i
interpolation2Channel16Bit8Elements(
const __m128i& values0,
const __m128i& values1,
const __m128i& fx_fy_,
const __m128i& fxfy_,
const __m128i& fx_fy,
const __m128i& fxfy);
/**
 * Declaration of a routine combining two value registers with four factor registers into one __m128i.
 * All six arguments are read-only references.
 * NOTE(review): the name suggests bilinear interpolation of 3-channel (24 bit per pixel) data; layout and value range of the factors are not visible here - confirm against the definition.
 * @param values0 First register of source values
 * @param values1 Second register of source values
 * @param fx_fy_ First factor register
 * @param fxfy_ Second factor register
 * @param fx_fy Third factor register
 * @param fxfy Fourth factor register
 * @return The resulting 128 bit register
 */
597 static inline __m128i
interpolation3Channel24Bit8Elements(
const __m128i& values0,
const __m128i& values1,
const __m128i& fx_fy_,
const __m128i& fxfy_,
const __m128i& fx_fy,
const __m128i& fxfy);
/**
 * Declaration of a routine combining two value registers with four factor registers into one __m128i.
 * All six arguments are read-only references.
 * NOTE(review): the name suggests bilinear interpolation of 4-channel (32 bit per pixel) data; layout and value range of the factors are not visible here - confirm against the definition.
 * @param values0 First register of source values
 * @param values1 Second register of source values
 * @param fx_fy_ First factor register
 * @param fxfy_ Second factor register
 * @param fx_fy Third factor register
 * @param fxfy Fourth factor register
 * @return The resulting 128 bit register
 */
636 static inline __m128i
interpolation4Channel32Bit8Elements(
const __m128i& values0,
const __m128i& values1,
const __m128i& fx_fy_,
const __m128i& fxfy_,
const __m128i& fx_fy,
const __m128i& fxfy);
/**
 * Declaration of a routine combining two value registers with four factor registers into one __m128i.
 * All six arguments are read-only references.
 * NOTE(review): the name suggests bilinear interpolation of 4-channel data handled as two groups of four elements; the exact grouping is not visible here - confirm against the definition.
 * @param values0 First register of source values
 * @param values1 Second register of source values
 * @param fx_fy_ First factor register
 * @param fxfy_ Second factor register
 * @param fx_fy Third factor register
 * @param fxfy Fourth factor register
 * @return The resulting 128 bit register
 */
653 static inline __m128i
interpolation4Channel32Bit2x4Elements(
const __m128i& values0,
const __m128i& values1,
const __m128i& fx_fy_,
const __m128i& fxfy_,
const __m128i& fx_fy,
const __m128i& fxfy);
/**
 * Declaration of the overload taking two read-only byte buffers, two sizes and four factors for the second buffer; returns an unsigned int.
 * NOTE(review): the name suggests a sum of squared differences for one 2-channel pixel, with pixel1 interpolated by the f1* factors; the meaning and unit of size0/size1 are not visible here - confirm against the definition.
 * @param pixel0 First read-only input buffer
 * @param pixel1 Second read-only input buffer
 * @param size0 Size value associated with the first buffer
 * @param size1 Size value associated with the second buffer
 * @param f1x_y_ First factor for the second buffer
 * @param f1xy_ Second factor for the second buffer
 * @param f1x_y Third factor for the second buffer
 * @param f1xy Fourth factor for the second buffer
 * @return The computed value
 */
667 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
/**
 * Declaration of the overload taking two read-only byte buffers, two sizes and four factors for each buffer; returns an unsigned int.
 * NOTE(review): the name suggests a sum of squared differences for one 2-channel pixel, with both pixels interpolated by their f0* and f1* factors; the meaning and unit of size0/size1 are not visible here - confirm against the definition.
 * @param pixel0 First read-only input buffer
 * @param pixel1 Second read-only input buffer
 * @param size0 Size value associated with the first buffer
 * @param size1 Size value associated with the second buffer
 * @param f0x_y_ First factor for the first buffer
 * @param f0xy_ Second factor for the first buffer
 * @param f0x_y Third factor for the first buffer
 * @param f0xy Fourth factor for the first buffer
 * @param f1x_y_ First factor for the second buffer
 * @param f1xy_ Second factor for the second buffer
 * @param f1x_y Third factor for the second buffer
 * @param f1xy Fourth factor for the second buffer
 * @return The computed value
 */
685 static inline unsigned int ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy);
/**
 * Declaration of a routine with three read-only input registers and three writable output registers; returns nothing.
 * NOTE(review): the name suggests splitting 48 interleaved 8 bit elements of three channels into one register per channel; element ordering is not visible here - confirm against the definition.
 * @param interleavedA First read-only input register
 * @param interleavedB Second read-only input register
 * @param interleavedC Third read-only input register
 * @param channel0 First output register, written by reference
 * @param channel1 Second output register, written by reference
 * @param channel2 Third output register, written by reference
 */
724 static OCEAN_FORCE_INLINE
void deInterleave3Channel8Bit48Elements(
const __m128i& interleavedA,
const __m128i& interleavedB,
const __m128i& interleavedC, __m128i& channel0, __m128i& channel1, __m128i& channel2);
/**
 * Declaration of the register-based overload: three read-only input registers and three writable output registers; returns nothing.
 * NOTE(review): the name suggests merging three per-channel registers into 48 interleaved 8 bit elements; element ordering is not visible here - confirm against the definition.
 * @param channel0 First read-only input register
 * @param channel1 Second read-only input register
 * @param channel2 Third read-only input register
 * @param interleavedA First output register, written by reference
 * @param interleavedB Second output register, written by reference
 * @param interleavedC Third output register, written by reference
 */
763 OCEAN_FORCE_INLINE
static void interleave3Channel8Bit48Elements(
const __m128i& channel0,
const __m128i& channel1,
const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC);
/**
 * Declaration of the pointer-based overload: three read-only byte buffers and one writable byte buffer; returns nothing.
 * NOTE(review): the name suggests merging three channel buffers into 48 interleaved 8 bit elements; the required buffer sizes are not visible here - confirm against the definition.
 * @param channel0 First read-only input buffer
 * @param channel1 Second read-only input buffer
 * @param channel2 Third read-only input buffer
 * @param interleaved Writable output buffer
 */
772 static OCEAN_FORCE_INLINE
void interleave3Channel8Bit48Elements(
const uint8_t*
const channel0,
const uint8_t*
const channel1,
const uint8_t*
const channel2, uint8_t*
const interleaved);
/**
 * Declaration of a routine with three read-only input registers and three writable output registers; returns nothing.
 * NOTE(review): the name suggests reversing the channel order within each 3-channel pixel across 48 interleaved 8 bit elements - confirm against the definition.
 * @param interleaved0 First read-only input register
 * @param interleaved1 Second read-only input register
 * @param interleaved2 Third read-only input register
 * @param reversedInterleaved0 First output register, written by reference
 * @param reversedInterleaved1 Second output register, written by reference
 * @param reversedInterleaved2 Third output register, written by reference
 */
807 static OCEAN_FORCE_INLINE
void reverseChannelOrder3Channel8Bit48Elements(
const __m128i& interleaved0,
const __m128i& interleaved1,
const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2);
/**
 * Declaration of a routine with three read-only input registers and three writable output registers; returns nothing.
 * NOTE(review): the name suggests reversing the order of 48 8 bit elements spread over three registers - confirm against the definition.
 * @param elements0 First read-only input register
 * @param elements1 Second read-only input register
 * @param elements2 Third read-only input register
 * @param reversedElements0 First output register, written by reference
 * @param reversedElements1 Second output register, written by reference
 * @param reversedElements2 Third output register, written by reference
 */
845 static inline void reverseElements8Bit48Elements(
const __m128i& elements0,
const __m128i& elements1,
const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2);
922 template <
bool tBufferHas16Bytes>
/**
 * Declaration of a loader that takes an untyped read-only pointer and returns a __m128i.
 * NOTE(review): alignment requirements and the number of bytes read from `buffer` are not visible here - confirm against the definition.
 * @param buffer Read-only source pointer
 * @return The loaded 128 bit register
 */
974 static inline __m128i
load128i(
const void*
const buffer);
984 template <
bool tBufferHas16Bytes>
995 template <
bool tBufferHas16Bytes>
1006 template <
bool tBufferHas16Bytes>
1017 template <
bool tBufferHas16Bytes>
1028 template <
bool tBufferHas16Bytes>
1042 template <
unsigned int tShiftBytes>
/**
 * Declaration of a store helper that takes a read-only __m128i and a writable byte buffer; returns nothing.
 * NOTE(review): alignment requirements and the number of bytes written to `buffer` are not visible here - confirm against the definition.
 * @param value The 128 bit register to store; passed by const reference
 * @param buffer Writable destination buffer
 */
1050 static inline void store128i(
const __m128i& value, uint8_t*
const buffer);
/**
 * Declaration of a helper that builds a __m128i from two 64 bit values.
 * Elsewhere in this file it is called with two hexadecimal constants to create the control masks passed to `_mm_shuffle_epi8`.
 * NOTE(review): the parameter names indicate which argument fills the upper and which the lower 64 bits - confirm against the definition.
 * @param high64 First 64 bit value
 * @param low64 Second 64 bit value
 * @return The resulting 128 bit register
 */
1058 static inline __m128i
set128i(
const unsigned long long high64,
const unsigned long long low64);
/**
 * Declaration of a routine with two read-only input registers and two writable output registers; returns nothing.
 * NOTE(review): the name suggests multiplying 16 signed 8 bit element pairs and producing eight 32 bit results across the two outputs; how products are combined and ordered is not visible here - confirm against the definition.
 * @param values0 First read-only input register
 * @param values1 Second read-only input register
 * @param products0 First output register, written by reference
 * @param products1 Second output register, written by reference
 */
1256 static OCEAN_FORCE_INLINE
void multiplyInt8x16ToInt32x8(
const __m128i& values0,
const __m128i& values1, __m128i& products0, __m128i& products1);
/**
 * Declaration of a routine taking a read-only byte buffer, a size and four factors; returns an unsigned int.
 * NOTE(review): the name suggests bilinear interpolation of a single 2-channel pixel; the meaning and unit of `size`, the factor range and the packing of the result are not visible here - confirm against the definition.
 * @param pixel Read-only input buffer
 * @param size Size value associated with the buffer
 * @param fx_y_ First factor
 * @param fxy_ Second factor
 * @param fx_y Third factor
 * @param fxy Fourth factor
 * @return The computed value
 */
1289 static inline unsigned int interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy);
1294 _mm_prefetch((
char*)data, _MM_HINT_T0);
1299 _mm_prefetch((
char*)data, _MM_HINT_T1);
1304 _mm_prefetch((
char*)data, _MM_HINT_T2);
1309 _mm_prefetch((
char*)data, _MM_HINT_NTA);
1312template <
unsigned int tIndex>
1315 static_assert(tIndex <= 15u,
"Invalid index!");
1317#ifdef OCEAN_COMPILER_MSC
1318 return value.m128i_u8[tIndex];
1320 return ((
const M128i*)(&value))->m128i_u8[tIndex];
1326 ocean_assert(index <= 15u);
1328#ifdef OCEAN_COMPILER_MSC
1329 return value.m128i_u8[index];
1331 return ((
const M128i*)(&value))->m128i_u8[index];
1335template <
unsigned int tIndex>
1338 static_assert(tIndex <= 7u,
"Invalid index!");
1340#ifdef OCEAN_COMPILER_MSC
1341 return value.m128i_u16[tIndex];
1343 return ((
const M128i*)(&value))->m128i_u16[tIndex];
1347template <
unsigned int tIndex>
1350 static_assert(tIndex <= 3u,
"Invalid index!");
1352#ifdef OCEAN_COMPILER_MSC
1353 return value.m128i_u32[tIndex];
1355 return ((
const M128i*)(&value))->m128i_u32[tIndex];
1361#ifdef OCEAN_COMPILER_MSC
1362 return value.m128i_u32[0] + value.m128i_u32[1] + value.m128i_u32[2] + value.m128i_u32[3];
1364 return ((
const M128i*)(&value))->m128i_u32[0] + ((
const M128i*)(&value))->m128i_u32[1] + ((
const M128i*)(&value))->m128i_u32[2] + ((
const M128i*)(&value))->m128i_u32[3];
1370#ifdef OCEAN_COMPILER_MSC
1371 return value.m128i_u32[0] + value.m128i_u32[1];
1373 return ((
const M128i*)(&value))->m128i_u32[0] + ((
const M128i*)(&value))->m128i_u32[1];
1379#ifdef OCEAN_COMPILER_MSC
1380 return value.m128i_u32[0] + value.m128i_u32[2];
1382 return ((
const M128i*)(&value))->m128i_u32[0] + ((
const M128i*)(&value))->m128i_u32[2];
1388#ifdef OCEAN_COMPILER_MSC
1389 return value.m128_f32[0] + value.m128_f32[1] + value.m128_f32[2] + value.m128_f32[3];
1391 return ((
const M128*)(&value))->m128_f32[0] + ((
const M128*)(&value))->m128_f32[1] + ((
const M128*)(&value))->m128_f32[2] + ((
const M128*)(&value))->m128_f32[3];
1397#ifdef OCEAN_COMPILER_MSC
1398 return value.m128d_f64[0] + value.m128d_f64[1];
1400 return ((
const M128d*)(&value))->m128d_f64[0] + ((
const M128d*)(&value))->m128d_f64[1];
1406 ocean_assert(image0 && image1);
1413 ocean_assert(image0 && image1);
1420 ocean_assert(image0 && image1);
1422 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1423 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1426 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1430 const __m128i subtractLow = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00AA008ull, 0xA006A004A002A000ull));
1431 const __m128i subtractHigh = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1434 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1435 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1442 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1447 ocean_assert(image0 && image1);
1449 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1450 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1453 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1457 const __m128i subtractLow = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1458 const __m128i subtractHigh = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00FA00Dull, 0xA00BA009A007A005ull));
1461 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1462 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1469 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1472template <
bool tBufferHas16Bytes>
1475 ocean_assert(image0 && image1);
1477 const __m128i row0 = load_u8_13_lower_random<tBufferHas16Bytes>(image0);
1478 const __m128i row1 = load_u8_13_lower_random<tBufferHas16Bytes>(image1);
1481 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1485 const __m128i subtractLow = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A00CA00AA008ull, 0xA006A004A002A000ull));
1486 const __m128i subtractHigh = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00BA009ull, 0xA007A005A003A001ull));
1489 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1490 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1497 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1502 ocean_assert(image0 && image1);
1504 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1505 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1508 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1512 const __m128i subtractLow = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A00FA00DA00Bull, 0xA009A007A005A003ull));
1513 const __m128i subtractHigh = _mm_shuffle_epi8(subtract,
set128i(0xA0A0A0A0A00EA00Cull, 0xA00AA008A006A004ull));
1516 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1517 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1524 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1527template <
bool tBufferHas16Bytes>
1530 ocean_assert(image0 && image1);
1532 const __m128i row0 = load_u8_15_lower_random<tBufferHas16Bytes>(image0);
1533 const __m128i row1 = load_u8_15_lower_random<tBufferHas16Bytes>(image1);
1536 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1543 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1544 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1551 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1554template <
bool tBufferHas16Bytes>
1557 ocean_assert(image0 && image1);
1559 return _mm_sad_epu8(load_u8_10_upper_zero<tBufferHas16Bytes>(image0), load_u8_10_upper_zero<tBufferHas16Bytes>(image1));
1562template <
bool tBufferHas16Bytes>
1565 ocean_assert(image0 && image1);
1567 return _mm_sad_epu8(load_u8_15_upper_zero<tBufferHas16Bytes>(image0), load_u8_15_upper_zero<tBufferHas16Bytes>(image1));
1572 ocean_assert(image0 && image1);
1574 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
1575 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
1582 ocean_assert(image0 && image1);
1589 ocean_assert(image0 && image1);
1590 ocean_assert((
unsigned long long)image0 % 16ll == 0ll);
1591 ocean_assert((
unsigned long long)image1 % 16ll == 0ll);
1593 const __m128i row0 = _mm_load_si128((__m128i*)image0);
1594 const __m128i row1 = _mm_load_si128((__m128i*)image1);
1602 const __m128i subtract = _mm_or_si128(_mm_subs_epu8(row0, row1), _mm_subs_epu8(row1, row0));
1609 const __m128i squareLow = _mm_mullo_epi16(subtractLow, subtractLow);
1610 const __m128i squareHigh = _mm_mullo_epi16(subtractHigh, subtractHigh);
1617 return _mm_add_epi32(sumSquareLow, sumSquareHigh);
1632#ifdef OCEAN_COMPILER_MSC
1634 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1635 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1636 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1637 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1638 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1639 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1640 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1642 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1643 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1644 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1645 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1646 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1647 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1648 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1650 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1651 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1652 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1653 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1654 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1655 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1656 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1658 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1659 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1660 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1661 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1662 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1663 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1664 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1666 ocean_assert(fx_fy_.m128i_u16[0] + fxfy_.m128i_u16[0] + fx_fy.m128i_u16[0] + fxfy.m128i_u16[0] == 128u * 128u);
1672 const M128i& debug_fx_fy_ = *(
const M128i*)(&fx_fy_);
1673 const M128i& debug_fx_fy = *(
const M128i*)(&fx_fy);
1674 const M128i& debug_fxfy_ = *(
const M128i*)(&fxfy_);
1675 const M128i& debug_fxfy = *(
const M128i*)(&fxfy);
1715 __m128i shuffle =
set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1718 __m128i row = _mm_shuffle_epi8(values0, shuffle);
1720 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1721 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1723 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA);
1724 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1727 row = _mm_shuffle_epi8(values1, shuffle);
1729 multiLow = _mm_mullo_epi16(row, fx_fy);
1730 multiHigh = _mm_mulhi_epu16(row, fx_fy);
1732 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1733 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1737 shuffle =
set128i(0xA008A007A006A005ull, 0xA004A003A002A001ull);
1740 row = _mm_shuffle_epi8(values0, shuffle);
1742 multiLow = _mm_mullo_epi16(row, fxfy_);
1743 multiHigh = _mm_mulhi_epu16(row, fxfy_);
1745 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1746 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1750 row = _mm_shuffle_epi8(values1, shuffle);
1752 multiLow = _mm_mullo_epi16(row, fxfy);
1753 multiHigh = _mm_mulhi_epu16(row, fxfy);
1755 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1756 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1760 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1761 resultEven = _mm_srli_epi32(resultEven, 14);
1763 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1764 resultOdd = _mm_srli_epi32(resultOdd, 14);
1782#ifdef OCEAN_COMPILER_MSC
1784 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1785 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1786 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1787 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1788 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1789 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1790 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1792 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1793 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1794 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1795 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1796 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1797 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1798 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1800 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1801 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1802 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1803 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1804 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1805 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1806 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1808 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1809 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1810 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1811 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1812 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1813 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1814 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1820 const M128i& debug_fx_fy_ = *(
const M128i*)(&fx_fy_);
1821 const M128i& debug_fx_fy = *(
const M128i*)(&fx_fy);
1822 const M128i& debug_fxfy_ = *(
const M128i*)(&fxfy_);
1823 const M128i& debug_fxfy = *(
const M128i*)(&fxfy);
1861 __m128i shuffle =
set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
1864 __m128i row = _mm_shuffle_epi8(values0, shuffle);
1866 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
1867 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
1869 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA);
1870 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
1873 row = _mm_shuffle_epi8(values1, shuffle);
1875 multiLow = _mm_mullo_epi16(row, fx_fy);
1876 multiHigh = _mm_mulhi_epu16(row, fx_fy);
1878 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1879 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1883 shuffle =
set128i(0xA009A008A007A006ull, 0xA005A004A003A002ull);
1886 row = _mm_shuffle_epi8(values0, shuffle);
1888 multiLow = _mm_mullo_epi16(row, fxfy_);
1889 multiHigh = _mm_mulhi_epu16(row, fxfy_);
1891 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1892 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1896 row = _mm_shuffle_epi8(values1, shuffle);
1898 multiLow = _mm_mullo_epi16(row, fxfy);
1899 multiHigh = _mm_mulhi_epu16(row, fxfy);
1901 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
1902 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
1906 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
1907 resultEven = _mm_srli_epi32(resultEven, 14);
1909 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
1910 resultOdd = _mm_srli_epi32(resultOdd, 14);
1928#ifdef OCEAN_COMPILER_MSC
1930 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
1931 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
1932 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
1933 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
1934 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
1935 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
1936 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
1938 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
1939 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
1940 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
1941 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
1942 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
1943 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
1944 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
1946 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
1947 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
1948 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
1949 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
1950 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
1951 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
1952 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
1954 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
1955 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
1956 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
1957 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
1958 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
1959 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
1960 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
1966 const M128i& debug_fx_fy_ = *(
const M128i*)(&fx_fy_);
1967 const M128i& debug_fx_fy = *(
const M128i*)(&fx_fy);
1968 const M128i& debug_fxfy_ = *(
const M128i*)(&fxfy_);
1969 const M128i& debug_fxfy = *(
const M128i*)(&fxfy);
2007 __m128i shuffle =
set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
2010 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2012 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2013 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2015 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA);
2016 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2019 row = _mm_shuffle_epi8(values1, shuffle);
2021 multiLow = _mm_mullo_epi16(row, fx_fy);
2022 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2024 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2025 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2029 shuffle =
set128i(0xA00AA009A008A007ull, 0xA006A005A004A003ull);
2032 row = _mm_shuffle_epi8(values0, shuffle);
2034 multiLow = _mm_mullo_epi16(row, fxfy_);
2035 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2037 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2038 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2042 row = _mm_shuffle_epi8(values1, shuffle);
2044 multiLow = _mm_mullo_epi16(row, fxfy);
2045 multiHigh = _mm_mulhi_epu16(row, fxfy);
2047 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2048 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2052 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2053 resultEven = _mm_srli_epi32(resultEven, 14);
2055 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2056 resultOdd = _mm_srli_epi32(resultOdd, 14);
2064 __m128i row0_a = _mm_shuffle_epi8(values0,
set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2065 __m128i row1_a = _mm_shuffle_epi8(values1,
set128i(0xFF04FF03FF03FF02ull, 0xFF02FF01FF01FF00ull));
2067 __m128i row0_b = _mm_shuffle_epi8(values0,
set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2068 __m128i row1_b = _mm_shuffle_epi8(values1,
set128i(0xFF08FF07FF07FF06ull, 0xFF06FF05FF05FF04ull));
2070 __m128i row0_c = _mm_shuffle_epi8(values0,
set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2071 __m128i row1_c = _mm_shuffle_epi8(values1,
set128i(0xFF0cFF0bFF0bFF0aull, 0xFF0aFF09FF09FF08ull));
2073 __m128i row0_d = _mm_shuffle_epi8(values0,
set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2074 __m128i row1_d = _mm_shuffle_epi8(values1,
set128i(0xFFFFFFFFFF0fFF0eull, 0xFF0eFF0dFF0dFF0cull));
2076 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2077 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2078 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2079 row0_d = _mm_madd_epi16(row0_d, fx_fy_fxfy_);
2081 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2082 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2083 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2084 row1_d = _mm_madd_epi16(row1_d, fx_fyfxfy);
2086 const __m128i rounding = _mm_set1_epi32(8192);
2088 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2089 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2090 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2091 __m128i row_d = _mm_add_epi32(row0_d, row1_d);
2093 row_a = _mm_add_epi32(row_a, rounding);
2094 row_b = _mm_add_epi32(row_b, rounding);
2095 row_c = _mm_add_epi32(row_c, rounding);
2096 row_d = _mm_add_epi32(row_d, rounding);
2098 row_a = _mm_srli_epi32(row_a, 14);
2099 row_b = _mm_srli_epi32(row_b, 14);
2100 row_c = _mm_srli_epi32(row_c, 14);
2101 row_d = _mm_srli_epi32(row_d, 14);
2103 row_a = _mm_shuffle_epi8(row_a,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0c080400ull));
2104 row_b = _mm_shuffle_epi8(row_b,
set128i(0xFFFFFFFFFFFFFFFFull, 0x0c080400FFFFFFFFull));
2105 row_c = _mm_shuffle_epi8(row_c,
set128i(0xFFFFFFFF0c080400ull, 0xFFFFFFFFFFFFFFFFull));
2106 row_d = _mm_shuffle_epi8(row_d,
set128i(0xFF080400FFFFFFFFull, 0xFFFFFFFFFFFFFFFFull));
2108 row_a = _mm_or_si128(row_a, row_b);
2109 row_c = _mm_or_si128(row_c, row_d);
2111 return _mm_or_si128(row_a, row_c);
2116 __m128i row0_a = _mm_shuffle_epi8(values0,
set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2117 __m128i row1_a = _mm_shuffle_epi8(values1,
set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
2119 __m128i row0_b = _mm_shuffle_epi8(values0,
set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2120 __m128i row1_b = _mm_shuffle_epi8(values1,
set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
2122 __m128i row0_c = _mm_shuffle_epi8(values0,
set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2123 __m128i row1_c = _mm_shuffle_epi8(values1,
set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
2125 row0_a = _mm_madd_epi16(row0_a, fx_fy_fxfy_);
2126 row0_b = _mm_madd_epi16(row0_b, fx_fy_fxfy_);
2127 row0_c = _mm_madd_epi16(row0_c, fx_fy_fxfy_);
2129 row1_a = _mm_madd_epi16(row1_a, fx_fyfxfy);
2130 row1_b = _mm_madd_epi16(row1_b, fx_fyfxfy);
2131 row1_c = _mm_madd_epi16(row1_c, fx_fyfxfy);
2133 const __m128i rounding = _mm_set1_epi32(8192);
2135 __m128i row_a = _mm_add_epi32(row0_a, row1_a);
2136 __m128i row_b = _mm_add_epi32(row0_b, row1_b);
2137 __m128i row_c = _mm_add_epi32(row0_c, row1_c);
2139 row_a = _mm_add_epi32(row_a, rounding);
2140 row_b = _mm_add_epi32(row_b, rounding);
2141 row_c = _mm_add_epi32(row_c, rounding);
2143 row_a = _mm_srli_epi32(row_a, 14);
2144 row_b = _mm_srli_epi32(row_b, 14);
2145 row_c = _mm_srli_epi32(row_c, 14);
2147 row_a = _mm_shuffle_epi8(row_a,
set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
2148 row_b = _mm_shuffle_epi8(row_b,
set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
2149 row_c = _mm_shuffle_epi8(row_c,
set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
2151 return _mm_or_si128(row_a, _mm_or_si128(row_b, row_c));
2166#ifdef OCEAN_COMPILER_MSC
2168 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
2169 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
2170 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
2171 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
2172 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
2173 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
2174 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
2176 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
2177 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
2178 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
2179 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
2180 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
2181 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
2182 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
2184 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
2185 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
2186 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
2187 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
2188 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
2189 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
2190 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
2192 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
2193 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
2194 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
2195 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
2196 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
2197 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
2198 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
2204 const M128i& debug_fx_fy_ = *(
const M128i*)(&fx_fy_);
2205 const M128i& debug_fx_fy = *(
const M128i*)(&fx_fy);
2206 const M128i& debug_fxfy_ = *(
const M128i*)(&fxfy_);
2207 const M128i& debug_fxfy = *(
const M128i*)(&fxfy);
2245 __m128i shuffle =
set128i(0xA007A006A005A004ull, 0xA003A002A001A000ull);
2248 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2250 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2251 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2253 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA);
2254 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2257 row = _mm_shuffle_epi8(values1, shuffle);
2259 multiLow = _mm_mullo_epi16(row, fx_fy);
2260 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2262 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2263 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2267 shuffle =
set128i(0xA00BA00AA009A008ull, 0xA007A006A005A004ull);
2270 row = _mm_shuffle_epi8(values0, shuffle);
2272 multiLow = _mm_mullo_epi16(row, fxfy_);
2273 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2275 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2276 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2280 row = _mm_shuffle_epi8(values1, shuffle);
2282 multiLow = _mm_mullo_epi16(row, fxfy);
2283 multiHigh = _mm_mulhi_epu16(row, fxfy);
2285 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2286 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2290 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2291 resultEven = _mm_srli_epi32(resultEven, 14);
2293 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2294 resultOdd = _mm_srli_epi32(resultOdd, 14);
2313#ifdef OCEAN_COMPILER_MSC
2315 ocean_assert(fx_fy_.m128i_u16[0] == fx_fy_.m128i_u16[1]);
2316 ocean_assert(fx_fy_.m128i_u16[1] == fx_fy_.m128i_u16[2]);
2317 ocean_assert(fx_fy_.m128i_u16[2] == fx_fy_.m128i_u16[3]);
2318 ocean_assert(fx_fy_.m128i_u16[3] == fx_fy_.m128i_u16[4]);
2319 ocean_assert(fx_fy_.m128i_u16[4] == fx_fy_.m128i_u16[5]);
2320 ocean_assert(fx_fy_.m128i_u16[5] == fx_fy_.m128i_u16[6]);
2321 ocean_assert(fx_fy_.m128i_u16[6] == fx_fy_.m128i_u16[7]);
2323 ocean_assert(fxfy_.m128i_u16[0] == fxfy_.m128i_u16[1]);
2324 ocean_assert(fxfy_.m128i_u16[1] == fxfy_.m128i_u16[2]);
2325 ocean_assert(fxfy_.m128i_u16[2] == fxfy_.m128i_u16[3]);
2326 ocean_assert(fxfy_.m128i_u16[3] == fxfy_.m128i_u16[4]);
2327 ocean_assert(fxfy_.m128i_u16[4] == fxfy_.m128i_u16[5]);
2328 ocean_assert(fxfy_.m128i_u16[5] == fxfy_.m128i_u16[6]);
2329 ocean_assert(fxfy_.m128i_u16[6] == fxfy_.m128i_u16[7]);
2331 ocean_assert(fx_fy.m128i_u16[0] == fx_fy.m128i_u16[1]);
2332 ocean_assert(fx_fy.m128i_u16[1] == fx_fy.m128i_u16[2]);
2333 ocean_assert(fx_fy.m128i_u16[2] == fx_fy.m128i_u16[3]);
2334 ocean_assert(fx_fy.m128i_u16[3] == fx_fy.m128i_u16[4]);
2335 ocean_assert(fx_fy.m128i_u16[4] == fx_fy.m128i_u16[5]);
2336 ocean_assert(fx_fy.m128i_u16[5] == fx_fy.m128i_u16[6]);
2337 ocean_assert(fx_fy.m128i_u16[6] == fx_fy.m128i_u16[7]);
2339 ocean_assert(fxfy.m128i_u16[0] == fxfy.m128i_u16[1]);
2340 ocean_assert(fxfy.m128i_u16[1] == fxfy.m128i_u16[2]);
2341 ocean_assert(fxfy.m128i_u16[2] == fxfy.m128i_u16[3]);
2342 ocean_assert(fxfy.m128i_u16[3] == fxfy.m128i_u16[4]);
2343 ocean_assert(fxfy.m128i_u16[4] == fxfy.m128i_u16[5]);
2344 ocean_assert(fxfy.m128i_u16[5] == fxfy.m128i_u16[6]);
2345 ocean_assert(fxfy.m128i_u16[6] == fxfy.m128i_u16[7]);
2351 const M128i& debug_fx_fy_ = *(
const M128i*)(&fx_fy_);
2352 const M128i& debug_fx_fy = *(
const M128i*)(&fx_fy);
2353 const M128i& debug_fxfy_ = *(
const M128i*)(&fxfy_);
2354 const M128i& debug_fxfy = *(
const M128i*)(&fxfy);
2392 __m128i shuffle =
set128i(0xA00BA00AA009A008ull, 0xA003A002A001A000ull);
2395 __m128i row = _mm_shuffle_epi8(values0, shuffle);
2397 __m128i multiLow = _mm_mullo_epi16(row, fx_fy_);
2398 __m128i multiHigh = _mm_mulhi_epu16(row, fx_fy_);
2400 __m128i resultEven = _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA);
2401 __m128i resultOdd = _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA);
2404 row = _mm_shuffle_epi8(values1, shuffle);
2406 multiLow = _mm_mullo_epi16(row, fx_fy);
2407 multiHigh = _mm_mulhi_epu16(row, fx_fy);
2409 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2410 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2414 shuffle =
set128i(0xA00FA00EA00DA00Cull, 0xA007A006A005A004ull);
2417 row = _mm_shuffle_epi8(values0, shuffle);
2419 multiLow = _mm_mullo_epi16(row, fxfy_);
2420 multiHigh = _mm_mulhi_epu16(row, fxfy_);
2422 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2423 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2427 row = _mm_shuffle_epi8(values1, shuffle);
2429 multiLow = _mm_mullo_epi16(row, fxfy);
2430 multiHigh = _mm_mulhi_epu16(row, fxfy);
2432 resultEven = _mm_add_epi32(resultEven, _mm_blend_epi16(multiLow, _mm_slli_si128(multiHigh, 2), 0xAA));
2433 resultOdd = _mm_add_epi32(resultOdd, _mm_blend_epi16(_mm_srli_si128(multiLow, 2), multiHigh, 0xAA));
2437 resultEven = _mm_add_epi32(resultEven, _mm_set1_epi32(8192));
2438 resultEven = _mm_srli_epi32(resultEven, 14);
2440 resultOdd = _mm_add_epi32(resultOdd, _mm_set1_epi32(8192));
2441 resultOdd = _mm_srli_epi32(resultOdd, 14);
2449 ocean_assert(image0 && image1);
2452 const __m128 row0 = _mm_loadu_ps(image0);
2453 const __m128 row1 = _mm_loadu_ps(image1);
2456 const __m128 sumFirst = _mm_add_ps(row0, row1);
2459 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2460 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2463 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2466 const __m128 sumAdjacent = _mm_hadd_ps(sumFirst, sumSecond);
2475 const __m128 division = _mm_mul_ps(sumAdjacent, _mm_set_ps1(0.25f));
2478 _mm_storeu_ps(result, division);
2483 ocean_assert(image0 && image1);
2486 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2487 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2494 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(
int(0x00020002))));
2497 const __m128i division16 = _mm_srli_epi16(sum, 2);
2502 memcpy(result, &division8,
sizeof(uint8_t) * 4);
2507 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
2508 ocean_assert(threshold >= 1u);
2511 const __m128i row0_u_8x8 = _mm_loadl_epi64((__m128i*)image0);
2512 const __m128i row1_u_8x8 = _mm_loadl_epi64((__m128i*)image1);
2514 const __m128i row0_u_16x8 = _mm_cvtepu8_epi16(row0_u_8x8);
2515 const __m128i row1_u_16x8 = _mm_cvtepu8_epi16(row1_u_8x8);
2517 const __m128i verticalSum_u_16x8 = _mm_adds_epu16(row0_u_16x8, row1_u_16x8);
2518 const __m128i sum_u_16x8 = _mm_hadd_epi16(verticalSum_u_16x8, verticalSum_u_16x8);
2520 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(
short(threshold - 1u)));
2524 memcpy(result, &mask_u_8x8,
sizeof(uint8_t) * 4);
2529 ocean_assert(image0 && image1);
2532 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2533 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2540 const __m128i sum = _mm_add_epi16(sumLow, _mm_add_epi16(sumHigh, _mm_set1_epi32(
int(0x00020002))));
2543 const __m128i division16 = _mm_srli_epi16(sum, 2);
2549 _mm_storel_epi64((__m128i*)result, division8);
2564 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
2565 ocean_assert(threshold >= 1u);
2568 const __m128i row0_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2569 const __m128i row1_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2571 const __m128i horizontalSum0_u_16x8 = _mm_maddubs_epi16(row0_u_8x16, _mm_set1_epi8(1));
2572 const __m128i horizontalSum1_u_16x8 = _mm_maddubs_epi16(row1_u_8x16, _mm_set1_epi8(1));
2574 const __m128i sum_u_16x8 = _mm_add_epi16(horizontalSum0_u_16x8, horizontalSum1_u_16x8);
2576 const __m128i mask_u_16x8 = _mm_cmpgt_epi16(sum_u_16x8, _mm_set1_epi16(
short(threshold - 1u)));
2581 _mm_storel_epi64((__m128i*)result, mask_u_8x8);
2586 ocean_assert(image0 && image1);
2589 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2590 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2597 const __m128i firstSum = _mm_add_epi16(firstSumLow, _mm_add_epi16(firstSumHigh, _mm_set1_epi32(
int(0x00020002))));
2600 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2606 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2607 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2614 const __m128i secondSum = _mm_add_epi16(secondSumLow, _mm_add_epi16(secondSumHigh, _mm_set1_epi32(
int(0x00020002))));
2617 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2624 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2627 _mm_storeu_si128((__m128i*)result, division8);
2652 ocean_assert(image0 !=
nullptr && image1 !=
nullptr);
2653 ocean_assert(threshold >= 1u);
2656 const __m128i row0A_u_8x16 = _mm_lddqu_si128((__m128i*)image0);
2657 const __m128i row1A_u_8x16 = _mm_lddqu_si128((__m128i*)image1);
2659 const __m128i horizontalSum0A_u_16x8 = _mm_maddubs_epi16(row0A_u_8x16, _mm_set1_epi8(1));
2660 const __m128i horizontalSum1A_u_16x8 = _mm_maddubs_epi16(row1A_u_8x16, _mm_set1_epi8(1));
2662 const __m128i sumA_u_16x8 = _mm_add_epi16(horizontalSum0A_u_16x8, horizontalSum1A_u_16x8);
2664 const __m128i maskA_u_16x8 = _mm_cmpgt_epi16(sumA_u_16x8, _mm_set1_epi16(
short(threshold - 1)));
2666 const __m128i row0B_u_8x16 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2667 const __m128i row1B_u_8x16 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2669 const __m128i horizontalSum0B_u_16x8 = _mm_maddubs_epi16(row0B_u_8x16, _mm_set1_epi8(1));
2670 const __m128i horizontalSum1B_u_16x8 = _mm_maddubs_epi16(row1B_u_8x16, _mm_set1_epi8(1));
2672 const __m128i sumB_u_16x8 = _mm_add_epi16(horizontalSum0B_u_16x8, horizontalSum1B_u_16x8);
2674 const __m128i maskB_u_16x8 = _mm_cmpgt_epi16(sumB_u_16x8, _mm_set1_epi16(
short(threshold - 1u)));
2679 _mm_storeu_si128((__m128i*)result, mask_u_8x16);
2684 ocean_assert(image0 && image1);
2687 const __m128i row0 = _mm_loadl_epi64((__m128i*)image0);
2688 const __m128i row1 = _mm_loadl_epi64((__m128i*)image1);
2695 const __m128i sumLow = _mm_add_epi16(shuffledRow0, shuffledRow1);
2696 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumLow), _mm_set1_epi32(
int(0x00020002)));
2699 const __m128i division16 = _mm_srli_epi16(sum, 2);
2704 memcpy(result, &division8,
sizeof(uint8_t) * 4);
2709 ocean_assert(image0 && image1);
2712 const __m128 row0 = _mm_loadu_ps(image0);
2713 const __m128 row1 = _mm_loadu_ps(image1);
2716 const __m128 sumFirst = _mm_add_ps(row0, row1);
2719 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2720 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2723 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2728 const __m128 sumComponents = _mm_add_ps(_mm_shuffle_ps(sumFirst, sumSecond, 68u), _mm_shuffle_ps(sumFirst, sumSecond, 238u));
2731 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2734 _mm_storeu_ps(result, division);
2739 ocean_assert(image0 && image1);
2742 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2743 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2750 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(
int(0x00020002)));
2753 const __m128i division16 = _mm_srli_epi16(sum, 2);
2759 _mm_storel_epi64((__m128i*)result, division8);
2764 ocean_assert(image0 && image1);
2767 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2768 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2775 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(
int(0x00020002)));
2778 const __m128i division16 = _mm_srli_epi16(sum, 2);
2784 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2785 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2792 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(
int(0x00020002)));
2795 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2802 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
2805 _mm_storeu_si128((__m128i*)result, division8);
2810 ocean_assert(image0 && image1 && result);
2815 const __m128 row0 = _mm_loadu_ps(image0);
2816 const __m128 row1 = _mm_loadu_ps(image1);
2819 const __m128 sumFirst = _mm_add_ps(row0, row1);
2822 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 2);
2823 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 2);
2826 const __m128 sumSecond = _mm_add_ps(rowSecond0, rowSecond1);
2831 const __m128 sumComponents = _mm_add_ps(sumFirst, _mm_shuffle_ps(sumSecond, sumSecond, 57u));
2834 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2838#ifdef OCEAN_COMPILER_MSC
2839 memcpy(result, &division.m128_f32[0],
sizeof(
float) * 3);
2841 memcpy(result, &division,
sizeof(
float) * 3);
2847 ocean_assert(image0 && image1 && result);
2849 __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2850 __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2858 __m128i shuffleMaskLow =
set128i(0xA0A0A0A0A008A007ull, 0xA006A002A001A000ull);
2859 __m128i shuffleMaskHigh =
set128i(0xA0A0A0A0A00BA00Aull, 0xA009A005A004A003ull);
2861 __m128i sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
2862 __m128i sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));
2865 __m128i sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(
int(0x00020002)));
2868 __m128i division16 = _mm_srli_epi16(sum, 2);
2871 __m128i division8 = _mm_shuffle_epi8(division16,
set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A00A0806040200ull));
2880 row0 = _mm_lddqu_si128((__m128i*)(image0 + 8));
2881 row1 = _mm_lddqu_si128((__m128i*)(image1 + 8));
2883 shuffleMaskLow =
set128i(0xA0A0A0A0A00CA00Bull, 0xA00AA006A005A004ull);
2884 shuffleMaskHigh =
set128i(0xA0A0A0A0A00FA00Eull, 0xA00DA009A008A007ull);
2886 sumLow = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskLow), _mm_shuffle_epi8(row1, shuffleMaskLow));
2887 sumHigh = _mm_add_epi16(_mm_shuffle_epi8(row0, shuffleMaskHigh), _mm_shuffle_epi8(row1, shuffleMaskHigh));
2890 sum = _mm_add_epi16(_mm_add_epi16(sumLow, sumHigh), _mm_set1_epi32(
int(0x00020002)));
2893 division16 = _mm_srli_epi16(sum, 2);
2896 division8 = _mm_or_si128(division8, _mm_shuffle_epi8(division16,
set128i(0xA0A0A0A00A080604ull, 0x0200A0A0A0A0A0A0ull)));
2898#ifdef OCEAN_COMPILER_MSC
2899 memcpy(result, &division8.m128i_u8[0], 12);
2901 memcpy(result, &division8, 12);
2907 ocean_assert(image0 && image1);
2910 const __m128 row0 = _mm_loadu_ps(image0);
2911 const __m128 row1 = _mm_loadu_ps(image1);
2914 const __m128 sumFirstPixel = _mm_add_ps(row0, row1);
2917 const __m128 rowSecond0 = _mm_loadu_ps(image0 + 4);
2918 const __m128 rowSecond1 = _mm_loadu_ps(image1 + 4);
2921 const __m128 sumSecondPixel = _mm_add_ps(rowSecond0, rowSecond1);
2924 const __m128 sumComponents = _mm_add_ps(sumFirstPixel, sumSecondPixel);
2927 const __m128 division = _mm_mul_ps(sumComponents, _mm_set_ps1(0.25f));
2930 _mm_storeu_ps(result, division);
2935 ocean_assert(image0 && image1);
2937 const __m128i row0 = _mm_lddqu_si128((__m128i*)image0);
2938 const __m128i row1 = _mm_lddqu_si128((__m128i*)image1);
2945 const __m128i sum = _mm_add_epi16(_mm_hadd_epi16(sumLow, sumHigh), _mm_set1_epi32(
int(0x00020002)));
2948 const __m128i division16 = _mm_srli_epi16(sum, 2);
2954 _mm_storel_epi64((__m128i*)result, division8);
2959 ocean_assert(image0 && image1);
2962 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
2963 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
2970 const __m128i firstSum = _mm_add_epi16(_mm_hadd_epi16(firstSumLow, firstSumHigh), _mm_set1_epi32(
int(0x00020002)));
2973 const __m128i firstDivision16 = _mm_srli_epi16(firstSum, 2);
2980 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 16));
2981 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 16));
2988 const __m128i secondSum = _mm_add_epi16(_mm_hadd_epi16(secondSumLow, secondSumHigh), _mm_set1_epi32(
int(0x00020002)));
2991 const __m128i secondDivision16 = _mm_srli_epi16(secondSum, 2);
2998 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
3001 _mm_storeu_si128((__m128i*)result, division8);
3006 ocean_assert(image0 && image1 && image2);
3015 const __m128i firstRow0 = _mm_lddqu_si128((__m128i*)image0);
3016 const __m128i firstRow1 = _mm_lddqu_si128((__m128i*)image1);
3017 const __m128i firstRow2 = _mm_lddqu_si128((__m128i*)image2);
3024 const __m128i secondRow0 = _mm_lddqu_si128((__m128i*)(image0 + 14));
3025 const __m128i secondRow1 = _mm_lddqu_si128((__m128i*)(image1 + 14));
3026 const __m128i secondRow2 = _mm_lddqu_si128((__m128i*)(image2 + 14));
3034 const __m128i firstSum = _mm_add_epi16(firstSumEven, _mm_add_epi16(firstSumOdd, _mm_set1_epi32(
int(0x00080008))));
3036 const __m128i firstSumWithEven = _mm_add_epi16(firstSum, _mm_shuffle_epi8(firstSumEven,
set128i(0xFFFF0F0E0B0AFFFFull, 0x09080504FFFF0302ull)));
3038 const __m128i firstSumWithBoth = _mm_add_epi16(firstSumWithEven, _mm_shuffle_epi8(firstSumOdd,
set128i(0xFFFF0D0C0908FFFFull, 0x07060302FFFF0100ull)));
3042 const __m128i secondSum = _mm_add_epi16(secondSumEven, _mm_add_epi16(secondSumOdd, _mm_set1_epi32(
int(0x00080008))));
3043 const __m128i secondSumWithEven = _mm_add_epi16(secondSum, _mm_shuffle_epi8(secondSumEven,
set128i(0x0F0EFFFF0D0C0908ull, 0xFFFF07060302FFFFull)));
3044 const __m128i secondSumWithBoth = _mm_add_epi16(secondSumWithEven, _mm_shuffle_epi8(secondSumOdd,
set128i(0x0D0CFFFF0B0A0706ull, 0xFFFF05040100FFFFull)));
3047 const __m128i firstDivision16 = _mm_srli_epi16(firstSumWithBoth, 4);
3048 const __m128i secondDivision16 = _mm_srli_epi16(secondSumWithBoth, 4);
3051 const __m128i firstDivision8 = _mm_shuffle_epi8(firstDivision16,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0C0A060400ull));
3052 const __m128i secondDivision8 = _mm_shuffle_epi8(secondDivision16,
set128i(0xFFFFFFFFFFFF0E0Aull, 0x080402FFFFFFFFFFull));
3055 const __m128i division8 = _mm_or_si128(firstDivision8, secondDivision8);
3058#ifdef OCEAN_COMPILER_MSC
3059 memcpy(result, &division8.m128i_u8[0], 10);
3061 memcpy(result, &division8, 10);
3074 const __m128i maskOdds = _mm_and_si128(value,
CV::SSE::set128i(0x0001000100010001ull, 0x0001000100010001ull));
3077 const __m128i maskNegatives = _mm_srli_epi16(_mm_and_si128(value,
CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull)), 15);
3081 return _mm_add_epi16(value, _mm_and_si128(maskNegatives, maskOdds));
3086 ocean_assert(rightShifts < 16u);
3089 const __m128i offsetForNegatives_s_16x8 = _mm_set1_epi16(
short((1u << rightShifts) - 1u));
3092 const __m128i maskHigh_s_16x8 =
CV::SSE::set128i(0x8000800080008000ull, 0x8000800080008000ull);
3095 const __m128i maskNegativeValues_s_16x8 = _mm_cmpeq_epi16(_mm_and_si128(value, maskHigh_s_16x8), maskHigh_s_16x8);
3098 const __m128i offset_s_16x8 = _mm_and_si128(offsetForNegatives_s_16x8, maskNegativeValues_s_16x8);
3100 return _mm_add_epi16(value, offset_s_16x8);
3110 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3112 const __m128i signMask_s16x8 = _mm_srai_epi16(value_s16x8, 15);
3114 const __m128i absValue_s16x8 = _mm_abs_epi16(value_s16x8);
3115 const __m128i offset_s16x8 = _mm_set1_epi16(1 << (rightShifts - 1));
3117 const __m128i absValueWithOffset_s16x8 = _mm_add_epi16(absValue_s16x8, offset_s16x8);
3119 const __m128i shifted_s16x8 = _mm_srai_epi16(absValueWithOffset_s16x8, rightShifts);
3121 return _mm_sub_epi16(_mm_xor_si128(shifted_s16x8, signMask_s16x8), signMask_s16x8);
3126 ocean_assert(rightShifts >= 1 && rightShifts <= 15);
3128 const int32_t maxValue = 32767 - (1 << (rightShifts - 1));
3132 return int16_t(maxValue);
3144 const __m128i maskOdds = _mm_and_si128(value,
CV::SSE::set128i(0x0000000100000001ull, 0x0000000100000001ull));
3147 const __m128i maskNegatives = _mm_srli_epi32(_mm_and_si128(value,
CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull)), 31);
3150 return _mm_add_epi32(value, _mm_and_si128(maskNegatives, maskOdds));
3155 ocean_assert(rightShifts < 32u);
3158 const __m128i offsetForNegatives_s_32x4 = _mm_set1_epi32(
int((1u << rightShifts) - 1u));
3161 const __m128i maskHigh_s_32x4 =
CV::SSE::set128i(0x8000000080000000ull, 0x8000000080000000ull);
3164 const __m128i maskNegativeValues_s_32x4 = _mm_cmpeq_epi32(_mm_and_si128(value, maskHigh_s_32x4), maskHigh_s_32x4);
3167 const __m128i offset_s_32x4 = _mm_and_si128(offsetForNegatives_s_32x4, maskNegativeValues_s_32x4);
3169 return _mm_add_epi32(value, offset_s_32x4);
3179 ocean_assert(source && response && width >= 10u);
3182 const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3183 const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3185 const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3186 const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3189 const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus);
3191 const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3193 const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus);
3195 const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3202 const __m128i horizontalGradient = _mm_or_si128(
3203 _mm_shuffle_epi8(horizontalGradientLo,
set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3204 _mm_shuffle_epi8(horizontalGradientHi,
set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3207 const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus);
3209 const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3211 const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus);
3213 const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3220 const __m128i verticalGradient = _mm_or_si128(
3221 _mm_shuffle_epi8(verticalGradientLo,
set128i(0x8080808080808080ull, 0x0E0C0A0806040200ull)),
3222 _mm_shuffle_epi8(verticalGradientHi,
set128i(0x0E0C0A0806040200ull, 0x8080808080808080ull)));
3225 const __m128i interleavedResponseLo = _mm_unpacklo_epi8(horizontalGradient, verticalGradient);
3226 const __m128i interleavedResponseHi = _mm_unpackhi_epi8(horizontalGradient, verticalGradient);
3228 ocean_assert(
sizeof(
char) == 1ull);
3229 _mm_storeu_si128((__m128i*)response, interleavedResponseLo);
3230 _mm_storeu_si128((__m128i*)(response + 16ull), interleavedResponseHi);
3235 ocean_assert(source && response && width >= 10u);
3238 const __m128i horizontalMinus = _mm_lddqu_si128((__m128i*)(source - 1));
3239 const __m128i horizontalPlus = _mm_lddqu_si128((__m128i*)(source + 1));
3241 const __m128i verticalMinus = _mm_lddqu_si128((__m128i*)(source - width));
3242 const __m128i verticalPlus = _mm_lddqu_si128((__m128i*)(source + width));
3245 const __m128i horizontalMinusLo = _mm_cvtepu8_epi16(horizontalMinus);
3246 const __m128i horizontalMinusHi = _mm_shuffle_epi8(horizontalMinus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3248 const __m128i horizontalPlusLo = _mm_cvtepu8_epi16(horizontalPlus);
3249 const __m128i horizontalPlusHi = _mm_shuffle_epi8(horizontalPlus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3256 const __m128i verticalMinusLo = _mm_cvtepu8_epi16(verticalMinus);
3257 const __m128i verticalMinusHi = _mm_shuffle_epi8(verticalMinus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3259 const __m128i verticalPlusLo = _mm_cvtepu8_epi16(verticalPlus);
3260 const __m128i verticalPlusHi = _mm_shuffle_epi8(verticalPlus,
set128i(0x800F800E800D800Cull, 0x800B800A80098008ull));
3267 const __m128i horizontalHorizontalLo = _mm_mullo_epi16(horizontalGradientLo, horizontalGradientLo);
3268 const __m128i horizontalHorizontalHi = _mm_mullo_epi16(horizontalGradientHi, horizontalGradientHi);
3270 const __m128i verticalVerticalLo = _mm_mullo_epi16(verticalGradientLo, verticalGradientLo);
3271 const __m128i verticalVerticalHi = _mm_mullo_epi16(verticalGradientHi, verticalGradientHi);
3273 const __m128i horzontalVerticalLo = _mm_mullo_epi16(horizontalGradientLo, verticalGradientLo);
3274 const __m128i horzontalVerticalHi = _mm_mullo_epi16(horizontalGradientHi, verticalGradientHi);
3297 const __m128i block0Lo = _mm_or_si128(
3299 _mm_shuffle_epi8(horizontalHorizontalLo,
set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)),
3300 _mm_shuffle_epi8(verticalVerticalLo,
set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))),
3301 _mm_shuffle_epi8(horzontalVerticalLo,
set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull)));
3303 const __m128i block1Lo = _mm_or_si128(
3305 _mm_shuffle_epi8(horizontalHorizontalLo,
set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)),
3306 _mm_shuffle_epi8(verticalVerticalLo,
set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))),
3307 _mm_shuffle_epi8(horzontalVerticalLo,
set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull)));
3309 const __m128i block2Lo = _mm_or_si128(
3311 _mm_shuffle_epi8(horizontalHorizontalLo,
set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)),
3312 _mm_shuffle_epi8(verticalVerticalLo,
set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))),
3313 _mm_shuffle_epi8(horzontalVerticalLo,
set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull)));
3315 const __m128i block0Hi = _mm_or_si128(
3317 _mm_shuffle_epi8(horizontalHorizontalHi,
set128i(0xFFFF0504FFFFFFFFull, 0x0302FFFFFFFF0100ull)),
3318 _mm_shuffle_epi8(verticalVerticalHi,
set128i(0x0504FFFFFFFF0302ull, 0xFFFFFFFF0100FFFFull))),
3319 _mm_shuffle_epi8(horzontalVerticalHi,
set128i(0xFFFFFFFF0302FFFFull, 0xFFFF0100FFFFFFFFull)));
3321 const __m128i block1Hi = _mm_or_si128(
3323 _mm_shuffle_epi8(horizontalHorizontalHi,
set128i(0x0B0AFFFFFFFF0908ull, 0xFFFFFFFF0706FFFFull)),
3324 _mm_shuffle_epi8(verticalVerticalHi,
set128i(0xFFFFFFFF0908FFFFull, 0xFFFF0706FFFFFFFFull))),
3325 _mm_shuffle_epi8(horzontalVerticalHi,
set128i(0xFFFF0908FFFFFFFFull, 0x0706FFFFFFFF0504ull)));
3327 const __m128i block2Hi = _mm_or_si128(
3329 _mm_shuffle_epi8(horizontalHorizontalHi,
set128i(0xFFFFFFFF0F0EFFFFull, 0xFFFF0D0CFFFFFFFFull)),
3330 _mm_shuffle_epi8(verticalVerticalHi,
set128i(0xFFFF0F0EFFFFFFFFull, 0x0D0CFFFFFFFF0B0Aull))),
3331 _mm_shuffle_epi8(horzontalVerticalHi,
set128i(0x0F0EFFFFFFFF0D0Cull, 0xFFFFFFFF0B0AFFFFull)));
3333 _mm_storeu_si128((__m128i*)response, block0Lo);
3334 _mm_storeu_si128((__m128i*)(response + 8ull), block1Lo);
3335 _mm_storeu_si128((__m128i*)(response + 16ull), block2Lo);
3336 _mm_storeu_si128((__m128i*)(response + 24ull), block0Hi);
3337 _mm_storeu_si128((__m128i*)(response + 32ull), block1Hi);
3338 _mm_storeu_si128((__m128i*)(response + 40ull), block2Hi);
3348 channel01 = _mm_shuffle_epi8(interleaved,
set128i(0xFFFFFF0d0a070401ull, 0xFFFFFF0c09060300ull));
3350 channel2 = _mm_shuffle_epi8(interleaved,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull));
3361 channel01 = _mm_or_si128(_mm_shuffle_epi8(interleavedA,
set128i(0xFFFFFF0d0a070401ull, 0xFFFF0f0c09060300ull)),
3362 _mm_shuffle_epi8(interleavedB,
set128i(0x060300FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));
3364 channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3365 _mm_shuffle_epi8(interleavedB,
set128i(0xFFFFFFFFFFFFFFFFull, 0x070401FFFFFFFFFFull)));
3370 channel0 = _mm_or_si128(_mm_shuffle_epi8(interleavedA,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFF0f0c09060300ull)),
3371 _mm_or_si128(_mm_shuffle_epi8(interleavedB,
set128i(0xFFFFFFFFFF0e0b08ull, 0x0502FFFFFFFFFFFFull)),
3372 _mm_shuffle_epi8(interleavedC,
set128i(0x0d0a070401FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3374 channel1 = _mm_or_si128(_mm_shuffle_epi8(interleavedA,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
3375 _mm_or_si128(_mm_shuffle_epi8(interleavedB,
set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
3376 _mm_shuffle_epi8(interleavedC,
set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3378 channel2 = _mm_or_si128(_mm_shuffle_epi8(interleavedA,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0e0b080502ull)),
3379 _mm_or_si128(_mm_shuffle_epi8(interleavedB,
set128i(0xFFFFFFFFFFFF0d0aull, 0x070401FFFFFFFFFFull)),
3380 _mm_shuffle_epi8(interleavedC,
set128i(0x0f0c09060300FFFFull, 0xFFFFFFFFFFFFFFFFull))));
3385 ocean_assert(interleaved !=
nullptr);
3392 ocean_assert(interleaved && channel0 && channel1 && channel2);
3394 __m128i channel0_128, channel1_128, channel2_128;
3404 ocean_assert(interleaved !=
nullptr);
/**
 * Merges three separate 16-byte channel registers into three 16-byte registers holding the
 * bytes in interleaved order (channel0, channel1, channel2, channel0, channel1, channel2, ...).
 *
 * Every output register is built the same way: each input channel is passed through
 * _mm_shuffle_epi8 with a constant mask that moves the needed bytes to their target positions,
 * and the three shuffle results are combined with a bitwise OR. A mask byte of 0xFF has its
 * most significant bit set, so the shuffle writes zero at that position; the three masks of one
 * output register never select the same output byte, which is why OR is sufficient.
 *
 * NOTE(review): the byte positions described below assume set128i(high64, low64), i.e. the
 * second argument provides the low eight bytes of the mask - confirm against set128i().
 *
 * @param channel0 The 16 bytes of the first channel
 * @param channel1 The 16 bytes of the second channel
 * @param channel2 The 16 bytes of the third channel
 * @param interleavedA Receives the first 16 interleaved bytes
 * @param interleavedB Receives the second 16 interleaved bytes
 * @param interleavedC Receives the third 16 interleaved bytes
 */
3409OCEAN_FORCE_INLINE
void SSE::interleave3Channel8Bit48Elements(
const __m128i& channel0,
const __m128i& channel1,
const __m128i& channel2, __m128i& interleavedA, __m128i& interleavedB, __m128i& interleavedC)
// interleavedA: channel0 bytes 0..5 go to output bytes 0, 3, 6, 9, 12, 15;
// channel1 bytes 0..4 go to output bytes 1, 4, 7, 10, 13;
// channel2 bytes 0..4 go to output bytes 2, 5, 8, 11, 14
3411 interleavedA = _mm_or_si128(_mm_shuffle_epi8(channel0,
set128i(0x05FFFF04FFFF03FFull, 0xFF02FFFF01FFFF00ull)),
3412 _mm_or_si128(_mm_shuffle_epi8(channel1,
set128i(0xFFFF04FFFF03FFFFull, 0x02FFFF01FFFF00FFull)),
3413 _mm_shuffle_epi8(channel2,
set128i(0xFF04FFFF03FFFF02ull, 0xFFFF01FFFF00FFFFull))));
// interleavedB: channel0 bytes 6..10 go to output bytes 2, 5, 8, 11, 14;
// channel1 bytes 5..10 go to output bytes 0, 3, 6, 9, 12, 15;
// channel2 bytes 5..9 go to output bytes 1, 4, 7, 10, 13
3415 interleavedB = _mm_or_si128(_mm_shuffle_epi8(channel0,
set128i(0xFF0AFFFF09FFFF08ull, 0xFFFF07FFFF06FFFFull)),
3416 _mm_or_si128(_mm_shuffle_epi8(channel1,
set128i(0x0AFFFF09FFFF08FFull, 0xFF07FFFF06FFFF05ull)),
3417 _mm_shuffle_epi8(channel2,
set128i(0xFFFF09FFFF08FFFFull, 0x07FFFF06FFFF05FFull))));
// interleavedC: channel0 bytes 11..15 go to output bytes 1, 4, 7, 10, 13;
// channel1 bytes 11..15 go to output bytes 2, 5, 8, 11, 14;
// channel2 bytes 10..15 go to output bytes 0, 3, 6, 9, 12, 15
3419 interleavedC = _mm_or_si128(_mm_shuffle_epi8(channel0,
set128i(0xFFFF0FFFFF0EFFFFull, 0x0DFFFF0CFFFF0BFFull)),
3420 _mm_or_si128(_mm_shuffle_epi8(channel1,
set128i(0xFF0FFFFF0EFFFF0Dull, 0xFFFF0CFFFF0BFFFFull)),
3421 _mm_shuffle_epi8(channel2,
set128i(0x0FFFFF0EFFFF0DFFull, 0xFF0CFFFF0BFFFF0Aull))));
3426 ocean_assert(channel0 && channel1 && channel2 && interleaved);
3428 __m128i interleavedA_128, interleavedB_128, interleavedC_128;
3431 store128i(interleavedA_128, interleaved + 0);
3432 store128i(interleavedB_128, interleaved + 16);
3433 store128i(interleavedC_128, interleaved + 32);
3438 ocean_assert(interleaved !=
nullptr);
3442 const __m128i shuffleMask0 = _mm_set_epi8(5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 0);
3443 const __m128i interleaved0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);
3445 const __m128i shuffleMask1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 7, 7, 7, 6, 6, 6, 5, 5);
3446 const __m128i interleaved1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);
3448 _mm_storeu_si128((__m128i*)(interleaved + 0), interleaved0);
3449 _mm_storel_epi64((__m128i*)(interleaved + 16), interleaved1);
3454 ocean_assert(interleaved !=
nullptr);
3458 const __m128i shuffleMask0 = _mm_set_epi8(-128, 3, 3, 3, -128, 2, 2, 2, -128, 1, 1, 1, -128, 0, 0, 0);
3459 const __m128i shuffleMask1 = _mm_set_epi8(-128, 7, 7, 7, -128, 6, 6, 6, -128, 5, 5, 5, -128, 4, 4, 4);
3462 __m128i result0 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask0);
3463 __m128i result1 = _mm_shuffle_epi8(singleChannel_u_8x8, shuffleMask1);
3465 const __m128i channel4Mask = _mm_set_epi8(-1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0);
3467 const __m128i lastChannelValue_u_8x16 = _mm_set1_epi8(
char(lastChannelValue));
3469 result0 = _mm_blendv_epi8(result0, lastChannelValue_u_8x16, channel4Mask);
3470 result1 = _mm_blendv_epi8(result1, lastChannelValue_u_8x16, channel4Mask);
3472 _mm_storeu_si128((__m128i*)(interleaved + 0), result0);
3473 _mm_storeu_si128((__m128i*)(interleaved + 16), result1);
3478 ocean_assert(interleaved !=
nullptr && reversedInterleaved !=
nullptr);
3485 const __m128i shuffleMask_u_16x8 =
set128i(0x0E0F0C0D0A0B0809ull, 0x0607040502030001ull);
3487 store128i(_mm_shuffle_epi8(
load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3488 store128i(_mm_shuffle_epi8(
load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
/**
 * Reverses the byte order inside every 3-byte group of 48 interleaved bytes held in three
 * registers (e.g. c0 c1 c2 becomes c2 c1 c0 for each of the 16 groups).
 *
 * 16 is not a multiple of 3, so two groups straddle a register boundary (bytes 15..17 and
 * bytes 30..32 of the 48-byte sequence). Each output register is therefore assembled from
 * shuffles of up to three input registers that are combined with a bitwise OR; mask bytes
 * with the most significant bit set (0xFF) make _mm_shuffle_epi8 write zero, so the partial
 * results never overlap.
 *
 * NOTE(review): the byte positions described below assume set128i(high64, low64), i.e. the
 * second argument provides the low eight bytes of the mask - confirm against set128i().
 *
 * @param interleaved0 The first 16 interleaved input bytes
 * @param interleaved1 The second 16 interleaved input bytes
 * @param interleaved2 The third 16 interleaved input bytes
 * @param reversedInterleaved0 Receives the first 16 output bytes
 * @param reversedInterleaved1 Receives the second 16 output bytes
 * @param reversedInterleaved2 Receives the third 16 output bytes
 */
3491OCEAN_FORCE_INLINE
void SSE::reverseChannelOrder3Channel8Bit48Elements(
const __m128i& interleaved0,
const __m128i& interleaved1,
const __m128i& interleaved2, __m128i& reversedInterleaved0, __m128i& reversedInterleaved1, __m128i& reversedInterleaved2)
// output bytes 0..14: the five complete groups of interleaved0, each reversed;
// output byte 15: byte 1 of interleaved1 (the last byte of the group starting at byte 15)
3493 reversedInterleaved0 = _mm_or_si128(_mm_shuffle_epi8(interleaved0,
set128i(0xFF0c0d0e090a0b06ull, 0x0708030405000102ull)),
3494 _mm_shuffle_epi8(interleaved1,
set128i(0x01FFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull)));
// output byte 0 and byte 15 are the middle bytes of the two straddling groups and stay in place;
// output byte 1 comes from byte 15 of interleaved0, output byte 14 from byte 0 of interleaved2;
// output bytes 2..13: the four complete groups of interleaved1, each reversed
3496 reversedInterleaved1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFF0fFFull)),
3497 _mm_or_si128(_mm_shuffle_epi8(interleaved1,
set128i(0x0fFF0b0c0d08090aull, 0x050607020304FF00ull)),
3498 _mm_shuffle_epi8(interleaved2,
set128i(0xFF00FFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
// output byte 0: byte 14 of interleaved1 (the first byte of the group ending at byte 0 of interleaved2);
// output bytes 1..15: the five complete groups of interleaved2, each reversed
3500 reversedInterleaved2 = _mm_or_si128(_mm_shuffle_epi8(interleaved1,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFF0eull)),
3501 _mm_shuffle_epi8(interleaved2,
set128i(0x0d0e0f0a0b0c0708ull, 0x09040506010203FFull)));
3506 ocean_assert(interleaved !=
nullptr && reversedInterleaved !=
nullptr);
3508 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3511 store128i(reversedInterleaved0, reversedInterleaved);
3512 store128i(reversedInterleaved1, reversedInterleaved + 16);
3513 store128i(reversedInterleaved2, reversedInterleaved + 32);
3518 ocean_assert(interleaved !=
nullptr && reversedInterleaved !=
nullptr);
3525 const __m128i shuffleMask_u_16x8 =
set128i(0x0C0D0E0F08090A0Bull, 0x0405060700010203ull);
3527 store128i(_mm_shuffle_epi8(
load128i(interleaved + 0), shuffleMask_u_16x8), reversedInterleaved + 0);
3528 store128i(_mm_shuffle_epi8(
load128i(interleaved + 16), shuffleMask_u_16x8), reversedInterleaved + 16);
3529 store128i(_mm_shuffle_epi8(
load128i(interleaved + 32), shuffleMask_u_16x8), reversedInterleaved + 32);
3530 store128i(_mm_shuffle_epi8(
load128i(interleaved + 48), shuffleMask_u_16x8), reversedInterleaved + 48);
3535 ocean_assert(interleaved);
3537 __m128i reversedInterleaved0, reversedInterleaved1, reversedInterleaved2;
3540 store128i(reversedInterleaved0, interleaved);
3541 store128i(reversedInterleaved1, interleaved + 16);
3542 store128i(reversedInterleaved2, interleaved + 32);
3547 ocean_assert(first && second && first != second);
3549 __m128i first0, first1, first2;
3552 __m128i second0, second1, second2;
/**
 * Reverses the order of 48 bytes that are provided in three 16-byte registers.
 *
 * The reversal is done in two steps: one shuffle mask reverses the 16 bytes inside a register,
 * and the registers themselves are written in swapped order (elements2 -> reversedElements0,
 * elements1 -> reversedElements1, elements0 -> reversedElements2).
 *
 * @param elements0 The first 16 input bytes
 * @param elements1 The second 16 input bytes
 * @param elements2 The third 16 input bytes
 * @param reversedElements0 Receives the first 16 bytes of the reversed sequence
 * @param reversedElements1 Receives the second 16 bytes of the reversed sequence
 * @param reversedElements2 Receives the third 16 bytes of the reversed sequence
 */
3564inline void SSE::reverseElements8Bit48Elements(
const __m128i& elements0,
const __m128i& elements1,
const __m128i& elements2, __m128i& reversedElements0, __m128i& reversedElements1, __m128i& reversedElements2)
// shuffle mask selecting source bytes 15, 14, ..., 0 for output bytes 0, 1, ..., 15
// NOTE(review): assumes set128i(high64, low64), i.e. the second argument provides the low eight mask bytes - confirm
3566 const __m128i mask =
set128i(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
// the last input register becomes the first output register, and vice versa
3568 reversedElements0 = _mm_shuffle_epi8(elements2, mask);
3569 reversedElements1 = _mm_shuffle_epi8(elements1, mask);
3570 reversedElements2 = _mm_shuffle_epi8(elements0, mask);
3575 ocean_assert(elements && reversedElements);
3577 __m128i reversedElements0, reversedElements1, reversedElements2;
3580 store128i(reversedElements0, reversedElements);
3581 store128i(reversedElements1, reversedElements + 16);
3582 store128i(reversedElements2, reversedElements + 32);
3587 ocean_assert(elements);
3589 __m128i reversedElements0, reversedElements1, reversedElements2;
3593 store128i(reversedElements1, elements + 16);
3594 store128i(reversedElements2, elements + 32);
3599 ocean_assert(first && second && first != second);
3601 __m128i first0, first1, first2;
3604 __m128i second0, second1, second2;
3618 ocean_assert(elements && shiftedElements);
3620 store128i(_mm_shuffle_epi8(
load128i(elements),
set128i(0x0c0f0e0d080b0a09ull, 0x0407060500030201ull)), shiftedElements);
3625 ocean_assert(elements && shiftedElements);
3627 store128i(_mm_shuffle_epi8(
load128i(elements),
set128i(0x0003020104070605ull, 0x080b0a090c0f0e0dull)), shiftedElements);
3632 ocean_assert(elements && shiftedElements);
3634 store128i(_mm_shuffle_epi8(
load128i(elements),
set128i(0x0e0d0c0f0a09080bull, 0x0605040702010003ull)), shiftedElements);
3639 ocean_assert(elements && shiftedElements);
3641 store128i(_mm_shuffle_epi8(
load128i(elements),
set128i(0x0201000306050407ull, 0x0a09080b0e0d0c0full)), shiftedElements);
3646 const __m128i zero = _mm_setzero_si128();
3647 const __m128i sum = _mm_sad_epu8(elements, zero);
3649 return _mm_add_epi32(_mm_srli_si128(sum, 8), sum);
3654 ocean_assert(elements !=
nullptr);
3659template <
bool tBufferHas16Bytes>
3662 ocean_assert(elements !=
nullptr);
3668 ocean_assert(elements !=
nullptr);
3679 const __m128i channel0_2First = _mm_or_si128(_mm_shuffle_epi8(interleaved0,
set128i(0xFFFFFF0e0b080502ull, 0xFFFF0f0c09060300ull)),
3680 _mm_shuffle_epi8(interleaved1,
set128i(0x070401FFFFFFFFFFull, 0x0502FFFFFFFFFFFFull)));
3683 const __m128i channel0_2Second = _mm_or_si128(_mm_shuffle_epi8(interleaved1,
set128i(0xFFFFFFFFFFFF0d0aull, 0xFFFFFFFFFF0e0b08ull)),
3684 _mm_shuffle_epi8(interleaved2,
set128i(0x0f0c09060300FFFFull, 0x0d0a070401FFFFFFull)));
3687 const __m128i channel1 = _mm_or_si128(_mm_shuffle_epi8(interleaved0,
set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFF0d0a070401ull)),
3688 _mm_or_si128(_mm_shuffle_epi8(interleaved1,
set128i(0xFFFFFFFFFF0f0c09ull, 0x060300FFFFFFFFFFull)),
3689 _mm_shuffle_epi8(interleaved2,
set128i(0x0e0b080502FFFFFFull, 0xFFFFFFFFFFFFFFFFull))));
3691 const __m128i zero = _mm_setzero_si128();
3694 const __m128i sum0_2 = _mm_add_epi32(_mm_sad_epu8(channel0_2First, zero), _mm_sad_epu8(channel0_2Second, zero));
3697 const __m128i sum1 = _mm_sad_epu8(channel1, zero);
3700 return _mm_blend_epi16(sum0_2, _mm_add_epi32(_mm_slli_si128(sum1, 4), _mm_srli_si128(sum1, 4)),
int(0xC));
3705 ocean_assert(interleaved !=
nullptr);
3712 ocean_assert(interleaved !=
nullptr);
3719 ocean_assert(buffer !=
nullptr);
3720 return _mm_loadl_epi64((
const __m128i*)(buffer));
3725 ocean_assert(buffer !=
nullptr);
3726 return _mm_lddqu_si128((
const __m128i*)(buffer));
3729template <
bool tBufferHas16Bytes>
3732 ocean_assert(buffer !=
nullptr);
3736#ifdef OCEAN_COMPILER_MSC
3738 result.m128i_u64[0] = uint64_t(0);
3739 memcpy(result.m128i_u16 + 3, buffer + 0,
sizeof(uint16_t));
3740 memcpy(result.m128i_u64 + 1, buffer + 2,
sizeof(uint64_t));
3747 memcpy(ourResult.
m128i_u16 + 3, buffer + 0,
sizeof(uint16_t));
3748 memcpy(ourResult.
m128i_u64 + 1, buffer + 2,
sizeof(uint64_t));
3756inline __m128i SSE::load_u8_10_upper_zero<true>(
const uint8_t*
const buffer)
3758 ocean_assert(buffer !=
nullptr);
3764template <
bool tBufferHas16Bytes>
3767 ocean_assert(buffer !=
nullptr);
3769 __m128i intermediate;
3770 memcpy(&intermediate, buffer, 15);
3773 return _mm_slli_si128(intermediate, 1);
// Specialization for tBufferHas16Bytes == true.
// Reads a full 16 bytes from 'buffer' with one unaligned load, so 16 bytes must be readable.
// buffer: source memory, must not be null (checked by ocean_assert).
// Returns the register after a left shift by one byte.
3777inline __m128i SSE::load_u8_15_upper_zero<true>(
const uint8_t*
const buffer)
3779 ocean_assert(buffer !=
nullptr);
// The byte-wise left shift moves buffer[0..14] to register bytes 1..15, sets byte 0 to zero
// and drops buffer[15].
3782 return _mm_slli_si128(_mm_lddqu_si128((__m128i*)(buffer)), 1);
3785template <
bool tBufferHas16Bytes>
3788 ocean_assert(buffer !=
nullptr);
3791 memcpy(&result, buffer, 13);
// Specialization for tBufferHas16Bytes == true.
// buffer: source memory, must not be null (checked by ocean_assert); 16 bytes are read.
// Returns all 16 loaded bytes unchanged, so the three bytes following the first 13
// contain whatever is stored behind them in memory.
3797inline __m128i SSE::load_u8_13_lower_random<true>(
const uint8_t*
const buffer)
3799 ocean_assert(buffer !=
nullptr);
// One unaligned 16-byte load; no masking or shifting is applied.
3802 return _mm_lddqu_si128((__m128i*)(buffer));
3805template <
bool tBufferHas16Bytes>
3808 ocean_assert(buffer !=
nullptr);
3811 memcpy(&result, buffer, 15);
3813#ifdef OCEAN_COMPILER_MSC
3814 result.m128i_u8[15] = 0u;
3816 ((
M128i&)result).m128i_u8[15] = 0u;
3823inline __m128i SSE::load_u8_15_lower_zero<true>(
const uint8_t*
const buffer)
3825 ocean_assert(buffer !=
nullptr);
3828 __m128i result = _mm_lddqu_si128((__m128i*)(buffer));
3830#ifdef OCEAN_COMPILER_MSC
3831 result.m128i_u8[15] = 0u;
3833 ((
M128i&)result).m128i_u8[15] = 0u;
3839template <
bool tBufferHas16Bytes>
3842 ocean_assert(buffer !=
nullptr);
3845 memcpy(&result, buffer, 15);
// Specialization for tBufferHas16Bytes == true.
// buffer: source memory, must not be null (checked by ocean_assert); 16 bytes are read.
// Returns all 16 loaded bytes unchanged, so the byte following the first 15
// contains whatever is stored behind them in memory.
3851inline __m128i SSE::load_u8_15_lower_random<true>(
const uint8_t*
const buffer)
3853 ocean_assert(buffer !=
nullptr);
// One unaligned 16-byte load; no masking or shifting is applied.
3856 return _mm_lddqu_si128((__m128i*)(buffer));
3859template <
unsigned int tShiftBytes>
3862 static_assert(tShiftBytes <= 16u,
"Invalid shift!");
3864 ocean_assert(buffer !=
nullptr);
3865 return _mm_srli_si128(_mm_lddqu_si128((__m128i*)(buffer)), tShiftBytes);
3870 ocean_assert(buffer !=
nullptr);
3871 _mm_storeu_si128((__m128i*)(buffer), value);
3874inline __m128i
SSE::set128i(
const unsigned long long high64,
const unsigned long long low64)
3880 return _mm_set_epi64x(high64, low64);
3882 return _mm_set_epi32(*(((
int*)&high64) + 1), *((
int*)&high64), *(((
int*)&low64) + 1), *((
int*)&low64));
3887 return _mm_set_epi64x(high64, low64);
3895 return _mm_and_si128(value, _mm_set1_epi32(
int(0x0000FFFFu)));
3900 return _mm_and_si128(value, _mm_set1_epi32(
int(0xFFFF0000u)));
3905 return _mm_and_si128(value, _mm_set1_epi32(
int(0x00FF00FFu)));
3910 return _mm_and_si128(value,
set128i(0x000000FF00FF00FFull, 0x00FF00FF00FF00FFull));
3915 return _mm_and_si128(value,
set128i(0x00FF00FF00FF00FFull, 0x00FF00FF00FF0000ull));
3920 return _mm_shuffle_epi8(value,
set128i(0xA0A0A0A0A0A0A0A0ull, 0x0E0C0A0806040200ull));
3925 return _mm_shuffle_epi8(value,
set128i(0xA0A0A0A0A0A0A0A0ull, 0xA0A0A0A00C080400ull));
3930 return _mm_shuffle_epi8(value,
set128i(0xA0A0A0A0A0A0A0A0ull, 0x0D0C090805040100ull));
3935 return _mm_shuffle_epi8(value,
set128i(0x0E0C0A0806040200ull, 0xA0A0A0A0A0A0A0A0ull));
3941 return _mm_srli_epi32(value, 16);
3946 return _mm_shuffle_epi8(value,
set128i(0xA00FA00DA00BA009ull, 0xA007A005A003A001ull));
3951 return _mm_shuffle_epi8(value,
set128i(0xA0A0A0A0A0A0A009ull, 0xA007A005A003A001ull));
3956 return _mm_shuffle_epi8(value,
set128i(0xFFFFFFFFFF0bFF09ull, 0xFF07FF05FF03FF01ull));
3961 return _mm_shuffle_epi8(value,
set128i(0xA0A0A00DA00BA009ull, 0xA007A005A003A001ull));
3966 return _mm_shuffle_epi8(value,
set128i(0xA0A0A003A0A0A002ull, 0xA0A0A001A0A0A000ull));
3976 return _mm_shuffle_epi8(value,
set128i(0xA007A003A006A002ull, 0xA005A001A004A000ull));
3981 return _mm_shuffle_epi8(value,
set128i(0xA00FA00BA00EA00Aull, 0xA00DA009A00CA008ull));
3986 return _mm_shuffle_epi8(value,
set128i(0xFF07FF05FF06FF04ull, 0xFF03FF01FF02FF00ull));
3991 return _mm_shuffle_epi8(value,
set128i(0xFF0FFF0DFF0EFF0Cull, 0xFF0BFF09FF0AFF08ull));
3996 return _mm_set1_epi32(
int(0x00FF00FFu));
4001 return _mm_set1_epi32(
int(0x0000FFFFu));
4006 const __m128i lowProducts = _mm_mullo_epi16(values0, values1);
4007 const __m128i highProducts = _mm_mulhi_epi16(values0, values1);
4009 products0 = _mm_unpacklo_epi16(lowProducts, highProducts);
4010 products1 = _mm_unpackhi_epi16(lowProducts, highProducts);
4019 results0 = _mm_add_epi32(results0, products0);
4020 results1 = _mm_add_epi32(results1, products1);
// Computes a rounded weighted sum of four 8-bit samples.
// pixel: pointer to the first sample, must not be null (checked by ocean_assert).
// size: offset (in bytes) from 'pixel' to the second pair of samples
//       (presumably the offset to the next image row - TODO confirm against callers).
// fx_y_, fxy_, fx_y, fxy: the four weights, which must sum to 128 * 128 = 16384.
// Returns the weighted sum divided by 16384, rounded to nearest.
// NOTE(review): only the bytes at offsets 0, 2, size and size + 2 are read, i.e. one byte
// of each 2-byte pixel.
4023inline unsigned int SSE::interpolation2Channel16Bit1x1(
const uint8_t*
const pixel,
const unsigned int size,
const unsigned int fx_y_,
const unsigned int fxy_,
const unsigned int fx_y,
const unsigned int fxy)
4025 ocean_assert(pixel);
4026 ocean_assert(fx_y_ + fxy_ + fx_y + fxy == 128u * 128u);
// 8192 is half of the divisor 16384 and provides rounding to nearest.
4028 return (pixel[0] * fx_y_ + pixel[2] * fxy_ + pixel[size] * fx_y + pixel[size + 2u] * fxy + 8192u) / 16384u;
4031inline unsigned int SSE::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int ,
const unsigned int size1,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
4033 ocean_assert(pixel0 && pixel1);
4035 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
// Interpolates one value in each of two inputs and returns sqrDistance() of both values.
// pixel0, pixel1: pointers into the two inputs, both must not be null (checked by ocean_assert).
// size0, size1: forwarded as 'size' to interpolation2Channel16Bit1x1() for the respective input.
// f0x_y_, f0xy_, f0x_y, f0xy: the four weights for the first input, must sum to 128 * 128.
// f1x_y_, f1xy_, f1x_y, f1xy: the four weights for the second input, must sum to 128 * 128.
// NOTE(review): sqrDistance() is defined outside this block; presumably it returns the
// squared difference of its two arguments - confirm.
4040inline unsigned int SSE::ssd2Channel16Bit1x1(
const uint8_t*
const pixel0,
const uint8_t*
const pixel1,
const unsigned int size0,
const unsigned int size1,
const unsigned int f0x_y_,
const unsigned int f0xy_,
const unsigned int f0x_y,
const unsigned int f0xy,
const unsigned int f1x_y_,
const unsigned int f1xy_,
const unsigned int f1x_y,
const unsigned int f1xy)
4042 ocean_assert(pixel0 && pixel1);
4044 ocean_assert(f0x_y_ + f0xy_ + f0x_y + f0xy == 128u * 128u);
4045 ocean_assert(f1x_y_ + f1xy_ + f1x_y + f1xy == 128u * 128u);
// Both inputs are interpolated independently with their own weights before they are compared.
4047 return sqrDistance(
interpolation2Channel16Bit1x1(pixel0, size0, f0x_y_, f0xy_, f0x_y, f0xy),
interpolation2Channel16Bit1x1(pixel1, size1, f1x_y_, f1xy_, f1x_y, f1xy));
This class implements computer vision functions using SSE extensions.
Definition SSE.h:41
static __m128i divideByRightShiftSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Divides four signed 32 bit values by applying a right shift.
Definition SSE.h:3172
static void average32Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2762
static void gradientHorizontalVertical8Elements1Channel8Bit(const uint8_t *source, int8_t *response, const unsigned int width)
Determines the horizontal and the vertical gradients for 16 following pixels for a given 1 channel 8 ...
Definition SSE.h:3177
static unsigned int sum_u32_first_2(const __m128i &value)
Adds the first two individual 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1368
static void average24Elements3Channel24Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 24 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2845
static void prefetchT2(const void *const data)
Prefetches a block of temporal memory in all cache levels, except 0th and 1st cache levels.
Definition SSE.h:1302
static void reverseElements8Bit48Elements(const __m128i &elements0, const __m128i &elements1, const __m128i &elements2, __m128i &reversedElements0, __m128i &reversedElements1, __m128i &reversedElements2)
Reverses the order of 48 elements with 8 bit per element.
Definition SSE.h:3564
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition SSE.h:3723
static void average16Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2737
static __m128i load_u8_16_and_shift_right(const uint8_t *const buffer)
Loads 16 bytes from memory which is at least 16 bytes large and shifts the 128i value by a specified ...
Definition SSE.h:3860
static __m128i moveLowBits32_16ToLow64(const __m128i &value)
Moves the lower 16 bits of four 32 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3928
static __m128i moveLowBits32_8ToLow32(const __m128i &value)
Moves the lower 8 bits of four 32 bit elements to the lower 32 bits and fills the high 96 bits with 0...
Definition SSE.h:3923
static __m128i moveHighBits16_8_6(const __m128i &value)
Moves the higher 8 bits of six 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3954
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned32Bit(const __m128i &value)
Adds 1 to each signed 32 bit value which is both, negative and odd, so that each value can be right s...
Definition SSE.h:3135
static OCEAN_FORCE_INLINE double sum_f64_2(const __m128d &value)
Adds the two (all two) individual 64 bit floats of a m128d value and returns the result.
Definition SSE.h:1395
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit15Elements(const __m128i &interleaved, __m128i &channel01, __m128i &channel2)
Deinterleaves 15 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3341
static void store128i(const __m128i &value, uint8_t *const buffer)
Stores a 128i value to the memory.
Definition SSE.h:3868
static __m128i sumSquareDifference8BitFront13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 13 elements of a buffer with 8 bit precision.
Definition SSE.h:1473
static __m128i sumInterleave3Channel8Bit45Elements(const uint8_t *interleaved)
Sums 15 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3710
static __m128i moveLowBits16_8ToHigh64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the higher 64 bits and fills the low 64 bits with ...
Definition SSE.h:3933
static __m128i divideByRightShiftSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Divides eight int16_t values by applying a right shift.
Definition SSE.h:3103
static __m128i shuffleNeighbor4High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3979
static void swapReversedElements8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of two sets of 48 elements with 8 bit per element and further swaps both sets.
Definition SSE.h:3597
static __m128i sumAbsoluteDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the last 11 elements of a 16 elements buffer with 8 bit pr...
Definition SSE.h:1411
static void average8ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 8 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2505
static __m128i interpolation1Channel8Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:1620
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8AndAccumulate(const __m128i &values0, const __m128i &values1, __m128i &results0, __m128i &results1)
Multiplies 8 int16_t values with 8 int16_t values and adds the products to 8 int32_t values.
Definition SSE.h:4013
static __m128i sumSquareDifference8BitBack13Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 13 elements of a 16 elements buffer with 8 bit prec...
Definition SSE.h:1500
static unsigned int sum_u32_first_third(const __m128i &value)
Adds the first and the third 32 bit unsigned integer values of a m128i value and returns the result.
Definition SSE.h:1377
static __m128i sumSquareDifference8BitFront12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 12 elements of a 16 elements buffer with 8 bit pre...
Definition SSE.h:1418
static void average16ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 16 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2562
static __m128i moveHighBits16_8_5(const __m128i &value)
Moves the higher 8 bits of five 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3949
static int16_t maximalValueForRoundedDivisionByRightShiftSigned16Bit(const unsigned int rightShifts)
Returns the maximal value for which the function roundedDivideByRightShiftSigned16Bit() can be applie...
Definition SSE.h:3124
static __m128i shuffleLow32ToLow32_8(const __m128i &value)
Shuffles the lower four 8 bits to the low 8 bits of four 32 bit elements.
Definition SSE.h:3964
static void shiftChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3616
static __m128i moveHighBits16_8(const __m128i &value)
Moves the higher 8 bits of eight 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3944
static __m128i removeHighBits16_8_7_upper(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the lower two bytes to zero.
Definition SSE.h:3913
static void deInterleave3Channel8Bit45Elements(const uint8_t *interleaved, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 45 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3402
static unsigned int value_u32(const __m128i &value)
Returns one specific 32 bit unsigned integer value of a m128i value object.
Definition SSE.h:1348
static OCEAN_FORCE_INLINE void interleave3Channel8Bit48Elements(const __m128i &channel0, const __m128i &channel1, const __m128i &channel2, __m128i &interleavedA, __m128i &interleavedB, __m128i &interleavedC)
Interleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3409
static __m128i load_u8_15_upper_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3765
static __m128i shuffleNeighbor2Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3984
static void prefetchT1(const void *const data)
Prefetches a block of temporal memory in all cache levels except 0th cache level.
Definition SSE.h:1297
static __m128i sum1Channel8Bit16Elements(const __m128i &elements)
Sums 16 elements with 8 bit per element.
Definition SSE.h:3644
static __m128i shuffleNeighbor4Low64BitsToLow16_8(const __m128i &value)
Shuffles pairs of four neighbors of the low 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3969
static void average8Elements2Channel64Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 2 channel 64 bit frames.
Definition SSE.h:2707
static __m128i addOffsetBeforeRightShiftDivisionSigned16Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative int16_t value, so that each value can be right shifted to allow a ...
Definition SSE.h:3084
static __m128i load_u8_15_lower_random(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3840
static __m128i removeHighBits16_8_7_lower(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements and sets the upper two bytes to zero.
Definition SSE.h:3908
static void average8Elements4Channel128Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 4 channel 128 bit frames.
Definition SSE.h:2905
static __m128i load_u8_10_upper_zero(const uint8_t *const buffer)
Loads 10 bytes from memory, which holds either at least 16 bytes or exactly 10 bytes,...
Definition SSE.h:3730
static __m128i sumAbsoluteDifferences8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for 16 elements of a 16 elements buffer with 8 bit precision.
Definition SSE.h:1580
static __m128i moveHighBits32_16(const __m128i &value)
Moves the higher 16 bits of four 32 bit elements to the lower 16 bits and fills the high bits with 0.
Definition SSE.h:3938
static void average16Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2933
static __m128i moveHighBits16_8_7(const __m128i &value)
Moves the higher 8 bits of seven 16 bit elements to the lower 8 bits and fills the high bits with 0.
Definition SSE.h:3959
static __m128i roundedDivideByRightShiftSigned16Bit(const __m128i &value_s16x8, const unsigned int rightShifts)
Applies a rounded division by a right shift for eight int16_t values.
Definition SSE.h:3108
static __m128i bitMaskRemoveHigh32_16()
Returns the following 128 bit mask: 0x0000FFFF-0000FFFF-0000FFFF-0000FFFF.
Definition SSE.h:3999
static __m128i sumSquareDifference8Bit16ElementsAligned16(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1587
static __m128i removeHighBits32_16(const __m128i &value)
Removes the higher 16 bits of four 32 bit elements.
Definition SSE.h:3893
static __m128i shuffleNeighbor2High64BitsToLow16_8(const __m128i &value)
Shuffles pairs of two neighbors of the high 64 bits to the low 8 bits of eight 16 bit elements.
Definition SSE.h:3989
static void average6Elements3Channel96Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 6 elements of 2x2 blocks for 3 channel 96 bit frames.
Definition SSE.h:2808
static __m128i interpolation4Channel32Bit2x4Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 2x4 elements (two separated blocks of 4 elements) of 2x2 blocks for 4 channel 32 bit fra...
Definition SSE.h:2301
static __m128i interpolation3Channel24Bit12Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 12 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:2114
static __m128i addOffsetBeforeRightShiftDivisionSigned32Bit(const __m128i &value, const unsigned int rightShifts)
Adds 2^shifts - 1 to each negative signed 32 bit value, so that each value can be right shifted to al...
Definition SSE.h:3153
static __m128i interpolation4Channel32Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2154
static void average8Elements1Channel32Bit2x2(const float *const image0, const float *const image1, float *const result)
Averages 8 elements of 2x2 blocks for 1 channel 32 bit frames.
Definition SSE.h:2447
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo4Channels32ElementsWithConstantLastChannel(const __m128i &singleChannel_u_8x8, const uint8_t lastChannelValue, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 32 interleaved 4-channel elements (8 elements -> 8×4 = 32 b...
Definition SSE.h:3452
static void shiftChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3630
static void average8Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2481
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit24Elements(const __m128i &interleavedA, const __m128i &interleavedB, __m128i &channel01, __m128i &channel2)
Deinterleaves 24 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3353
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1292
static __m128i interpolation1Channel8Bit15Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_fxfy_, const __m128i &fx_fyfxfy)
Interpolates 15 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2062
static uint16_t value_u16(const __m128i &value)
Returns one specific 16 bit unsigned integer value of a m128i value object.
Definition SSE.h:1336
static OCEAN_FORCE_INLINE void reverseChannelOrder3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2, __m128i &reversedInterleaved0, __m128i &reversedInterleaved1, __m128i &reversedInterleaved2)
Reverses the order of the first and last channel of 48 elements of an image with 3 interleaved channe...
Definition SSE.h:3491
static __m128i sumSquareDifferences8BitBack11Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square differences determination for the last 11 elements of a 16 elements buffer with 8 bit pre...
Definition SSE.h:1404
static __m128i removeLowBits32_16(const __m128i &value)
Removes the lower 16 bits of four 32 bit elements.
Definition SSE.h:3898
static __m128i interpolation2Channel16Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:1770
static uint8_t value_u8(const __m128i &value)
Returns one specific 8 bit unsigned integer value of a m128i value object.
Definition SSE.h:1313
static void gradientHorizontalVertical8Elements3Products1Channel8Bit(const uint8_t *source, int16_t *response, const unsigned int width)
Determines the squared horizontal and vertical gradients and the product of both gradients for 16 fol...
Definition SSE.h:3233
static __m128i bitMaskRemoveHigh16_8()
Returns the following 128 bit mask: 0x00FF00FF-00FF00FF-00FF00FF-00FF00FF.
Definition SSE.h:3994
static __m128i removeHighBits16_8(const __m128i &value)
Removes the higher 8 bits of eight 16 bit elements.
Definition SSE.h:3903
static __m128i sum1Channel8BitBack15Elements(const uint8_t *elements)
Sums the last 15 elements of a 16 elements buffer with 8 bit per element, the beginning 1 element is ...
Definition SSE.h:3666
static OCEAN_FORCE_INLINE void store1Channel8Bit8ElementsTo3Channels24Elements(const __m128i &singleChannel_u_8x8, uint8_t *interleaved)
Stores 8 single-channel 8-bit elements as 24 interleaved 3-channel elements (8 elements -> 8×3 = 24 b...
Definition SSE.h:3436
static __m128i load_u8_15_lower_zero(const uint8_t *const buffer)
Loads 15 bytes from memory, which holds either at least 16 bytes or exactly 15 bytes,...
Definition SSE.h:3806
static OCEAN_FORCE_INLINE void deInterleave3Channel8Bit48Elements(const __m128i &interleavedA, const __m128i &interleavedB, const __m128i &interleavedC, __m128i &channel0, __m128i &channel1, __m128i &channel2)
Deinterleaves 48 elements of e.g., an image with 3 channels and 8 bit per element.
Definition SSE.h:3368
static __m128i sumSquareDifference8Bit16Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for 16 elements with 8 bit precision.
Definition SSE.h:1570
static __m128i sumInterleave3Channel8Bit48Elements(const __m128i &interleaved0, const __m128i &interleaved1, const __m128i &interleaved2)
Sums 16 elements individually for an interleaved pixel format with 3 channels and 8 bit per channel a...
Definition SSE.h:3672
static void average32Elements4Channel32Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 4 channel 32 bit frames.
Definition SSE.h:2957
static void average30Elements1Channel8Bit3x3(const uint8_t *const image0, const uint8_t *const image1, const uint8_t *const image2, uint8_t *const result)
Averages 30 elements of 3x3 blocks for 1 channel 8 bit frames.
Definition SSE.h:3004
static __m128i sumSquareDifference8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1528
static __m128i sum1Channel8BitFront15Elements(const uint8_t *elements)
Sums the first 15 elements of a buffer with 8 bit per element.
Definition SSE.h:3660
static void average32ElementsBinary1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result, const uint16_t threshold=776u)
Averages 32 elements of 2x2 blocks for 1 binary (0x00 or 0xFF) frames.
Definition SSE.h:2650
static void average32Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 32 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2584
static __m128i sumSquareDifference8BitBack12Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum square difference determination for the last 12 elements of a 16 elements buffer with 8 bit prec...
Definition SSE.h:1445
static OCEAN_FORCE_INLINE float sum_f32_4(const __m128 &value)
Adds the four (all four) individual 32 bit floats of a m128 value and returns the result.
Definition SSE.h:1386
static __m128i load_u8_13_lower_random(const uint8_t *const buffer)
Loads 13 bytes from memory, which holds either at least 16 bytes or exactly 13 bytes,...
Definition SSE.h:3786
static void swapReversedChannelOrder3Channel8Bit48Elements(uint8_t *first, uint8_t *second)
Reverses the order of the first and last channel of two sets of 48 elements of an image with 3 interl...
Definition SSE.h:3545
static OCEAN_FORCE_INLINE unsigned int sum_u32_4(const __m128i &value)
Adds the four (all four) individual 32 bit unsigned integer values of a m128i value and returns the r...
Definition SSE.h:1359
static void prefetchNTA(const void *const data)
Prefetches a block of non-temporal memory into non-temporal cache structure.
Definition SSE.h:1307
static __m128i moveLowBits16_8ToLow64(const __m128i &value)
Moves the lower 8 bits of eight 16 bit elements to the lower 64 bits and fills the high 64 bits with ...
Definition SSE.h:3918
static __m128i sumAbsoluteDifferences8BitFront10Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 10 elements of a buffer with 8 bit precision.
Definition SSE.h:1555
static unsigned int interpolation2Channel16Bit1x1(const uint8_t *const pixel, const unsigned int size, const unsigned int fx_y_, const unsigned int fxy_, const unsigned int fx_y, const unsigned int fxy)
Returns the interpolated pixel values for one 2 channel 16 bit pixel.
Definition SSE.h:4023
static void shiftAndMirrorChannelToBack4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of a 4 channel 32 bit pixels to the back and moves the back channel to the front ...
Definition SSE.h:3637
static __m128i load128iLower64(const void *const buffer)
Loads the lower 64 bit of a 128i value from the memory.
Definition SSE.h:3717
static unsigned int ssd2Channel16Bit1x1(const uint8_t *const pixel0, const uint8_t *const pixel1, const unsigned int size0, const unsigned int size1, const unsigned int f1x_y_, const unsigned int f1xy_, const unsigned int f1x_y, const unsigned int f1xy)
Returns the interpolated sum of square difference for one 2 channel 16 bit pixel.
Definition SSE.h:4031
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3874
static OCEAN_FORCE_INLINE void reverseChannelOrder4Channel8Bit64Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (64 elements) of an image with 4 interleaved channels...
Definition SSE.h:3516
static __m128i addOffsetBeforeRightShiftDivisionByTwoSigned16Bit(const __m128i &value)
Adds 1 to each signed 16 bit value which is both negative and odd, so that each value can be right s...
Definition SSE.h:3065
static void average8Elements2Channel16Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 8 elements of 2x2 blocks for 2 channel 16 bit frames.
Definition SSE.h:2682
static void shiftAndMirrorChannelToFront4Channel32Bit(const uint8_t *elements, uint8_t *shiftedElements)
Shifts the channels of 4 channel 32 bit pixels to the front and moves the front channel to the back...
Definition SSE.h:3623
static __m128i sumAbsoluteDifferences8BitFront15Elements(const uint8_t *const image0, const uint8_t *const image1)
Sum absolute differences determination for the first 15 elements of a buffer with 8 bit precision.
Definition SSE.h:1563
static void average16Elements1Channel8Bit2x2(const uint8_t *const image0, const uint8_t *const image1, uint8_t *const result)
Averages 16 elements of 2x2 blocks for 1 channel 8 bit frames.
Definition SSE.h:2527
static OCEAN_FORCE_INLINE void multiplyInt8x16ToInt32x8(const __m128i &values0, const __m128i &values1, __m128i &products0, __m128i &products1)
Multiplies 8 int16_t values with 8 int16_t values and returns the products as 8 int32_t results.
Definition SSE.h:4004
static OCEAN_FORCE_INLINE void reverseChannelOrder2Channel8Bit32Elements(const uint8_t *interleaved, uint8_t *reversedInterleaved)
Reverses the order of the channels of 16 pixels (32 elements) of an image with 2 interleaved channels...
Definition SSE.h:3476
static __m128i interpolation3Channel24Bit8Elements(const __m128i &values0, const __m128i &values1, const __m128i &fx_fy_, const __m128i &fxfy_, const __m128i &fx_fy, const __m128i &fxfy)
Interpolates 8 elements of 2x2 blocks for 3 channel 24 bit frames.
Definition SSE.h:1916
This class provides basic numeric functionalities.
Definition Numeric.h:57
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1159
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
This union defines a wrapper for the __m128 SSE intrinsic data type.
Definition SSE.h:70
float m128_f32[4]
The four 32 bit elements.
Definition SSE.h:72
This union defines a wrapper for the __m128d SSE intrinsic data type.
Definition SSE.h:81
double m128d_f64[2]
The two 64 bit elements.
Definition SSE.h:83
This union defines a wrapper for the __m128i SSE intrinsic data type.
Definition SSE.h:50
uint64_t m128i_u64[2]
The two 64 bit elements.
Definition SSE.h:52
uint16_t m128i_u16[8]
The eight 16 bit elements.
Definition SSE.h:58
uint32_t m128i_u32[4]
The four 32 bit elements.
Definition SSE.h:55
uint8_t m128i_u8[16]
The sixteen 8 bit elements.
Definition SSE.h:61