Ocean
AdvancedFrameInterpolatorBilinearSSE.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
9 #define META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
10 
12 
13 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
14 
15 #include "ocean/cv/SSE.h"
16 
17 #include "ocean/math/Vector2.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
25 namespace Advanced
26 {
27 
28 /**
29  * This class implements advanced bilinear frame interpolation functions using SSE extensions.
30  * @ingroup cvadvanced
31  */
32 class OCEAN_CV_ADVANCED_EXPORT AdvancedFrameInterpolatorBilinearSSE
33 {
34  protected:
35 
36  /**
37  * This helper class allows specializing the interpolation functions for individual channel numbers.
38  * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
39  */
40  template <unsigned int tChannels>
41  class SpecializedForChannels
42  {
43  public:
44 
45  /**
46  * Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
47  * The center of a pixel is expected to be located at the top-left corner of a pixel; each output pixel is blended from a 2x2 pixel neighborhood, so the (tPatchSize + 1) x (tPatchSize + 1) pixel region starting at imageTopLeft must be readable.
48  * @param imageTopLeft The pointer to the top-left pixel of the patch region inside the image, must be valid
49  * @param imageStrideElements The number of elements between two consecutive image rows (including padding), in elements, with range [tChannels * tPatchSize, infinity)
50  * @param buffer The target buffer with `tChannels * tPatchSize * tPatchSize` elements, must be valid
51  * @param factorRight The interpolation factor for the right pixels, with range [0, 128], 128 corresponds to a weight of 1
52  * @param factorBottom The interpolation factor for the bottom pixels, with range [0, 128], 128 corresponds to a weight of 1
53  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd; the 1-channel specialization additionally asserts tPatchSize >= 5
54  */
55  template <unsigned int tPatchSize>
56  static inline void interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom);
57  };
58 
59  public:
60 
61  /**
62  * Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
63  * The pixel center convention is selected via tPixelCenter; for any value other than 'PC_TOP_LEFT' the given position is shifted by (-0.5, -0.5) before the interpolation.
64  * @param image The image in which the interpolated patch is located, must be valid
65  * @param width The width of the image, in pixel, with range [tPatchSize + 1, infinity)
66  * @param imagePaddingElements The number of padding elements at the end of each image row, in elements, with range [0, infinity)
67  * @param buffer The target buffer with `tChannels * tPatchSize * tPatchSize` elements, must be valid
68  * @param position The center position of the square region in the image, with range [tPatchSize/2, width - tPatchSize/2 - 1)x[tPatchSize/2, height - tPatchSize/2 - 1); the image height is not a parameter, so the vertical range is the caller's responsibility
69  * @tparam tChannels The number of frame channels, with range [1, infinity)
70  * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
71  * @tparam tPixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER'
72  * @tparam TScalar The scalar data type of the sub-pixel position
73  */
74  template <unsigned int tChannels, unsigned int tPatchSize, PixelCenter tPixelCenter = PC_TOP_LEFT, typename TScalar = Scalar>
75  static inline void interpolateSquarePatch8BitPerChannel(const uint8_t* const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t* buffer, const VectorT2<TScalar>& position);
76 };
77 
78 template <>
79 template <unsigned int tPatchSize>
80 inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<1u>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
81 {
82  ocean_assert(tPatchSize >= 5u);
83 
84  ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
85  ocean_assert(imageStrideElements >= 1u * tPatchSize);
86 
87  ocean_assert(factorRight <= 128u && factorBottom <= 128u);
88 
89  const unsigned int factorLeft = 128u - factorRight;
90  const unsigned int factorTop = 128u - factorBottom;
91 
92  constexpr unsigned int blocks15 = tPatchSize / 15u;
93  constexpr unsigned int remainingAfterBlocks15 = tPatchSize % 15u;
94 
95  constexpr bool partialBlock15 = remainingAfterBlocks15 > 10u;
96  constexpr unsigned int remainingAfterPartialBlock15 = partialBlock15 ? 0u : remainingAfterBlocks15;
97 
98  constexpr bool block7 = remainingAfterPartialBlock15 >= 7u;
99  constexpr unsigned int remainingAfterBlock7 = remainingAfterPartialBlock15 % 7u;
100 
101  constexpr bool partialBlock7 = remainingAfterBlock7 >= 3u;
102  constexpr unsigned int remainingAfterPartialBlock7 = partialBlock7 ? 0u : remainingAfterBlock7;
103 
104  constexpr unsigned int blocks1 = remainingAfterPartialBlock7;
105 
106  const unsigned int factorTopLeft = factorTop * factorLeft;
107  const unsigned int factorTopRight = factorTop * factorRight;
108 
109  // TL 0 TR 0 TL 0 TR 0 TL 0 TR 0 TL 0 TR 0
110  const __m128i factorsTop_u_16x8 = _mm_set1_epi32(int(factorTopLeft) | int(factorTopRight) << 16);
111 
112  const unsigned int factorBottomLeft = factorBottom * factorLeft;
113  const unsigned int factorBottomRight = factorBottom * factorRight;
114 
115  // BL 0 BR 0 BL 0 BR 0 BL 0 BR 0 BL 0 BR 0
116  const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(int(factorBottomLeft) | int(factorBottomRight) << 16);
117 
118  for (unsigned int y = 0u; y < tPatchSize; ++y)
119  {
120  SSE::prefetchT0(imageTopLeft + imageStrideElements * 2u);
121  SSE::prefetchT0(imageTopLeft + imageStrideElements * 3u);
122 
123  for (unsigned int x = 0u; x < blocks15; ++x)
124  {
125  const __m128i top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
126  const __m128i bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
127 
128 
129  // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
130  const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
131  const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
132 
133  // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
134  const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
135  const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
136 
137 
138  // A B C D E F G H I J K L M N O P -> I 0 J 0 K 0 L 0 M 0 N 0 O 0 P 0
139  const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
140  const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
141 
142  // A B C D E F G H I J K L M N O P -> J 0 K 0 L 0 M 0 N 0 O 0 P 0 0 0
143  const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
144  const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
145 
146 
147  // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
148  const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
149  const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
150 
151  const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
152  const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
153 
154 
155  const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
156  const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
157 
158  const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
159  const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
160 
161 
162  const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
163  const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
164 
165  const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
166 
167 
168  const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
169  const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
170 
171  const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
172 
173 
174  // A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
175  // I 0 J 0 K 0 L 0 M 0 N 0 O 0 0 0 -> A B C D E F G H I J K L M N O 0
176  const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
177 
178 
179  const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks15) && (!block7 && !partialBlock7 && blocks1 == 0u);
180 
181  if (isLastBlock)
182  {
183  memcpy(buffer, &result_u_8x16, 15);
184  }
185  else
186  {
187  _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
188  }
189 
190  imageTopLeft += 15;
191  buffer += 15;
192  }
193 
194  if constexpr (partialBlock15)
195  {
196  ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
197 
198  __m128i top_u_8x16;
199  __m128i bottom_u_8x16;
200 
201  if (y < tPatchSize - 1u)
202  {
203  top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
204  bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
205  }
206  else
207  {
208  memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlocks15 + 1u);
209  memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlocks15 + 1u);
210  }
211 
212 
213  // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
214  const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
215  const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
216 
217  // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
218  const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
219  const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
220 
221 
222  // A B C D E F G H I J K L M N O P -> I 0 J 0 K 0 L 0 M 0 N 0 O 0 P 0
223  const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
224  const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
225 
226  // A B C D E F G H I J K L M N O P -> J 0 K 0 L 0 M 0 N 0 O 0 P 0 0 0
227  const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
228  const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
229 
230 
231  // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
232  const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
233  const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
234 
235  const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
236  const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
237 
238 
239  const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
240  const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
241 
242  const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
243  const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
244 
245 
246  const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
247  const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
248 
249  const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
250 
251 
252  const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
253  const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
254 
255  const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
256 
257 
258  // A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
259  // I 0 J 0 K 0 L 0 M 0 N 0 O 0 0 0 -> A B C D E F G H I J K L M N O 0
260  const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
261 
262  ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
263  const bool isLastBlock = y + 1u == tPatchSize;
264 
265  if (isLastBlock)
266  {
267  memcpy(buffer, &result_u_8x16, remainingAfterBlocks15);
268  }
269  else
270  {
271  _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
272  }
273 
274  imageTopLeft += remainingAfterBlocks15;
275  buffer += remainingAfterBlocks15;
276  }
277 
278  if constexpr (block7)
279  {
280  const __m128i top_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft));
281  const __m128i bottom_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft + imageStrideElements));
282 
283 
284  // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
285  const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
286  const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
287 
288  // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
289  const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
290  const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
291 
292 
293  // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
294  const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
295  const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
296 
297  const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
298  const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
299 
300 
301  const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
302  const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
303 
304  const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
305 
306  // A 0 B 0 C 0 D 0 E 0 F 0 H 0 H 0 -> A B C D E F G H X X X X X X X X
307  const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
308 
309  const bool isLastBlock = (y + 1u == tPatchSize) && (!partialBlock7 && blocks1 == 0u);
310 
311  if (isLastBlock)
312  {
313  memcpy(buffer, &result_u_8x16, 7);
314  }
315  else
316  {
317  _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
318  }
319 
320  imageTopLeft += 7;
321  buffer += 7;
322  }
323 
324  if constexpr (partialBlock7)
325  {
326  ocean_assert(blocks1 == 0u);
327 
328  __m128i top_u_8x16;
329  __m128i bottom_u_8x16;
330 
331  if (y < tPatchSize - 1u)
332  {
333  top_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft));
334  bottom_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft + imageStrideElements));
335  }
336  else
337  {
338  memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlock7 + 1u);
339  memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlock7 + 1u);
340  }
341 
342 
343  // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
344  const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
345  const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
346 
347  // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
348  const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
349  const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
350 
351 
352  // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
353  const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
354  const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
355 
356  const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
357  const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
358 
359 
360  const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
361  const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
362 
363  const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
364 
365  // A 0 B 0 C 0 D 0 E 0 F 0 H 0 H 0 -> A B C D E F G H X X X X X X X X
366  const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
367 
368  ocean_assert(blocks1 == 0u);
369  const bool isLastBlock = y + 1u == tPatchSize;
370 
371  if (isLastBlock)
372  {
373  memcpy(buffer, &result_u_8x16, remainingAfterBlock7);
374  }
375  else
376  {
377  _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
378  }
379 
380  imageTopLeft += remainingAfterBlock7;
381  buffer += remainingAfterBlock7;
382  }
383 
384  if constexpr (blocks1 != 0u)
385  {
386  const uint8_t* const imageBottomLeft = imageTopLeft + imageStrideElements;
387 
388  for (unsigned int n = 0u; n < blocks1; ++n)
389  {
390  buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[1u + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[1u + n] * factorBottomRight + 8192u) / 16384u);
391  }
392 
393  imageTopLeft += blocks1;
394  buffer += blocks1;
395  }
396 
397  imageTopLeft += imageStrideElements - tPatchSize;
398  }
399 }
400 
401 template <>
402 template <unsigned int tPatchSize>
403 inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<3u>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
404 {
405  ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
406  ocean_assert(imageStrideElements >= 1u * tPatchSize);
407 
408  ocean_assert(factorRight <= 128u && factorBottom <= 128u);
409 
410  const unsigned int factorLeft = 128u - factorRight;
411  const unsigned int factorTop = 128u - factorBottom;
412 
413  constexpr unsigned int blocks4 = tPatchSize / 4u;
414  constexpr unsigned int remainingAfterBlocks4 = tPatchSize % 4u;
415 
416  constexpr bool partialBlock4 = remainingAfterBlocks4 >= 2u;
417  constexpr unsigned int remainingAfterPartialBlock4 = partialBlock4 ? 0u : remainingAfterBlocks4;
418 
419  constexpr unsigned int blocks1 = remainingAfterPartialBlock4;
420 
421  const unsigned int factorTopLeft = factorTop * factorLeft;
422  const unsigned int factorTopRight = factorTop * factorRight;
423 
424  // TL 0 TR 0 TL 0 TR 0 TL 0 TR 0 TL 0 TR 0
425  const __m128i factorsTop_u_16x8 = _mm_set1_epi32(int(factorTopLeft) | int(factorTopRight) << 16);
426 
427  const unsigned int factorBottomLeft = factorBottom * factorLeft;
428  const unsigned int factorBottomRight = factorBottom * factorRight;
429 
430  // BL 0 BR 0 BL 0 BR 0 BL 0 BR 0 BL 0 BR 0
431  const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(int(factorBottomLeft) | int(factorBottomRight) << 16);
432 
433  for (unsigned int y = 0u; y < tPatchSize; ++y)
434  {
435  SSE::prefetchT0(imageTopLeft + imageStrideElements * 2u);
436  SSE::prefetchT0(imageTopLeft + imageStrideElements * 3u);
437 
438  for (unsigned int x = 0u; x < blocks4; ++x)
439  {
440  const bool canReadLastElements = y < tPatchSize - 1u || x < blocks4 - 1u;
441 
442  __m128i top_u_8x16;
443  __m128i bottom_u_8x16;
444 
445  if (canReadLastElements)
446  {
447  top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
448  bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
449  }
450  else
451  {
452  top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft - 1)), 1);
453  bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements - 1u)), 1);
454  }
455 
456 
457  // de-interleaving
458 
459  // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 0
460  // -> R0 0 R1 0 R1 0 R2 0 R2 0 R3 0 R3 0 R4 0
461  const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
462  const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
463 
464  // -> G0 0 G1 0 G1 0 G2 0 G2 0 G3 0 G3 0 G4 0
465  const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
466  const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
467 
468  // -> B0 0 B1 0 B1 0 B2 0 B2 0 B3 0 B3 0 B4 0
469  const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
470  const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
471 
472 
473  // R0 * TL + R1 * TR, R1 * TL + R2 * TR, R2 * TL + R3 * TR, R3 * TL + R4 * TR
474  const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
475  const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
476  const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
477 
478  const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
479  const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
480  const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
481 
482 
483  const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
484  const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
485  const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
486 
487 
488  // interleaving
489 
490  const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
491  const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
492  const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
493 
494 
495  const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
496 
497  const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks4) && (!partialBlock4 && blocks1 <= 1u);
498 
499  if (isLastBlock)
500  {
501  uint8_t tempBuffer[16];
502  _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
503 
504  memcpy(buffer, tempBuffer, 12);
505  }
506  else
507  {
508  _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
509  }
510 
511  imageTopLeft += 12;
512  buffer += 12;
513  }
514 
515  if (partialBlock4)
516  {
517  const bool canReadLastElements = y < tPatchSize - 1u;
518 
519  __m128i top_u_8x16;
520  __m128i bottom_u_8x16;
521 
522  if (canReadLastElements)
523  {
524  top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
525  bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
526  }
527  else
528  {
529  constexpr unsigned int overlappingElements = 16u - (remainingAfterBlocks4 * 3u + 3u);
530  ocean_assert(overlappingElements < 16u);
531 
532  top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft - overlappingElements)), overlappingElements);
533  bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements - overlappingElements)), overlappingElements);
534  }
535 
536 
537  // de-interleaving
538 
539  // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 0
540  // -> R0 0 R1 0 R1 0 R2 0 R2 0 R3 0 R3 0 R4 0
541  const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
542  const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
543 
544  // -> G0 0 G1 0 G1 0 G2 0 G2 0 G3 0 G3 0 G4 0
545  const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
546  const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
547 
548  // -> B0 0 B1 0 B1 0 B2 0 B2 0 B3 0 B3 0 B4 0
549  const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
550  const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
551 
552 
553  // R0 * TL + R1 * TR, R1 * TL + R2 * TR, R2 * TL + R3 * TR, R3 * TL + R4 * TR
554  const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
555  const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
556  const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
557 
558  const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
559  const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
560  const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
561 
562 
563  const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
564  const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
565  const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
566 
567 
568  // interleaving
569 
570  const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
571  const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
572  const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
573 
574 
575  const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
576 
577  ocean_assert(blocks1 == 0u);
578 
579  const bool isLastBlock = y + 1u == tPatchSize;
580 
581  if (isLastBlock)
582  {
583  uint8_t tempBuffer[16];
584  _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
585 
586  memcpy(buffer, tempBuffer, remainingAfterBlocks4 * 3u);
587  }
588  else
589  {
590  _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
591  }
592 
593  imageTopLeft += remainingAfterBlocks4 * 3u;
594  buffer += remainingAfterBlocks4 * 3u;
595  }
596 
597  if constexpr (blocks1 != 0u)
598  {
599  const uint8_t* const imageBottomLeft = imageTopLeft + imageStrideElements;
600 
601  for (unsigned int n = 0u; n < blocks1; ++n)
602  {
603  for (unsigned int c = 0u; c < 3u; ++c)
604  {
605  buffer[n * 3u + c] = uint8_t((imageTopLeft[n * 3u + c] * factorTopLeft + imageTopLeft[n * 3u + 3u + c] * factorTopRight + imageBottomLeft[n * 3u + c] * factorBottomLeft + imageBottomLeft[n * 3u + 3u + c] * factorBottomRight + 8192u) / 16384u);
606  }
607  }
608 
609  imageTopLeft += blocks1 * 3u;
610  buffer += blocks1 * 3u;
611  }
612 
613  imageTopLeft += imageStrideElements - tPatchSize * 3u;
614  }
615 }
616 
/**
 * Generic (non-SIMD) implementation of interpolateSquarePatch8BitPerChannel() for an arbitrary number of channels.
 * Every output element is the rounded bilinear blend of the same channel in a 2x2 pixel neighborhood:
 * (topLeft * fTL + topRight * fTR + bottomLeft * fBL + bottomRight * fBR + 8192) / 16384,
 * with the four factors being products of the 7 bit horizontal and vertical weights (summing up to 16384).
 * @param imageTopLeft The pointer to the top-left pixel of the patch region inside the image, must be valid
 * @param imageStrideElements The number of elements between two consecutive image rows, with range [tChannels * tPatchSize, infinity)
 * @param buffer The target buffer receiving `tChannels * tPatchSize * tPatchSize` elements, must be valid
 * @param factorRight The interpolation factor for the right pixels, with range [0, 128]
 * @param factorBottom The interpolation factor for the bottom pixels, with range [0, 128]
 * @tparam tChannels The number of interleaved frame channels
 * @tparam tPatchSize The edge length of the square patch in pixel
 */
617 template <unsigned int tChannels>
618 template <unsigned int tPatchSize>
619 inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<tChannels>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
620 {
621  ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
 // one patch row covers tChannels * tPatchSize elements
622  ocean_assert(imageStrideElements >= tChannels * tPatchSize);
623 
624  ocean_assert(factorRight <= 128u && factorBottom <= 128u);
625 
 // the horizontal and the vertical weights each sum up to 128
626  const unsigned int factorLeft = 128u - factorRight;
627  const unsigned int factorTop = 128u - factorBottom;
628 
629  const unsigned int factorTopLeft = factorTop * factorLeft;
630  const unsigned int factorTopRight = factorTop * factorRight;
631 
632  const unsigned int factorBottomLeft = factorBottom * factorLeft;
633  const unsigned int factorBottomRight = factorBottom * factorRight;
634 
 // the lower row of the 2x2 neighborhood is located one stride below the upper row
635  const uint8_t* imageBottomLeft = imageTopLeft + imageStrideElements;
636 
637  for (unsigned int y = 0u; y < tPatchSize; ++y)
638  {
639  for (unsigned int x = 0u; x < tPatchSize; ++x)
640  {
641  for (unsigned int n = 0u; n < tChannels; ++n)
642  {
 // the right neighbor of element n is located tChannels elements further; + 8192 rounds the division by 16384
643  buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[tChannels + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[tChannels + n] * factorBottomRight + 8192u) / 16384u);
644  }
645 
646  imageTopLeft += tChannels;
647  imageBottomLeft += tChannels;
648 
649  buffer += tChannels;
650  }
651 
 // both row pointers have been advanced by one patch row, now moving to the start of the next patch row
652  imageTopLeft += imageStrideElements - tChannels * tPatchSize;
653  imageBottomLeft += imageStrideElements - tChannels * tPatchSize;
654  }
655 }
656 
/**
 * Public entry point: converts the sub-pixel center position into the integer top-left corner of the patch
 * and two 7 bit interpolation factors, then forwards to the channel-specific implementation.
 * @param image The image in which the interpolated patch is located, must be valid
 * @param width The width of the image, in pixel, with range [tPatchSize + 1, infinity)
 * @param imagePaddingElements The number of padding elements at the end of each image row, in elements
 * @param buffer The target buffer receiving `tChannels * tPatchSize * tPatchSize` elements, must be valid
 * @param position The center position of the square patch; the image height is not known here, so only the horizontal range is asserted
 */
657 template <unsigned int tChannels, unsigned int tPatchSize, PixelCenter tPixelCenter, typename TScalar>
658 inline void AdvancedFrameInterpolatorBilinearSSE::interpolateSquarePatch8BitPerChannel(const uint8_t* const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t* buffer, const VectorT2<TScalar>& position)
659 {
660  static_assert(tChannels >= 1u, "Invalid channel number!");
661  static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
662 
663  ocean_assert(image != nullptr && buffer != nullptr);
664  ocean_assert(tPatchSize + 1u <= width);
665 
 // number of pixels between the patch center and the patch border
666  constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
667 
668  const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
669 
 // the interpolation below expects the top-left pixel center convention, any other convention is converted by shifting the position by half a pixel
670  const VectorT2<TScalar> shiftedPosition = tPixelCenter == PC_TOP_LEFT ? position : position - VectorT2<TScalar>(TScalar(0.5), TScalar(0.5));
671 
672  ocean_assert(shiftedPosition.x() >= TScalar(tPatchSize_2) && shiftedPosition.y() >= TScalar(tPatchSize_2));
673  ocean_assert(shiftedPosition.x() < TScalar(width - tPatchSize_2 - 1u));
674 
 // integer pixel location of the patch's top-left corner
675  const unsigned int left = (unsigned int)(shiftedPosition.x()) - tPatchSize_2;
676  const unsigned int top = (unsigned int)(shiftedPosition.y()) - tPatchSize_2;
677 
 // one additional pixel column to the right of the patch is read during the interpolation
678  ocean_assert(left + tPatchSize < width);
679 
 // the fractional part of the position, quantized (with rounding) to the range [0, 128]
680  const TScalar tx = shiftedPosition.x() - TScalar(int(shiftedPosition.x()));
681  ocean_assert(tx >= TScalar(0) && tx <= TScalar(1));
682  const unsigned int factorRight = (unsigned int)(tx * TScalar(128) + TScalar(0.5));
683 
684  const TScalar ty = shiftedPosition.y() - TScalar(int(shiftedPosition.y()));
685  ocean_assert(ty >= TScalar(0) && ty <= TScalar(1));
686  const unsigned int factorBottom = (unsigned int)(ty * TScalar(128) + TScalar(0.5));
687 
688  const uint8_t* const imageTopLeft = image + top * imageStrideElements + left * tChannels;
689 
 // dispatching to the (possibly SIMD-specialized) implementation for the given channel number
690  SpecializedForChannels<tChannels>::template interpolateSquarePatch8BitPerChannel<tPatchSize>(imageTopLeft, imageStrideElements, buffer, factorRight, factorBottom);
691 }
692 
693 }
694 
695 }
696 
697 }
698 
699 #endif // OCEAN_HARDWARE_SSE_VERSION >= 41
700 
701 #endif // META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
This class allows to specialize functions for individual channels.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:42
static void interpolateSquarePatch8BitPerChannel(const uint8_t *imageTopLeft, const unsigned int imageStrideElements, uint8_t *buffer, const unsigned int factorRight, const unsigned int factorBottom)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:619
This class implements advanced bilinear frame interpolation functions using SSE extensions.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:33
static void interpolateSquarePatch8BitPerChannel(const uint8_t *const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t *buffer, const VectorT2< TScalar > &position)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
Definition: AdvancedFrameInterpolatorBilinearSSE.h:658
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition: SSE.h:1255
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition: SSE.h:3770
This class implements a vector with two elements.
Definition: Vector2.h:96
const T & x() const noexcept
Returns the x value.
Definition: Vector2.h:698
const T & y() const noexcept
Returns the y value.
Definition: Vector2.h:710
@ PC_TOP_LEFT
The center of a pixel is in the upper-left corner of each pixel's square.
Definition: CV.h:133
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15