Ocean
Loading...
Searching...
No Matches
AdvancedFrameInterpolatorBilinearSSE.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
9#define META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
10
12
13#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
14
15#include "ocean/cv/SSE.h"
16
17#include "ocean/math/Vector2.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
25namespace Advanced
26{
27
28/**
29 * This class implements advanced bilinear frame interpolation functions using SSE extensions.
30 * @ingroup cvadvanced
31 */
32class OCEAN_CV_ADVANCED_EXPORT AdvancedFrameInterpolatorBilinearSSE
33{
34 protected:
35
36 /**
37 * This class allows to specialize functions for individual channels.
38 * @tparam tChannels Specifies the number of channels for the given frames, with range [1, infinity)
39 */
40 template <unsigned int tChannels>
42 {
43 public:
44
45 /**
46 * Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
47 * The center of a pixel is expected to be located at the top-left corner of a pixel.
48 * @param imageTopLeft The pointer to the top-left position of the image, must be valid
49 * @param imageStrideElements The number of elements between two consecutive image rows (including padding), in elements, with range [tChannels * tPatchSize, infinity)
50 * @param buffer The target buffer with `tChannels * tSize * tSize` elements, must be valid
51 * @param factorRight The interpolation factor for the right pixels, with range [0, 128]
52 * @param factorBottom The interpolation factor for the bottom pixels, with range [0, 128]
53 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [1, infinity), must be odd
54 */
55 template <unsigned int tPatchSize>
56 static inline void interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom);
57 };
58
59 public:
60
61 /**
62 * Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and stores the interpolated data into a buffer.
63 * The center of a pixel is expected to be located at the top-left corner of a pixel.
64 * @param image The image in which the interpolated patch is located, must be valid
65 * @param width The width of the image, in pixel, with range [tPatchSize + 1, infinity)
66 * @param imagePaddingElements The number of padding elements at the end of each image row, in elements, with range [0, infinity)
67 * @param buffer The target buffer with `tChannels * tSize * tSize` elements, must be valid
68 * @param position The center position of the square region in the image, with range [tPatchSize/2, width - tPatchSize/2 - 1)x[tPatchSize/2, height - tPatchSize/2 - 1)
69 * @tparam tChannels The number of frame channels, with range [1, infinity)
70 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
71 * @tparam tPixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER'
72 * @tparam TScalar The scalar data type of the sub-pixel position
73 */
74 template <unsigned int tChannels, unsigned int tPatchSize, PixelCenter tPixelCenter = PC_TOP_LEFT, typename TScalar = Scalar>
75 static inline void interpolateSquarePatch8BitPerChannel(const uint8_t* const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t* buffer, const VectorT2<TScalar>& position);
76};
77
78template <>
79template <unsigned int tPatchSize>
80inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<1u>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
81{
82 ocean_assert(tPatchSize >= 5u);
83
84 ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
85 ocean_assert(imageStrideElements >= 1u * tPatchSize);
86
87 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
88
89 const unsigned int factorLeft = 128u - factorRight;
90 const unsigned int factorTop = 128u - factorBottom;
91
92 constexpr unsigned int blocks15 = tPatchSize / 15u;
93 constexpr unsigned int remainingAfterBlocks15 = tPatchSize % 15u;
94
95 constexpr bool partialBlock15 = remainingAfterBlocks15 > 10u;
96 constexpr unsigned int remainingAfterPartialBlock15 = partialBlock15 ? 0u : remainingAfterBlocks15;
97
98 constexpr bool block7 = remainingAfterPartialBlock15 >= 7u;
99 constexpr unsigned int remainingAfterBlock7 = remainingAfterPartialBlock15 % 7u;
100
101 constexpr bool partialBlock7 = remainingAfterBlock7 >= 3u;
102 constexpr unsigned int remainingAfterPartialBlock7 = partialBlock7 ? 0u : remainingAfterBlock7;
103
104 constexpr unsigned int blocks1 = remainingAfterPartialBlock7;
105
106 const unsigned int factorTopLeft = factorTop * factorLeft;
107 const unsigned int factorTopRight = factorTop * factorRight;
108
109 // TL 0 TR 0 TL 0 TR 0 TL 0 TR 0 TL 0 TR 0
110 const __m128i factorsTop_u_16x8 = _mm_set1_epi32(int(factorTopLeft) | int(factorTopRight) << 16);
111
112 const unsigned int factorBottomLeft = factorBottom * factorLeft;
113 const unsigned int factorBottomRight = factorBottom * factorRight;
114
115 // BL 0 BR 0 BL 0 BR 0 BL 0 BR 0 BL 0 BR 0
116 const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(int(factorBottomLeft) | int(factorBottomRight) << 16);
117
118 for (unsigned int y = 0u; y < tPatchSize; ++y)
119 {
120 SSE::prefetchT0(imageTopLeft + imageStrideElements * 2u);
121 SSE::prefetchT0(imageTopLeft + imageStrideElements * 3u);
122
123 for (unsigned int x = 0u; x < blocks15; ++x)
124 {
125 const __m128i top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
126 const __m128i bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
127
128
129 // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
130 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
131 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
132
133 // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
134 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
135 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
136
137
138 // A B C D E F G H I J K L M N O P -> I 0 J 0 K 0 L 0 M 0 N 0 O 0 P 0
139 const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
140 const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
141
142 // A B C D E F G H I J K L M N O P -> J 0 K 0 L 0 M 0 N 0 O 0 P 0 0 0
143 const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
144 const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
145
146
147 // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
148 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
149 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
150
151 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
152 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
153
154
155 const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
156 const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
157
158 const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
159 const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
160
161
162 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
163 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
164
165 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
166
167
168 const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
169 const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
170
171 const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
172
173
174 // A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
175 // I 0 J 0 K 0 L 0 M 0 N 0 O 0 0 0 -> A B C D E F G H I J K L M N O 0
176 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
177
178
179 const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks15) && (!block7 && !partialBlock7 && blocks1 == 0u);
180
181 if (isLastBlock)
182 {
183 memcpy(buffer, &result_u_8x16, 15);
184 }
185 else
186 {
187 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
188 }
189
190 imageTopLeft += 15;
191 buffer += 15;
192 }
193
194 if constexpr (partialBlock15)
195 {
196 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
197
198 __m128i top_u_8x16;
199 __m128i bottom_u_8x16;
200
201 if (y < tPatchSize - 1u)
202 {
203 top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
204 bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
205 }
206 else
207 {
208 memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlocks15 + 1u);
209 memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlocks15 + 1u);
210 }
211
212
213 // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
214 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
215 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
216
217 // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
218 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
219 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
220
221
222 // A B C D E F G H I J K L M N O P -> I 0 J 0 K 0 L 0 M 0 N 0 O 0 P 0
223 const __m128i topHighA_u_16x8 = _mm_unpackhi_epi8(top_u_8x16, _mm_setzero_si128());
224 const __m128i bottomHighA_u_16x8 = _mm_unpackhi_epi8(bottom_u_8x16, _mm_setzero_si128());
225
226 // A B C D E F G H I J K L M N O P -> J 0 K 0 L 0 M 0 N 0 O 0 P 0 0 0
227 const __m128i topHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
228 const __m128i bottomHighB_u_16x8 = _mm_unpackhi_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
229
230
231 // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
232 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
233 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
234
235 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
236 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
237
238
239 const __m128i topHighA_u_32x4 = _mm_madd_epi16(topHighA_u_16x8, factorsTop_u_16x8);
240 const __m128i bottomHighA_u_32x4 = _mm_madd_epi16(bottomHighA_u_16x8, factorsBottom_u_16x8);
241
242 const __m128i topHighB_u_32x4 = _mm_madd_epi16(topHighB_u_16x8, factorsTop_u_16x8);
243 const __m128i bottomHighB_u_32x4 = _mm_madd_epi16(bottomHighB_u_16x8, factorsBottom_u_16x8);
244
245
246 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
247 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
248
249 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
250
251
252 const __m128i resultHighA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighA_u_32x4, bottomHighA_u_32x4), _mm_set1_epi32(8192)), 14);
253 const __m128i resultHighB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topHighB_u_32x4, bottomHighB_u_32x4), _mm_set1_epi32(8192)), 14);
254
255 const __m128i resultHigh_u_16x8 = _mm_or_si128(resultHighA_u_32x4, _mm_slli_epi32(resultHighB_u_32x4, 16));
256
257
258 // A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
259 // I 0 J 0 K 0 L 0 M 0 N 0 O 0 0 0 -> A B C D E F G H I J K L M N O 0
260 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, resultHigh_u_16x8);
261
262 ocean_assert(!block7 && !partialBlock7 && blocks1 == 0u);
263 const bool isLastBlock = y + 1u == tPatchSize;
264
265 if (isLastBlock)
266 {
267 memcpy(buffer, &result_u_8x16, remainingAfterBlocks15);
268 }
269 else
270 {
271 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
272 }
273
274 imageTopLeft += remainingAfterBlocks15;
275 buffer += remainingAfterBlocks15;
276 }
277
278 if constexpr (block7)
279 {
280 const __m128i top_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft));
281 const __m128i bottom_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft + imageStrideElements));
282
283
284 // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
285 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
286 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
287
288 // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
289 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
290 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
291
292
293 // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
294 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
295 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
296
297 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
298 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
299
300
301 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
302 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
303
304 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
305
306 // A 0 B 0 C 0 D 0 E 0 F 0 H 0 H 0 -> A B C D E F G H X X X X X X X X
307 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
308
309 const bool isLastBlock = (y + 1u == tPatchSize) && (!partialBlock7 && blocks1 == 0u);
310
311 if (isLastBlock)
312 {
313 memcpy(buffer, &result_u_8x16, 7);
314 }
315 else
316 {
317 _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
318 }
319
320 imageTopLeft += 7;
321 buffer += 7;
322 }
323
324 if constexpr (partialBlock7)
325 {
326 ocean_assert(blocks1 == 0u);
327
328 __m128i top_u_8x16;
329 __m128i bottom_u_8x16;
330
331 if (y < tPatchSize - 1u)
332 {
333 top_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft));
334 bottom_u_8x16 = _mm_loadl_epi64((const __m128i*)(imageTopLeft + imageStrideElements));
335 }
336 else
337 {
338 memcpy(&top_u_8x16, imageTopLeft, remainingAfterBlock7 + 1u);
339 memcpy(&bottom_u_8x16, imageTopLeft + imageStrideElements, remainingAfterBlock7 + 1u);
340 }
341
342
343 // A B C D E F G H I J K L M N O P -> A 0 B 0 C 0 D 0 E 0 F 0 G 0 H 0
344 const __m128i topLowA_u_16x8 = _mm_unpacklo_epi8(top_u_8x16, _mm_setzero_si128());
345 const __m128i bottomLowA_u_16x8 = _mm_unpacklo_epi8(bottom_u_8x16, _mm_setzero_si128());
346
347 // A B C D E F G H I J K L M N O P -> B 0 C 0 D 0 E 0 F 0 G 0 H 0 I 0
348 const __m128i topLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(top_u_8x16, 1), _mm_setzero_si128());
349 const __m128i bottomLowB_u_16x8 = _mm_unpacklo_epi8(_mm_srli_si128(bottom_u_8x16, 1), _mm_setzero_si128());
350
351
352 // A * TL + B * TR, C * TL + D * TR, E * TL + F * TR, G * TL + H * TR
353 const __m128i topLowA_u_32x4 = _mm_madd_epi16(topLowA_u_16x8, factorsTop_u_16x8);
354 const __m128i bottomLowA_u_32x4 = _mm_madd_epi16(bottomLowA_u_16x8, factorsBottom_u_16x8);
355
356 const __m128i topLowB_u_32x4 = _mm_madd_epi16(topLowB_u_16x8, factorsTop_u_16x8);
357 const __m128i bottomLowB_u_32x4 = _mm_madd_epi16(bottomLowB_u_16x8, factorsBottom_u_16x8);
358
359
360 const __m128i resultLowA_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowA_u_32x4, bottomLowA_u_32x4), _mm_set1_epi32(8192)), 14);
361 const __m128i resultLowB_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topLowB_u_32x4, bottomLowB_u_32x4), _mm_set1_epi32(8192)), 14);
362
363 const __m128i resultLow_u_16x8 = _mm_or_si128(resultLowA_u_32x4, _mm_slli_epi32(resultLowB_u_32x4, 16));
364
365 // A 0 B 0 C 0 D 0 E 0 F 0 H 0 H 0 -> A B C D E F G H X X X X X X X X
366 const __m128i result_u_8x16 = _mm_packus_epi16(resultLow_u_16x8, _mm_setzero_si128());
367
368 ocean_assert(blocks1 == 0u);
369 const bool isLastBlock = y + 1u == tPatchSize;
370
371 if (isLastBlock)
372 {
373 memcpy(buffer, &result_u_8x16, remainingAfterBlock7);
374 }
375 else
376 {
377 _mm_storel_epi64((__m128i*)buffer, result_u_8x16);
378 }
379
380 imageTopLeft += remainingAfterBlock7;
381 buffer += remainingAfterBlock7;
382 }
383
384 if constexpr (blocks1 != 0u)
385 {
386 const uint8_t* const imageBottomLeft = imageTopLeft + imageStrideElements;
387
388 for (unsigned int n = 0u; n < blocks1; ++n)
389 {
390 buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[1u + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[1u + n] * factorBottomRight + 8192u) / 16384u);
391 }
392
393 imageTopLeft += blocks1;
394 buffer += blocks1;
395 }
396
397 imageTopLeft += imageStrideElements - tPatchSize;
398 }
399}
400
401template <>
402template <unsigned int tPatchSize>
403inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<3u>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
404{
405 ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
406 ocean_assert(imageStrideElements >= 1u * tPatchSize);
407
408 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
409
410 const unsigned int factorLeft = 128u - factorRight;
411 const unsigned int factorTop = 128u - factorBottom;
412
413 constexpr unsigned int blocks4 = tPatchSize / 4u;
414 constexpr unsigned int remainingAfterBlocks4 = tPatchSize % 4u;
415
416 constexpr bool partialBlock4 = remainingAfterBlocks4 >= 2u;
417 constexpr unsigned int remainingAfterPartialBlock4 = partialBlock4 ? 0u : remainingAfterBlocks4;
418
419 constexpr unsigned int blocks1 = remainingAfterPartialBlock4;
420
421 const unsigned int factorTopLeft = factorTop * factorLeft;
422 const unsigned int factorTopRight = factorTop * factorRight;
423
424 // TL 0 TR 0 TL 0 TR 0 TL 0 TR 0 TL 0 TR 0
425 const __m128i factorsTop_u_16x8 = _mm_set1_epi32(int(factorTopLeft) | int(factorTopRight) << 16);
426
427 const unsigned int factorBottomLeft = factorBottom * factorLeft;
428 const unsigned int factorBottomRight = factorBottom * factorRight;
429
430 // BL 0 BR 0 BL 0 BR 0 BL 0 BR 0 BL 0 BR 0
431 const __m128i factorsBottom_u_16x8 = _mm_set1_epi32(int(factorBottomLeft) | int(factorBottomRight) << 16);
432
433 for (unsigned int y = 0u; y < tPatchSize; ++y)
434 {
435 SSE::prefetchT0(imageTopLeft + imageStrideElements * 2u);
436 SSE::prefetchT0(imageTopLeft + imageStrideElements * 3u);
437
438 for (unsigned int x = 0u; x < blocks4; ++x)
439 {
440 const bool canReadLastElements = y < tPatchSize - 1u || x < blocks4 - 1u;
441
442 __m128i top_u_8x16;
443 __m128i bottom_u_8x16;
444
445 if (canReadLastElements)
446 {
447 top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
448 bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
449 }
450 else
451 {
452 top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft - 1)), 1);
453 bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements - 1u)), 1);
454 }
455
456
457 // de-interleaving
458
459 // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 0
460 // -> R0 0 R1 0 R1 0 R2 0 R2 0 R3 0 R3 0 R4 0
461 const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
462 const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
463
464 // -> G0 0 G1 0 G1 0 G2 0 G2 0 G3 0 G3 0 G4 0
465 const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
466 const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
467
468 // -> B0 0 B1 0 B1 0 B2 0 B2 0 B3 0 B3 0 B4 0
469 const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
470 const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
471
472
473 // R0 * TL + R1 * TR, R1 * TL + R2 * TR, R2 * TL + R3 * TR, R3 * TL + R4 * TR
474 const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
475 const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
476 const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
477
478 const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
479 const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
480 const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
481
482
483 const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
484 const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
485 const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
486
487
488 // interleaving
489
490 const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
491 const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
492 const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
493
494
495 const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
496
497 const bool isLastBlock = (y + 1u == tPatchSize) && (x + 1u == blocks4) && (!partialBlock4 && blocks1 <= 1u);
498
499 if (isLastBlock)
500 {
501 uint8_t tempBuffer[16];
502 _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
503
504 memcpy(buffer, tempBuffer, 12);
505 }
506 else
507 {
508 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
509 }
510
511 imageTopLeft += 12;
512 buffer += 12;
513 }
514
515 if (partialBlock4)
516 {
517 const bool canReadLastElements = y < tPatchSize - 1u;
518
519 __m128i top_u_8x16;
520 __m128i bottom_u_8x16;
521
522 if (canReadLastElements)
523 {
524 top_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft));
525 bottom_u_8x16 = _mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements));
526 }
527 else
528 {
529 constexpr unsigned int overlappingElements = 16u - (remainingAfterBlocks4 * 3u + 3u);
530 ocean_assert(overlappingElements < 16u);
531
532 top_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft - overlappingElements)), overlappingElements);
533 bottom_u_8x16 = _mm_srli_si128(_mm_lddqu_si128((const __m128i*)(imageTopLeft + imageStrideElements - overlappingElements)), overlappingElements);
534 }
535
536
537 // de-interleaving
538
539 // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 0
540 // -> R0 0 R1 0 R1 0 R2 0 R2 0 R3 0 R3 0 R4 0
541 const __m128i topChannel0_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
542 const __m128i bottomChannel0_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0cFF09FF09FF06ull, 0xFF06FF03FF03FF00ull));
543
544 // -> G0 0 G1 0 G1 0 G2 0 G2 0 G3 0 G3 0 G4 0
545 const __m128i topChannel1_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
546 const __m128i bottomChannel1_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0dFF0aFF0aFF07ull, 0xFF07FF04FF04FF01ull));
547
548 // -> B0 0 B1 0 B1 0 B2 0 B2 0 B3 0 B3 0 B4 0
549 const __m128i topChannel2_u_16x8 = _mm_shuffle_epi8(top_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
550 const __m128i bottomChannel2_u_16x8 = _mm_shuffle_epi8(bottom_u_8x16, SSE::set128i(0xFF0eFF0bFF0bFF08ull, 0xFF08FF05FF05FF02ull));
551
552
553 // R0 * TL + R1 * TR, R1 * TL + R2 * TR, R2 * TL + R3 * TR, R3 * TL + R4 * TR
554 const __m128i topChannel0_u_32x4 = _mm_madd_epi16(topChannel0_u_16x8, factorsTop_u_16x8);
555 const __m128i topChannel1_u_32x4 = _mm_madd_epi16(topChannel1_u_16x8, factorsTop_u_16x8);
556 const __m128i topChannel2_u_32x4 = _mm_madd_epi16(topChannel2_u_16x8, factorsTop_u_16x8);
557
558 const __m128i bottomChannel0_u_32x4 = _mm_madd_epi16(bottomChannel0_u_16x8, factorsBottom_u_16x8);
559 const __m128i bottomChannel1_u_32x4 = _mm_madd_epi16(bottomChannel1_u_16x8, factorsBottom_u_16x8);
560 const __m128i bottomChannel2_u_32x4 = _mm_madd_epi16(bottomChannel2_u_16x8, factorsBottom_u_16x8);
561
562
563 const __m128i resultChannel0_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel0_u_32x4, bottomChannel0_u_32x4), _mm_set1_epi32(8192)), 14);
564 const __m128i resultChannel1_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel1_u_32x4, bottomChannel1_u_32x4), _mm_set1_epi32(8192)), 14);
565 const __m128i resultChannel2_u_32x4 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(topChannel2_u_32x4, bottomChannel2_u_32x4), _mm_set1_epi32(8192)), 14);
566
567
568 // interleaving
569
570 const __m128i interleavedA_u_8x16 = _mm_shuffle_epi8(resultChannel0_u_32x4, SSE::set128i(0xFFFFFFFFFFFF0cFFull, 0xFF08FFFF04FFFF00ull));
571 const __m128i interleavedB_u_8x16 = _mm_shuffle_epi8(resultChannel1_u_32x4, SSE::set128i(0xFFFFFFFFFF0cFFFFull, 0x08FFFF04FFFF00FFull));
572 const __m128i interleavedC_u_8x16 = _mm_shuffle_epi8(resultChannel2_u_32x4, SSE::set128i(0xFFFFFFFF0cFFFF08ull, 0xFFFF04FFFF00FFFFull));
573
574
575 const __m128i result_u_8x16 = _mm_or_si128(interleavedA_u_8x16, _mm_or_si128(interleavedB_u_8x16, interleavedC_u_8x16));
576
577 ocean_assert(blocks1 == 0u);
578
579 const bool isLastBlock = y + 1u == tPatchSize;
580
581 if (isLastBlock)
582 {
583 uint8_t tempBuffer[16];
584 _mm_storeu_si128((__m128i*)tempBuffer, result_u_8x16);
585
586 memcpy(buffer, tempBuffer, remainingAfterBlocks4 * 3u);
587 }
588 else
589 {
590 _mm_storeu_si128((__m128i*)buffer, result_u_8x16);
591 }
592
593 imageTopLeft += remainingAfterBlocks4 * 3u;
594 buffer += remainingAfterBlocks4 * 3u;
595 }
596
597 if constexpr (blocks1 != 0u)
598 {
599 const uint8_t* const imageBottomLeft = imageTopLeft + imageStrideElements;
600
601 for (unsigned int n = 0u; n < blocks1; ++n)
602 {
603 for (unsigned int c = 0u; c < 3u; ++c)
604 {
605 buffer[n * 3u + c] = uint8_t((imageTopLeft[n * 3u + c] * factorTopLeft + imageTopLeft[n * 3u + 3u + c] * factorTopRight + imageBottomLeft[n * 3u + c] * factorBottomLeft + imageBottomLeft[n * 3u + 3u + c] * factorBottomRight + 8192u) / 16384u);
606 }
607 }
608
609 imageTopLeft += blocks1 * 3u;
610 buffer += blocks1 * 3u;
611 }
612
613 imageTopLeft += imageStrideElements - tPatchSize * 3u;
614 }
615}
616
617template <unsigned int tChannels>
618template <unsigned int tPatchSize>
619inline void AdvancedFrameInterpolatorBilinearSSE::SpecializedForChannels<tChannels>::interpolateSquarePatch8BitPerChannel(const uint8_t* imageTopLeft, const unsigned int imageStrideElements, uint8_t* buffer, const unsigned int factorRight, const unsigned int factorBottom)
620{
621 ocean_assert(imageTopLeft != nullptr && buffer != nullptr);
622 ocean_assert(imageStrideElements >= 1u * tPatchSize);
623
624 ocean_assert(factorRight <= 128u && factorBottom <= 128u);
625
626 const unsigned int factorLeft = 128u - factorRight;
627 const unsigned int factorTop = 128u - factorBottom;
628
629 const unsigned int factorTopLeft = factorTop * factorLeft;
630 const unsigned int factorTopRight = factorTop * factorRight;
631
632 const unsigned int factorBottomLeft = factorBottom * factorLeft;
633 const unsigned int factorBottomRight = factorBottom * factorRight;
634
635 const uint8_t* imageBottomLeft = imageTopLeft + imageStrideElements;
636
637 for (unsigned int y = 0u; y < tPatchSize; ++y)
638 {
639 for (unsigned int x = 0u; x < tPatchSize; ++x)
640 {
641 for (unsigned int n = 0u; n < tChannels; ++n)
642 {
643 buffer[n] = uint8_t((imageTopLeft[n] * factorTopLeft + imageTopLeft[tChannels + n] * factorTopRight + imageBottomLeft[n] * factorBottomLeft + imageBottomLeft[tChannels + n] * factorBottomRight + 8192u) / 16384u);
644 }
645
646 imageTopLeft += tChannels;
647 imageBottomLeft += tChannels;
648
649 buffer += tChannels;
650 }
651
652 imageTopLeft += imageStrideElements - tChannels * tPatchSize;
653 imageBottomLeft += imageStrideElements - tChannels * tPatchSize;
654 }
655}
656
657template <unsigned int tChannels, unsigned int tPatchSize, PixelCenter tPixelCenter, typename TScalar>
658inline void AdvancedFrameInterpolatorBilinearSSE::interpolateSquarePatch8BitPerChannel(const uint8_t* const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t* buffer, const VectorT2<TScalar>& position)
659{
660 static_assert(tChannels >= 1u, "Invalid channel number!");
661 static_assert(tPatchSize % 2u == 1u, "Invalid patch size!");
662
663 ocean_assert(image != nullptr && buffer != nullptr);
664 ocean_assert(tPatchSize + 1u <= width);
665
666 constexpr unsigned int tPatchSize_2 = tPatchSize / 2u;
667
668 const unsigned int imageStrideElements = width * tChannels + imagePaddingElements;
669
670 const VectorT2<TScalar> shiftedPosition = tPixelCenter == PC_TOP_LEFT ? position : position - VectorT2<TScalar>(TScalar(0.5), TScalar(0.5));
671
672 ocean_assert(shiftedPosition.x() >= TScalar(tPatchSize_2) && shiftedPosition.y() >= TScalar(tPatchSize_2));
673 ocean_assert(shiftedPosition.x() < TScalar(width - tPatchSize_2 - 1u));
674
675 const unsigned int left = (unsigned int)(shiftedPosition.x()) - tPatchSize_2;
676 const unsigned int top = (unsigned int)(shiftedPosition.y()) - tPatchSize_2;
677
678 ocean_assert(left + tPatchSize < width);
679
680 const TScalar tx = shiftedPosition.x() - TScalar(int(shiftedPosition.x()));
681 ocean_assert(tx >= TScalar(0) && tx <= TScalar(1));
682 const unsigned int factorRight = (unsigned int)(tx * TScalar(128) + TScalar(0.5));
683
684 const TScalar ty = shiftedPosition.y() - TScalar(int(shiftedPosition.y()));
685 ocean_assert(ty >= 0 && ty <= 1);
686 const unsigned int factorBottom = (unsigned int)(ty * TScalar(128) + TScalar(0.5));
687
688 const uint8_t* const imageTopLeft = image + top * imageStrideElements + left * tChannels;
689
690 SpecializedForChannels<tChannels>::template interpolateSquarePatch8BitPerChannel<tPatchSize>(imageTopLeft, imageStrideElements, buffer, factorRight, factorBottom);
691}
692
693}
694
695}
696
697}
698
699#endif // OCEAN_HARDWARE_SSE_VERSION >= 41
700
701#endif // META_OCEAN_CV_ADVANCED_ADVANCED_FRAME_INTERPOLATOR_BILINEAR_SSE_H
This class allows to specialize functions for individual channels.
Definition AdvancedFrameInterpolatorBilinearSSE.h:42
static void interpolateSquarePatch8BitPerChannel(const uint8_t *imageTopLeft, const unsigned int imageStrideElements, uint8_t *buffer, const unsigned int factorRight, const unsigned int factorBottom)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and sto...
Definition AdvancedFrameInterpolatorBilinearSSE.h:619
This class implements advanced bilinear frame interpolation functions using SSE extensions.
Definition AdvancedFrameInterpolatorBilinearSSE.h:33
static void interpolateSquarePatch8BitPerChannel(const uint8_t *const image, const unsigned int width, const unsigned int imagePaddingElements, uint8_t *buffer, const VectorT2< TScalar > &position)
Interpolates the content of a square image patch with sub-pixel accuracy inside a given image and sto...
Definition AdvancedFrameInterpolatorBilinearSSE.h:658
static void prefetchT0(const void *const data)
Prefetches a block of temporal memory into all cache levels.
Definition SSE.h:1255
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3770
This class implements a vector with two elements.
Definition Vector2.h:96
const T & x() const noexcept
Returns the x value.
Definition Vector2.h:710
const T & y() const noexcept
Returns the y value.
Definition Vector2.h:722
@ PC_TOP_LEFT
The center of a pixel is in the upper-left corner of each pixel's square.
Definition CV.h:133
The namespace covering the entire Ocean framework.
Definition Accessor.h:15