Ocean
Loading...
Searching...
No Matches
FrameInterpolatorBilinear.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_FRAME_INTERPOLATOR_BILINEAR_H
9#define META_OCEAN_CV_FRAME_INTERPOLATOR_BILINEAR_H
10
11#include "ocean/cv/CV.h"
13#include "ocean/cv/NEON.h"
15#include "ocean/cv/SSE.h"
16
17#include "ocean/base/DataType.h"
18#include "ocean/base/Frame.h"
19#include "ocean/base/Memory.h"
20#include "ocean/base/Worker.h"
21
23
27#include "ocean/math/Lookup2.h"
32#include "ocean/math/Vector2.h"
33
34namespace Ocean
35{
36
37namespace Test { namespace TestCV { class TestFrameInterpolatorBilinearNEON; } }
38
39namespace CV
40{
41
42/**
43 * This class implements bilinear frame interpolator functions.
44 * @ingroup cv
45 */
46class OCEAN_CV_EXPORT FrameInterpolatorBilinear
47{
49
50 public:
51
52 /**
53 * Definition of a lookup table for 2D vectors.
54 */
56
57 public:
58
59 /**
60 * The following comfort class provides comfortable functions simplifying prototyping applications but also increasing binary size of the resulting applications.
61 * Best practice is to avoid using these functions if binary size matters,<br>
62 * as for every comfort function a corresponding function exists with specialized functionality not increasing binary size significantly.<br>
63 */
64 class OCEAN_CV_EXPORT Comfort
65 {
66 public:
67
68 /**
69 * Resizes/rescales a given source frame into a given target frame by application of a bilinear interpolation.
70 * @param source The source frame to resize, must be valid
71 * @param target The resulting target frame, with identical pixel format and pixel origin as the source frame, must be valid
72 * @param worker Optional worker object used for load distribution, nullptr by default
73 * @return True, if the frame could be resized
74 */
75 static bool resize(const Frame& source, Frame& target, Worker* worker = nullptr);
76
77 /**
78 * Resizes/rescales a given frame to a specified resolution by application of a bilinear interpolation.
79 * @param frame The frame to resize, must be valid
80 * @param width The width of the resized frame in pixel, with range [1, infinity)
81 * @param height The height of the resized frame in pixel, with range [1, infinity)
82 * @param worker Optional worker object used for load distribution, nullptr by default
83 * @return True, if the frame could be resized
84 */
85 static inline bool resize(Frame& frame, const unsigned int width, const unsigned int height, Worker* worker = nullptr);
86
87 /**
88 * Zooms into a given source frame (or zooms out) and stores the zoomed image content in a target frame.
89 * The resulting zoomed image will have the same frame type (frame resolution, pixel format, pixel origin) as the source image.<br>
90 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
91 * @param source The source frame for which the zoomed image content will be created, must be valid
92 * @param target The resulting target frame which will receive the zoomed image, will be set to the same frame type as the source frame, can be invalid
93 * @param zoomFactor The zoom factor to be applied, a factor < 1 will zoom out, a factor > 1 will zoom in, with range (0, infinity)
94 * @param worker Optional worker object to distribute the computation to several CPU cores, nullptr by default
95 * @return True, if succeeded
96 */
97 static bool zoom(const Frame& source, Frame& target, const Scalar zoomFactor, Worker* worker = nullptr);
98
99 /**
100 * Transforms a given input frame into an output frame (with arbitrary frame dimension) by application of a homography.
101 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
102 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the transformation of the homography).<br>
103 * The 'outputOrigin' parameter simply applies an additional translation onto the provided homography i.e., homography * create_translation_matrix3x3(outputOrigin.x(), outputOrigin.y()).<br>
104 * Information: This function is equivalent to OpenCV's cv::warpPerspective().
105 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
106 * @param input The input frame that will be transformed, must be valid
107 * @param output The output frame resulting from the application of the given homography, with same pixel format and pixel origin as the input frame, must be valid
108 * @param input_H_output Homography used to transform the given input frame by the following equation: inputPoint = input_H_output * outputPoint, must be valid
109 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels and the data type of the pixel elements, nullptr to assign 0 to each channel
110 * @param worker Optional worker object to distribute the computational load
111 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
112 * @return True, if succeeded
113 */
114 static bool homography(const Frame& input, Frame& output, const SquareMatrix3& input_H_output, const void* borderColor = nullptr, Worker* worker = nullptr, const PixelPositionI& outputOrigin = PixelPositionI(0, 0));
115
116 /**
117 * Transforms a given input frame into an output frame (with arbitrary frame dimension) by application of four homographies.
118 * For each quadrant of the output frame an individual homography is applied while the final result is interpolated between the four homographies.<br>
119 * The quadrant order of the homographies is as follows: top left, top right, bottom left, bottom right.<br>
120 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
121 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the transformation of the homographies).<br>
122 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
123 * @param input The input frame that will be transformed
124 * @param output The output frame resulting from the application of the given homographies, with same pixel format and pixel origin as the input frame, must have a valid dimension
125 * @param homographies Four homographies used to transform the given input frame, transforming points defined in the output frame into points defined in the input frame
126 * @param outputQuadrantCenter The center position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, output.width())x[0, output.height())
127 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
128 * @param worker Optional worker object to distribute the computational load
129 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
130 * @return True, if succeeded
131 */
132 static bool homographies(const Frame& input, Frame& output, const SquareMatrix3 homographies[4], const Vector2& outputQuadrantCenter, const uint8_t* borderColor = nullptr, Worker* worker = nullptr, const PixelPositionI& outputOrigin = PixelPositionI(0, 0));
133
134 /**
135 * Transforms a given input frame into an output frame (with arbitrary frame dimension) by application of a homography.
136 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
137 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the transformation of the homography).<br>
138 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame, further these pixels are untouched in the output frame.<br>
139 * The 'outputOrigin' parameter simply applies an additional translation onto the provided homography i.e., homography * create_translation_matrix3x3(outputOrigin.x(), outputOrigin.y()).
140 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
141 * @param input The input frame that will be transformed, must be valid
142 * @param output The output frame resulting from the application of the given homography, with same pixel format and pixel origin as the input frame, must have a valid dimension
143 * @param outputMask Resulting mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame, must be valid and must have the same frame dimension as the output frame
144 * @param input_H_output Homography used to transform the given input frame by the following equation: inputPoint = input_H_output * outputPoint, must be valid
145 * @param worker Optional worker object to distribute the computational load
146 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
147 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
148 * @return True, if succeeded
149 * @see coversHomographyInputFrame().
150 */
151 static bool homographyMask(const Frame& input, Frame& output, Frame& outputMask, const SquareMatrix3& input_H_output, Worker* worker = nullptr, const uint8_t maskValue = 0xFF, const PixelPositionI& outputOrigin = PixelPositionI(0, 0));
152
153 /**
154 * Transforms a given input frame into an output frame (with arbitrary frame dimension) by application of four homographies.
155 * For each quadrant of the output frame an individual homography is applied while the final result is interpolated between the four homographies.<br>
156 * The quadrant order of the homographies is as follows: top left, top right, bottom left, bottom right.<br>
157 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
158 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the transformation of the homographies).<br>
159 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame, further these pixels are untouched in the output frame.<br>
160 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
161 * @param input The input frame that will be transformed, must be valid
162 * @param output The output frame resulting from the application of the given homographies, with same pixel format and pixel origin as the input frame, must have a valid dimension
163 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
164 * @param homographies Four homographies used to transform the given input frame, transforming points defined in the output frame into points defined in the input frame
165 * @param outputQuadrantCenter The center position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, output.width())x[0, output.height())
166 * @param worker Optional worker object to distribute the computational load
167 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
168 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
169 * @return True, if succeeded
170 * @see coversHomographyInputFrame().
171 */
172 static bool homographiesMask(const Frame& input, Frame& output, Frame& outputMask, const SquareMatrix3* homographies, const Vector2& outputQuadrantCenter, Worker* worker = nullptr, const uint8_t maskValue = 0xFF, const PixelPositionI& outputOrigin = PixelPositionI(0, 0));
173
174 /**
175 * Transforms a given input frame into an output frame by application of a homography.
176 * This function also uses a camera profile to improve the interpolation accuracy.<br>
177 * The given homography is transformed into a homography for normalized image coordinates.<br>
178 * Thus, also distortion parameters of the camera profile can be applied.<br>
179 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
180 * @param inputCamera The pinhole camera profile to be applied for the input frame
181 * @param outputCamera The pinhole camera profile to be applied for the output frame
182 * @param input The input frame that will be transformed, the frame dimension must match the dimension of the input camera
183 * @param output The output frame resulting from the application of the given homography, the frame dimension must match the dimension of the output camera
184 * @param homography The homography used to transform the given input frame, which includes both camera profiles: H = Ki * H' * Ko^-1
185 * @param useDistortionParameters True, to apply the distortion parameters of the camera profile
186 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
187 * @param worker Optional worker object to distribute the computational load
188 * @return True, if succeeded
189 * @see homographyWithCameraMask(), homography().
190 */
191 static bool homographyWithCamera(const PinholeCamera& inputCamera, const PinholeCamera& outputCamera, const Frame& input, Frame& output, const SquareMatrix3& homography, const bool useDistortionParameters, const uint8_t* borderColor = nullptr, Worker* worker = nullptr);
192
193 /**
194 * Transforms a given input frame into an output frame by application of a homography.
195 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame, further these pixels are untouched in the output frame.<br>
196 * This function also uses a camera profile to improve the interpolation accuracy.<br>
197 * The given homography is transformed into a homography for normalized image coordinates.<br>
198 * Thus, also distortion parameters of the camera profile can be applied.<br>
199 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
200 * @param inputCamera The camera profile to be applied for the input frame (TODO confirm: formerly documented as a pinhole camera profile, while the declared type is AnyCamera)
201 * @param outputCamera The camera profile to be applied for the output frame (TODO confirm: formerly documented as a pinhole camera profile, while the declared type is AnyCamera)
202 * @param input The input frame that will be transformed, the frame dimension must match the dimension of the input camera
203 * @param output The output frame resulting from the application of the given homography, the frame dimension must match the dimension of the output camera
204 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
205 * @param homography The homography used to transform the given input frame, which includes both camera profiles: H = Ki * H' * Ko^-1
206 * @param worker Optional worker object to distribute the computational load
207 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
208 * @return True, if succeeded
209 * @see homographyWithCamera(), homography().
210 */
211 static bool homographyWithCameraMask(const AnyCamera& inputCamera, const AnyCamera& outputCamera, const Frame& input, Frame& output, Frame& outputMask, const SquareMatrix3& homography, Worker* worker = nullptr, const uint8_t maskValue = 0xFFu);
212
213 /**
214 * Transforms a given input frame into an output frame by application of an interpolation lookup table.
215 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
216 * Information: This function is equivalent to OpenCV's cv::remap().
217 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
218 * @param input The input frame that will be transformed
219 * @param output Resulting output frame, the dimension will be set to match the size of the lookup table, pixel format and pixel origin will be set to match the given input frame
220 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
221 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
222 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel; this parameter has no default value and must be passed explicitly
223 * @param worker Optional worker object to distribute the computation
224 * @return True, if succeeded
225 */
226 static bool lookup(const Frame& input, Frame& output, const LookupTable& input_LT_output, const bool offset, const void* borderColor, Worker* worker = nullptr);
227
228 /**
229 * Transforms a given input frame into an output frame by application of an interpolation lookup table and creates an additional mask as output.
230 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
231 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame, further these pixels are untouched in the output frame.<br>
232 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
233 * @param input The input frame which will be transformed
234 * @param output Resulting output frame, the frame dimension will be set to match the size of the lookup table, pixel format and pixel origin will be set to match the given input frame
235 * @param outputMask Resulting mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
236 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
237 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
238 * @param worker Optional worker object to distribute the computation
239 * @param maskValue 8 bit mask values for pixels lying inside the input frame, pixels lying outside the input frame will be assigned with (0xFF - maskValue)
240 * @return True, if succeeded
241 */
242 static bool lookupMask(const Frame& input, Frame& output, Frame& outputMask, const LookupTable& input_LT_output, const bool offset, Worker* worker = nullptr, const uint8_t maskValue = 0xFF);
243
244 /**
245 * Applies an affine transformation to an image.
246 * The target frame must have the same pixel format and pixel origin as the source frame, however the dimension (and position) of the target frame can be arbitrary.
247 * This function allows the creation of a target frame fully covering the source frame (if the position and dimension of the target frame cover the transformation of the affine transformation).
248 * The multiplication of the affine transformation with a pixel location in the target image yields its location in the source image, i.e., sourcePoint = source_A_target * targetPoint.
249 * The parameter 'targetOrigin' applies an additional translation to the provided affine transformation i.e., source_A_target * create_translation_matrix3x3(targetOrigin.x(), targetOrigin.y()).
250 * Please note that here the affine transformation is specified as a 3-by-3 matrix (in contrast to the more commonly used 2-by-3 matrix) and should take the form:
251 * <pre>
252 * a c e
253 * b d f
254 * 0 0 1
255 * </pre>
256 * However, this function disregards the last row completely and only uses the top two rows, i.e., the elements a through f.
257 * Information: This function is equivalent to OpenCV's cv::warpAffine().
258 * Note: For applications running on mobile devices, in order to keep the impact on binary size to a minimum please prefer a specialized transformation function (those that work on image pointers instead of Frame instances).
259 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
260 * @param source The source frame that will be transformed, must be valid
261 * @param target The resulting frame after applying the affine transformation to the source frame; pixel format and pixel origin must be identical to source frame; memory of target frame must be allocated by the caller
262 * @param source_A_target Affine transform used to transform the given source frame, transforming points defined in the target frame into points defined in the source frame
263 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
264 * @param worker Optional worker object to distribute the computational load
265 * @param targetOrigin The origin of the target frame defining the global position of the target frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
266 * @return True, if succeeded
267 */
268 static bool affine(const Frame& source, Frame& target, const SquareMatrix3& source_A_target, const uint8_t* borderColor = nullptr, Worker* worker = nullptr, const PixelPositionI& targetOrigin = PixelPositionI(0, 0));
269
270 /**
271 * Rotates a given frame by a bilinear interpolation.
272 * The frame will be rotated around a specified anchor position (inside or outside the frame).<br>
273 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).
274 * @param source The source frame to be rotated, must be valid
275 * @param target The target frame which will receive the rotated image, will be set to the same frame type as the source frame, can be invalid
276 * @param horizontalAnchorPosition Position of the rotation anchor in the horizontal direction, with range (-infinity, infinity)
277 * @param verticalAnchorPosition Position of the rotation anchor in the vertical direction, with range (-infinity, infinity)
278 * @param angle The counter clockwise rotation angle in radian, with range [0, 2PI)
279 * @param worker Optional worker object to distribute the computation to several CPU cores
280 * @param borderColor The color of border pixels for which no visual content exists, provide one value for each channel, nullptr to use 0x00 for each channel
281 * @return True, if succeeded
282 */
283 static bool rotate(const Frame& source, Frame& target, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, Worker* worker = nullptr, const uint8_t* borderColor = nullptr);
284
285 /**
286 * Re-samples a camera image which has been captured with a camera profile as if the image had been captured with a second camera profile.
287 * The function can be used e.g., to rectify a fisheye camera image into a pinhole camera image.
288 * @param sourceFrame The source image captured with the source camera profile, must be valid
289 * @param sourceCamera The source camera profile which has been used to capture the source image, with resolution sourceFrame.width() x sourceFrame.height(), must be valid
290 * @param source_R_target The rotation transforming 3D points defined in the coordinate system of the target camera image to 3D points defined in the coordinate system of the source camera image, must be valid
291 * @param targetCamera The camera profile of the target frame, must be valid
292 * @param targetFrame The resulting target image, with resolution targetCamera.width() x targetCamera.height(), must be valid
293 * @param source_OLT_target Optional resulting offset lookup table between target image points and source image points, nullptr by default
294 * @param worker Optional worker object to distribute the computational load
295 * @param binSizeInPixel The size in pixel of the interpolation bins used for building the lookup table, with range [1, infinity), 8 by default
296 * @param borderColor The color of border pixels for which no visual content exists, provide one value for each channel, nullptr to use ElementType(0) for each channel
297 * @return True, if succeeded
298 * @see resampleCameraImageImage8BitPerChannel().
299 */
300 static bool resampleCameraImage(const Frame& sourceFrame, const AnyCamera& sourceCamera, const SquareMatrix3& source_R_target, const AnyCamera& targetCamera, Frame& targetFrame, LookupCorner2<Vector2>* source_OLT_target = nullptr, Worker* worker = nullptr, const unsigned int binSizeInPixel = 8u, const void* borderColor = nullptr);
301
302 /**
303 * Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame.
304 * This function uses an integer interpolation with a precision of 1/128.
305 * @param frame The frame to determine the pixel values from, must be valid
306 * @param channels Number of channels of the given frame, with range [1, 8]
307 * @param width The width of the frame in pixel, with range [1, infinity)
308 * @param height The height of the frame in pixel, with range [1, infinity)
309 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
310 * @param pixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER'
311 * @param position The position for which the interpolated pixel will be determined, with ranges [0, width - 1]x[0, height - 1] for PC_TOP_LEFT, [0, width]x[0, height] for PC_CENTER
312 * @param result Resulting pixel values, must be valid
313 * @return True, if succeeded
314 * @tparam TScalar The scalar data type of the sub-pixel position, Scalar by default
315 */
316 template <typename TScalar = Scalar>
317 static bool interpolatePixel8BitPerChannel(const uint8_t* frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2<TScalar>& position, uint8_t* result);
318
319 /**
320 * Determines the interpolated pixel values for a given pixel position in a frame with arbitrary data type.
321 * This function uses floating point precision during interpolation.
322 * @param frame The frame to determine the pixel values from, must be valid
323 * @param channels Number of channels of the given frame, with range [1, 8]
324 * @param width The width of the frame in pixel, with range [1, infinity)
325 * @param height The height of the frame in pixel, with range [1, infinity)
326 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
327 * @param pixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER'
328 * @param position The position to determine the interpolated pixel values for, with range [0, width)x[0, height)
329 * @param result Resulting interpolated pixel value(s), must be valid
330 * @param resultBias Optional bias value which will be added to the interpolation result e.g. to handle rounding, with range (-infinity, infinity), default is zero
331 * @return True, if succeeded
332 * @tparam TSource The data type of the provided pixel values in the (source) frame
333 * @tparam TTarget The data type of the resulting interpolated value(s)
334 * @tparam TScalar The data type of each coordinate of the provided interpolation location, should be either Scalar, float, or double, Scalar by default
335 * @tparam TIntermediate The data type of the intermediate interpolation result before assigning the result, TScalar by default
336 */
337 template <typename TSource, typename TTarget, typename TScalar = Scalar, typename TIntermediate = TScalar>
338 static bool interpolatePixel(const TSource* frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2<TScalar>& position, TTarget* result, const TIntermediate& resultBias = TIntermediate(0));
339 };
340
341 /**
342 * This class implements highly optimized interpolation functions with fixed properties.
343 * The functions can be significantly faster as these functions are tailored to the specific properties.
344 */
345 class OCEAN_CV_EXPORT SpecialCases
346 {
347 public:
348
349 /**
350 * Resizes a given FORMAT_Y8 frame with resolution 400x400 to a FORMAT_Y8 frame with resolution 224x224 by using a bilinear interpolation.
351 * This function exploits the fact that lookup locations and interpolation factors repeat after 25 pixels (14 pixels in the target resolution), as 400 / 224 == 25 / 14.
352 * @param source The source frame buffer with resolution 400x400, must be valid
353 * @param target The target frame buffer receiving the resized image information, with resolution 224x224, must be valid
354 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
355 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
356 * @see FrameInterpolatorBilinear::resize<T, tChannels>().
357 */
358 static void resize400x400To224x224_8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements);
359
360 /**
361 * Resizes a given FORMAT_Y8 frame with resolution 400x400 to a FORMAT_Y8 frame with resolution 256x256 by using a bilinear interpolation.
362 * This function exploits the fact that lookup locations and interpolation factors repeat after 25 pixels (16 pixels in the target resolution), as 400 / 256 == 25 / 16.
363 * @param source The source frame buffer with resolution 400x400, must be valid
364 * @param target The target frame buffer receiving the resized image information, with resolution 256x256, must be valid
365 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
366 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
367 * @see FrameInterpolatorBilinear::resize<T, tChannels>().
368 */
369 static void resize400x400To256x256_8BitPerChannel(const uint8_t* const source, uint8_t* const target, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements);
370 };
371
372 /**
373 * Resizes a given frame with (almost) arbitrary data type (e.g., float, double, int) by using a bilinear interpolation.
374 * This function is a wrapper for scale().
375 * @param source The source frame buffer providing the image information to be resized, must be valid
376 * @param target The target frame buffer receiving the resized image information, must be valid
377 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
378 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
379 * @param targetWidth Width of the target frame in pixel, with range [1, infinity)
380 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
381 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
382 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
383 * @param worker Optional worker object to distribute the computation to several CPU cores, nullptr by default
384 * @tparam T Data type of each pixel channel, e.g., float, double, int
385 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
386 * @see scale<T, tChannels>().
387 */
388 template <typename T, unsigned int tChannels>
389 static inline void resize(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
390
391 /**
392 * Rescales a given frame with arbitrary data type (e.g., float, double, int) by using a bilinear interpolation with user-defined scaling factors.
393 * Beware: This function is not optimized for performance but supports arbitrary data types.<br>
394 * Try to use scale8BitPerChannel() if possible.
395 * @param source The source frame buffer providing the image information to be resized, must be valid
396 * @param target The target frame buffer receiving the rescaled image information, must be valid
397 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
398 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
399 * @param targetWidth Width of the target frame in pixel, with range [1, infinity)
400 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
401 * @param sourceX_s_targetX The horizontal scale factor converting a location in the target frame to a location in the source frame (xSource = sourceX_s_targetX * xTarget, e.g., a factor of 2 maps the target location x to the source location 2 * x), with range (0, sourceWidth/targetWidth]
402 * @param sourceY_s_targetY The vertical scale factor converting a location in the target frame to a location in the source frame (ySource = sourceY_s_targetY * yTarget, e.g., a factor of 2 maps the target location y to the source location 2 * y), with range (0, sourceHeight/targetHeight]
403 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
404 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
405 * @param worker Optional worker object to distribute the computation to several CPU cores, nullptr by default
406 * @tparam T Data type of each pixel channel, e.g., float, double, int
407 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
408 * @see resize<T, tChannels>().
409 */
410 template <typename T, unsigned int tChannels>
411 static inline void scale(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
412
413 /**
414 * Rotates a given frame by a bilinear interpolation.
415 * The frame will be rotated around a specified anchor position (inside or outside the frame).
416 * @param source The source frame to be rotated, must be valid
417 * @param target The target frame which will receive the rotated image, with same frame type as the source frame, must be valid
418 * @param width The width of the source and target frame in pixel, with range [1, infinity)
419 * @param height The height of the source and target frame in pixel, with range [1, infinity)
420 * @param horizontalAnchorPosition Position of the rotation anchor in the horizontal direction, with range (-infinity, infinity)
421 * @param verticalAnchorPosition Position of the rotation anchor in the vertical direction, with range (-infinity, infinity)
422 * @param angle The counter clockwise rotation angle in radian, with range [0, 2PI)
423 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
424 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
425 * @param worker Optional worker object to distribute the computation to several CPU cores
426 * @param borderColor The color of border pixels for which no visual content exists, provide one value for each channel, nullptr to use 0x00 for each channel
427 * @tparam tChannels The number of channels both frames have, with range [1, infinity)
428 */
429 template <unsigned int tChannels>
430 static inline void rotate8BitPerChannel(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr, const uint8_t* borderColor = nullptr);
431
432 /**
433 * Applies an affine transformation to an N-channel frame with 8 bit per channel.
434 * The target frame must have the same pixel format and pixel origin as the source frame, however the dimension (and position) of the target frame can be arbitrary.<br>
435 * This function allows the creation of a target frame fully covering the source frame (if the position and dimension of the target frame cover the result of the affine transformation).<br>
436 * The 'targetOrigin' parameter simply applies an additional translation onto the provided affine transformation, i.e., affine * create_translation_matrix3x3(targetOrigin.x(), targetOrigin.y()).
437 * Please note that here the affine transformation is specified as a 3-by-3 matrix (in contrast to the more commonly used 2-by-3 matrix) and should take the form:
438 * <pre>
439 * a c e
440 * b d f
441 * 0 0 1
442 * </pre>
443 * However, this function disregards the last row completely and only uses the top two rows, i.e., the elements a through f.
444 * @param source Input frame that will be transformed, must be valid
445 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
446 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
447 * @param source_A_target Affine transformation, such that: sourcePoint = source_A_target * targetPoint
448 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
449 * @param target The target frame using the given affine transform, must be valid
450 * @param targetOrigin The origin of the target frame defining the global position of the target frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
451 * @param targetWidth The width of the target image in pixel, with range [1, infinity)
452 * @param targetHeight The height of the target image in pixel, with range [1, infinity)
453 * @param sourcePaddingElements The number of padding elements at the end of each source frame row, in elements, with range [0, infinity)
454 * @param targetPaddingElements The number of padding elements at the end of each target frame row, in elements, with range [0, infinity)
455 * @param worker Optional worker object to distribute the computational load
456 * @tparam tChannels Number of channels of the frame
457 * @see homographyMask8BitPerChannel(), homographyWithCamera8BitPerChannel(), homography().
458 */
459 template <unsigned int tChannels>
460 static inline void affine8BitPerChannel(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3& source_A_target, const uint8_t* borderColor, uint8_t* target, const PixelPositionI& targetOrigin, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
461
462 /**
463 * Transforms an input frame with (almost) arbitrary pixel format into an output frame by application of a homography.
464 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
465 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the result of the homography).<br>
466 * The 'outputOrigin' parameter simply applies an additional translation onto the provided homography, i.e., homography * create_translation_matrix3x3(outputOrigin.x(), outputOrigin.y()).
467 * @param input The input frame that will be transformed, must be valid
468 * @param inputWidth Width of the input frame in pixel, with range [1, infinity)
469 * @param inputHeight Height of the input frame in pixel, with range [1, infinity)
470 * @param input_H_output Homography used to transform the given input frame by the following equation: inputPoint = input_H_output * outputPoint, must be valid
471 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
472 * @param output The output frame using the given homography, must be valid
473 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
474 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
475 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
476 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
477 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
478 * @param worker Optional worker object to distribute the computational load
479 * @tparam T Data type of each pixel channel, e.g., float, double, int
480 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
481 * @see homographyMask8BitPerChannel(), homographyWithCamera8BitPerChannel().
482 */
483 template <typename T, unsigned int tChannels>
484 static inline void homography(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, const T* borderColor, T* output, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker = nullptr);
485
486 /**
487 * Transforms a given 8 bit per channel input frame into an output frame by application of four homographies.
488 * For each quadrant of the output frame an individual homography is applied while the final result is interpolated between the four homographies.<br>
489 * The quadrant order of the homographies is as follows: top left, top right, bottom left, bottom right.<br>
490 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
491 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the result of the homographies).<br>
492 * @param input The input frame that will be transformed
493 * @param inputWidth Width of the input frame in pixel, with range [1, infinity)
494 * @param inputHeight Height of the input frame in pixel, with range [1, infinity)
495 * @param homographies Four homographies used to transform the given input frame, transforming points defined in the output frame into points defined in the input frame
496 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
497 * @param output The output frame using the given homographies
498 * @param outputQuadrantCenter The center position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputWidth)x[0, outputHeight)
499 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
500 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
501 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
502 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
503 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
504 * @param worker Optional worker object to distribute the computational load
505 * @tparam tChannels Number of channels of the frame
506 * @see homographyMask8BitPerChannel(), homographyWithCamera8BitPerChannel().
507 */
508 template <unsigned int tChannels>
509 static inline void homographies8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], const uint8_t* borderColor, uint8_t* output, const Vector2& outputQuadrantCenter, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker = nullptr);
510
511 /**
512 * Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
513 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
514 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the result of the homography).<br>
515 * The 'outputOrigin' parameter simply applies an additional translation onto the provided homography, i.e., homography * create_translation_matrix3x3(outputOrigin.x(), outputOrigin.y()).
516 * @param input The input frame that will be transformed, must be valid
517 * @param inputWidth Width of the input frame in pixel, with range [1, infinity)
518 * @param inputHeight Height of the input frame in pixel, with range [1, infinity)
519 * @param input_H_output Homography used to transform the given input frame by the following equation: inputPoint = input_H_output * outputPoint, must be valid
520 * @param output The output frame using the given homography, must be valid
521 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame, must be valid
522 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
523 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
524 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
525 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue); must be provided explicitly as the default value 0xFF is commented out in the signature
526 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
527 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
528 * @param outputMaskPaddingElements The number of padding elements at the end of each output mask row, in elements, with range [0, infinity)
529 * @param worker Optional worker object to distribute the computational load
530 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
531 * @see homography(), homographyWithCamera8BitPerChannel().
532 */
533 template <unsigned int tChannels>
534 static inline void homographyMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, uint8_t* output, uint8_t* outputMask, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const uint8_t maskValue /* = 0xFF*/, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker = nullptr);
535
536 /**
537 * Transforms a given 8 bit per channel input frame into an output frame by application of four homographies.
538 * For each quadrant of the output frame an individual homography is applied while the final result is interpolated between the four homographies.<br>
539 * The quadrant order of the homographies is as follows: top left, top right, bottom left, bottom right.<br>
540 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
541 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame cover the result of the homographies).<br>
542 * @param input The input frame that will be transformed
543 * @param inputWidth Width of the input frame in pixel, with range [1, infinity)
544 * @param inputHeight Height of the input frame in pixel, with range [1, infinity)
545 * @param homographies Four homographies used to transform the given input frame, transforming points defined in the output frame into points defined in the input frame
546 * @param output The output frame using the given homographies
547 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
548 * @param outputQuadrantCenter The center position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputWidth)x[0, outputHeight)
549 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
550 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
551 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
552 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
553 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
554 * @param outputMaskPaddingElements The number of padding elements at the end of each row of the output mask, in elements, with range [0, infinity)
555 * @param worker Optional worker object to distribute the computational load
556 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
557 * @tparam tChannels Number of channels of the frame
558 * @see homography(), homographyWithCamera8BitPerChannel().
559 */
560 template <unsigned int tChannels>
561 static inline void homographiesMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], uint8_t* output, uint8_t* outputMask, const Vector2& outputQuadrantCenter, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker = nullptr, const uint8_t maskValue = 0xFF);
562
563 /**
564 * Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
565 * This function also uses a camera profile to improve the interpolation accuracy.<br>
566 * The given homography is transformed into a homography for normalized image coordinates.<br>
567 * Thus, the distortion parameters of the camera profile can be applied as well.<br>
568 * @param inputCamera The pinhole camera profile to be applied for the input frame
569 * @param outputCamera The pinhole camera profile to be applied for the output frame
570 * @param input The input frame that will be transformed
571 * @param homography The homography used to transform the given input frame by the following equation: inputPoint = homography * outputPoint
572 * @param useDistortionParameters True, to apply the distortion parameters of the camera profile
573 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign 0x00 to each channel
574 * @param output The output frame using the given homography
575 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
576 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
577 * @param worker Optional worker object to distribute the computational load
578 * @tparam tChannels Number of channels of the frame
579 * @see homography().
580 */
581 template <unsigned int tChannels>
582 static inline void homographyWithCamera8BitPerChannel(const PinholeCamera& inputCamera, const PinholeCamera& outputCamera, const uint8_t* input, const SquareMatrix3& homography, const bool useDistortionParameters, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker = nullptr);
583
584 /**
585 * Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
586 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame.<br>
587 * This function also uses a camera profile to improve the interpolation accuracy.<br>
588 * The given homography is transformed into a homography for normalized image coordinates.<br>
589 * Thus, the distortion parameters of the camera profile can be applied as well.
590 * @param inputCamera The pinhole camera profile to be applied for the input frame, must be valid
591 * @param outputCamera The pinhole camera profile to be applied for the output frame, must be valid
592 * @param input The input frame that will be transformed, must be valid
593 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
594 * @param homography The homography used to transform the given input frame by the following equation: inputPoint = homography * outputPoint
595 * @param output The output frame using the given homography
596 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
597 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
598 * @param outputMaskPaddingElements The number of padding elements at the end of each output mask row, in elements, with range [0, infinity)
599 * @param worker Optional worker object to distribute the computational load
600 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
601 * @tparam tChannels Number of channels of the frame
602 */
603 template <unsigned int tChannels>
604 static inline void homographyWithCameraMask8BitPerChannel(const PinholeCamera& inputCamera, const PinholeCamera& outputCamera, const uint8_t* input, const unsigned int inputPaddingElements, const SquareMatrix3& homography, uint8_t* output, uint8_t* outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker = nullptr, const uint8_t maskValue = 0xFF);
605
606 /**
607 * Transforms a given input frame into an output frame by application of an interpolation lookup table.
608 * The frame must have a 1-plane pixel format.<br>
609 * The output frame must have the same pixel format and pixel origin as the input frame.
610 * @param input The input frame which will be transformed, must be valid
611 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
612 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
613 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
614 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
615 * @param borderColor Color of undefined pixel positions, the size of the buffer must match the number of channels, nullptr to assign T(0) to each channel
616 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table, must be valid
617 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
618 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
619 * @param worker Optional worker object to distribute the computation
    * @param useOptimizedNEON Not described by the original documentation, false by default; presumably enables a NEON-optimized implementation - TODO confirm against the function definition
    * @param useOptimizedBilinearValuesAndFactorCalculation Not described by the original documentation, false by default; presumably selects an optimized calculation of the bilinear values and interpolation factors - TODO confirm against the function definition
    * @param useOptimizedNEONFactorReplication Not described by the original documentation, false by default; presumably selects an optimized NEON replication of the interpolation factors - TODO confirm against the function definition
620 * @tparam T Data type of each pixel channel, e.g., float, double, int
621 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
622 */
623 template <typename T, unsigned int tChannels>
624 static inline void lookup(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable& input_LT_output, const bool offset, const T* borderColor, T* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker = nullptr, const bool useOptimizedNEON = false, const bool useOptimizedBilinearValuesAndFactorCalculation = false, const bool useOptimizedNEONFactorReplication = false);
625
626 /**
627 * Transforms a given input frame into an output frame by application of an interpolation lookup table.
628 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).<br>
629 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
630 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame, further these pixels are untouched in the output frame.<br>
631 * @param input The input frame which will be transformed
632 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
633 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
634 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
635 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
636 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table
637 * @param outputMask Resulting mask frame with 8 bits per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
638 * @param inputPaddingElements The number of padding elements at the end of each row of `input`, in elements, with range [0, infinity)
639 * @param outputPaddingElements The number of padding elements at the end of each row of `output`, in elements, with range [0, infinity)
640 * @param outputMaskPaddingElements The number of padding elements at the end of each row of `outputMask`, in elements, with range [0, infinity)
641 * @param worker Optional worker object to distribute the computation
642 * @param maskValue 8 bit mask values for pixels lying inside the input frame, pixels lying outside the input frame will be assigned with (0xFF - maskValue)
643 * @tparam tChannels Number of channels of the frame
644 */
645 template <unsigned int tChannels>
646 static inline void lookupMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable& input_LT_output, const bool offset, uint8_t* output, uint8_t* outputMask, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker = nullptr, const uint8_t maskValue = 0xFF);
647
648 /**
649 * Re-samples a camera image which has been captured with a camera profile as if the image would have been captured with a second camera profile.
650 * The function can be used e.g., to rectify a fisheye camera image into a pinhole camera image.
651 * @param sourceFrame The source image captured with the source camera profile, must be valid
652 * @param sourceCamera The source camera profile which has been used to capture the source image, with resolution sourceFrame.width() x sourceFrame.height(), must be valid
653 * @param source_R_target The rotation transforming 3D points defined in the coordinate system of the target camera image to 3D points defined in the coordinate system of the source camera image, must be valid
654 * @param targetCamera The camera profile of the target frame, must be valid
655 * @param targetFrame The resulting target image, with resolution targetCamera.width() x targetCamera.height(), must be valid
656 * @param sourceFramePaddingElements The number of padding elements at the end of each source frame row, in elements, with range [0, infinity)
657 * @param targetFramePaddingElements The number of padding elements at the end of each target frame row, in elements, with range [0, infinity)
658 * @param source_OLT_target Optional resulting offset lookup table between target image points and source image points, nullptr if not of interest
659 * @param worker Optional worker object to distribute the computational load
660 * @param binSizeInPixel The size in pixel of the interpolation bins used for building the lookup table, with range [1, infinity)
661 * @param borderColor The color of border pixels for which no visual content exists, provide one value for each channel, nullptr to use T(0) for each channel
662 * @tparam T Data type of each pixel channel, e.g., uint8_t, int16_t, float, double
663 * @tparam tChannels The number of frame channels, with range [1, infinity)
664 * @see Comfort::resampleCameraImage().
665 */
666 template <typename T, unsigned int tChannels>
667 static void resampleCameraImage(const T* sourceFrame, const AnyCamera& sourceCamera, const SquareMatrix3& source_R_target, const AnyCamera& targetCamera, T* targetFrame, const unsigned int sourceFramePaddingElements, const unsigned int targetFramePaddingElements, LookupCorner2<Vector2>* source_OLT_target = nullptr, Worker* worker = nullptr, const unsigned int binSizeInPixel = 8u, const T* borderColor = nullptr);
668
669 /**
670 * Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame.
671 * This function uses an integer interpolation with a precision of 1/128.
672 * @param frame The frame to determine the pixel values from, must be valid
673 * @param width The width of the frame in pixel, with range [1, infinity)
674 * @param height The height of the frame in pixel, with range [1, infinity)
675 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
676 * @param position The position for which the interpolated pixel will be determined, with ranges [0, width - 1]x[0, height - 1] for PC_TOP_LEFT, [0, width]x[0, height] for PC_CENTER
677 * @param result Resulting interpolated pixel values, must be valid
678 * @tparam tChannels Number of channels of the given frame, with range [1, infinity)
679 * @tparam tPixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER', default is 'PC_TOP_LEFT'
680 * @tparam TScalar The scalar data type of the sub-pixel position, default is Scalar
681 * @see interpolatePixel().
682 */
683 template <unsigned int tChannels, PixelCenter tPixelCenter = PC_TOP_LEFT, typename TScalar = Scalar>
684 static inline void interpolatePixel8BitPerChannel(const uint8_t* frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2<TScalar>& position, uint8_t* result);
685
686 /**
687 * Determines the interpolated pixel values for a given pixel position in a frame with arbitrary data type.
688 * This function uses floating point precision during interpolation.
689 * @param frame The frame to determine the pixel values from, must be valid
690 * @param width The width of the frame in pixel, with range [1, infinity)
691 * @param height The height of the frame in pixel, with range [1, infinity)
692 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
693 * @param position The position for which the interpolated pixel will be determined, with ranges [0, width - 1]x[0, height - 1] for PC_TOP_LEFT, [0, width]x[0, height] for PC_CENTER
694 * @param result Resulting interpolated pixel value(s), must be valid
695 * @param resultBias Optional bias value which will be added to the interpolation result, e.g., to handle rounding, with range (-infinity, infinity), default is zero
696 * @tparam TSource The data type of the provided pixel values in the (source) frame
697 * @tparam TTarget The data type of the resulting interpolated value(s)
698 * @tparam tChannels Number of channels of the given frame, with range [1, infinity)
699 * @tparam tPixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER', default is 'PC_TOP_LEFT'
700 * @tparam TScalar The data type of each coordinate of the provided interpolation location, should be either Scalar, float, or double, default is Scalar
701 * @tparam TIntermediate The data type of the intermediate interpolation result before assigning the result, default is TScalar
702 * @see interpolatePixel8BitPerChannel().
703 */
704 template <typename TSource, typename TTarget, unsigned int tChannels, PixelCenter tPixelCenter = PC_TOP_LEFT, typename TScalar = Scalar, typename TIntermediate = TScalar>
705 static inline void interpolatePixel(const TSource* frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2<TScalar>& position, TTarget* result, const TIntermediate& resultBias = TIntermediate(0));
706
707 /**
708 * Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame with alpha channel.
709 * The center of each pixel is located with an offset of (0.5 x 0.5) in relation to the real pixel position.<br>
710 * The given frame is virtually extended by a fully transparent border so that this function supports arbitrary interpolation positions.<br>
711 * If the given position lies inside the frame area of (-0.5, -0.5) -> (width + 0.5, height + 0.5), the interpolation result will contain color information of the frame; otherwise, a fully transparent interpolation result is provided.<br>
712 * @param frame The frame to determine the pixel values from, must be valid
713 * @param width The width of the frame in pixel, with range [1, infinity)
714 * @param height The height of the frame in pixel, with range [1, infinity)
715 * @param position The position to determine the interpolated pixel values for, with range (-infinity, infinity)x(-infinity, infinity)
716 * @param result Resulting pixel values, must be valid
717 * @param framePaddingElements The number of padding elements at the end of each frame row, in elements, with range [0, infinity)
718 * @tparam tChannels Number of channels of the given frame, with range [1, infinity)
719 * @tparam tAlphaAtFront True, if the alpha channel is in the front of the data channels
720 * @tparam tTransparentIs0xFF True, if 0xFF is interpreted as fully transparent
721 */
722 template <unsigned int tChannels, bool tAlphaAtFront, bool tTransparentIs0xFF>
723 static inline void interpolate1PixelFullAlphaBorder8BitPerChannel(const uint8_t* frame, const unsigned int width, const unsigned int height, const Vector2& position, uint8_t* result, const unsigned int framePaddingElements);
724
725 /**
726 * Interpolates the sum of intensity values of an image patch in a frame, while the frame is provided as lined integral frame.
727 * @param linedIntegralFrame The lined integral image created from the actual gray-scale image for which the patch intensity sum will be determined, must be valid
728 * @param frameWidth Width of the original frame in pixel (not the width of the lined-integral frame), with range [1, infinity)
729 * @param frameHeight Height of the original frame in pixel (not the height of the lined-integral frame), with range [1, infinity)
730 * @param lineIntegralFramePaddingElements The number of padding elements at the end of each integral image row, in elements, with range [0, infinity)
731 * @param center 2D coordinates of the center point of the patch, with range [patchWidth/2, frameWidth - patchWidth/2)x[patchHeight/2, frameHeight - patchHeight/2) for PC_CENTER
732 * @param pixelCenter The pixel center to be used during interpolation, either 'PC_TOP_LEFT' or 'PC_CENTER'
733 * @param patchWidth Width of the calculated patch in pixel, with range [1, frameWidth - 1]
734 * @param patchHeight Height of the calculated patch in pixel, with range [1, frameHeight - 1]
735 * @return The resulting sum of the pixel intensities
736 */
737 static Scalar patchIntensitySum1Channel(const uint32_t* linedIntegralFrame, const unsigned int frameWidth, const unsigned int frameHeight, const unsigned int lineIntegralFramePaddingElements, const Vector2& center, const CV::PixelCenter pixelCenter, const unsigned int patchWidth, const unsigned int patchHeight);
738
739 /**
740 * Checks whether the application of a given homography for a specified input frame and output frame covers the entire image information (which is necessary for the bilinear interpolation) or whether the homography relies on missing image information.
741 * @param inputWidth The width of the input frame in pixel, with range [1, infinity)
742 * @param inputHeight The height of the input frame in pixel, with range [1, infinity)
743 * @param outputWidth The width of the output frame in pixel, with range [1, infinity)
744 * @param outputHeight The height of the output frame in pixel, with range [1, infinity)
745 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
746 * @param outputOriginX The horizontal origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity), default is 0
747 * @param outputOriginY The vertical origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity), default is 0
748 * @return True, if the homography covers the entire input image information (if all output pixels will receive valid data from the input frame); False, otherwise
749 */
750 static bool coversHomographyInputFrame(const unsigned int inputWidth, const unsigned int inputHeight, const unsigned int outputWidth, const unsigned int outputHeight, const SquareMatrix3& input_H_output, const int outputOriginX = 0, const int outputOriginY = 0);
751
752 private:
753
754 /**
755 * Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
756 * The output frame must have the same pixel format and pixel origin as the input frame, however the dimension (and position) of the output frame can be arbitrary.<br>
757 * This function allows the creation of an output frame fully covering the input frame (if the position and dimension of the output frame covers the transformation of the homography).<br>
758 * The 'outputOrigin' parameter simply applies an additional translation onto the provided homography i.e., homography * create_translation_matrix3x3(outputOrigin.x(), outputOrigin.y()).
759 * @param input The input frame that will be transformed, must be valid
760 * @param inputWidth Width of the input frame in pixel, with range [1, infinity)
761 * @param inputHeight Height of the input frame in pixel, with range [1, infinity)
762 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
763 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
764 * @param output The output frame using the given homography, must be valid
765 * @param outputOrigin The origin of the output frame defining the global position of the output frame's pixel coordinate (0, 0), with range (-infinity, infinity)x(-infinity, infinity)
766 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
767 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
768 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
769 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
770 * @param worker Optional worker object to distribute the computational load
771 * @tparam tChannels Number of channels of the frame
772 * @see homographyMask8BitPerChannel(), homographyWithCamera8BitPerChannel(), homography().
773 */
774 template <unsigned int tChannels>
775 static inline void homography8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, const uint8_t* borderColor, uint8_t* output, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker = nullptr);
776
777 /**
778 * Rescales a given frame with 8 bit per data channel by using a bilinear interpolation with user-defined scaling factors.
779 * The frame must have a 1-plane pixel format with DT_UNSIGNED_INTEGER_8 as data type (e.g., FORMAT_Y8, FORMAT_RGB24, FORMAT_RGBA32, ...).<br>
780 * Information: This function is the equivalent to OpenCV's cv::resize().
781 * @param source The source frame buffer providing the image information to be resized, must be valid
782 * @param target The target frame buffer receiving the rescaled image information, must be valid
783 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
784 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
785 * @param targetWidth Width of the target frame in pixel, with range [1, infinity)
786 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
787 * @param sourceX_s_targetX The horizontal scale factor converting a location in the target frame to a location in the source frame (xSource = sourceX_s_targetX * xTarget), with range (0, sourceWidth/targetWidth]
788 * @param sourceY_s_targetY The vertical scale factor converting a location in the target frame to a location in the source frame (ySource = sourceY_s_targetY * yTarget), with range (0, sourceHeight/targetHeight]
789 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
790 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
791 * @param worker Optional worker object to distribute the computation to several CPU cores, nullptr by default
792 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
793 */
794 template <unsigned int tChannels>
795 static inline void scale8BitPerChannel(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker = nullptr);
796
797 /**
798 * Resizes a subset of a given frame with 8 bit per channel by a bilinear interpolation.
799 * @param source The image data of the source frame to be resized, must be valid
800 * @param target The target frame buffer receiving the interpolated (resized) source frame, must be valid
801 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
802 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
803 * @param targetWidth Width of the target frame in pixel, with range [1, infinity)
804 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
805 * @param sourceX_s_targetX The horizontal scale factor converting a location in the target frame to a location in the source frame (xSource = sourceX_s_targetX * xTarget), with range (0, sourceWidth/targetWidth]
806 * @param sourceY_s_targetY The vertical scale factor converting a location in the target frame to a location in the source frame (ySource = sourceY_s_targetY * yTarget), with range (0, sourceHeight/targetHeight]
807 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
808 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
809 * @param firstTargetRow The first target row to be handled, with range [0, targetHeight)
810 * @param numberTargetRows The number of target rows to be handled, with range [1, targetHeight - firstTargetRow]
811 * @tparam tChannels Number of frame channels, with range [1, infinity)
812 */
813 template <unsigned int tChannels>
814 static void scale8BitPerChannelSubset(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows);
815
816 /**
817 * Applies a (horizontal) linear interpolation for one row with 8 bit per channel.
818 * This function uses interpolation factors with 7 bit precision and does not apply any SIMD instructions.
819 * @param extendedSourceRow The source row extended with a copy of the last pixel for which the interpolation will be applied, must be valid
820 * @param targetRow The target row receiving the interpolation result, must be valid
821 * @param targetWidth The width of the target row in pixel, with range [8, infinity)
822 * @param channels The number of frame channels, must be identical with 'tChannels', possible values are 1, 4
823 * @param interpolationLocations The successive locations within the source row defining the location of the left pixels to be interpolated (specified in elements !not! in pixels - e.g., interpolationLocations[0] = firstInterpolationPixel * channels), one for each target pixel, with range [0, (targetWidth - 1) * channels]
824 * @param interpolationFactors The two successive (left !and! right) interpolation factors for each left and right source pixel, with range [0, 128]
825 * @see interpolateRowHorizontal8BitPerChannel7BitPrecisionNEON<tChannels>().
826 */
827 static void interpolateRowHorizontal8BitPerChannel7BitPrecision(const uint8_t* extendedSourceRow, uint8_t* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const uint8_t* interpolationFactors);
828
829 /**
830 * Applies a (vertical) linear interpolation between two rows with arbitrary data types.
831 * This function does not apply any SIMD instructions.<br>
832 * The length of both source rows is identical with the length of the target row.
833 * @param sourceRowTop The top source row to be used for interpolation, must be valid
834 * @param sourceRowBottom The bottom source row to be used for interpolation, must be valid
835 * @param targetRow The target row receiving the interpolation result, must be valid
836 * @param elements The number of elements in the row (width * channels), with range [1, infinity)
837 * @param factorBottom The interpolation factor for all elements of the bottom row, with factorTop = 1 - factorBottom, with range [0, 1]
838 * @tparam T The data type of each element, should be 'float'
839 */
840 template <typename T>
841 static void interpolateRowVertical(const T* sourceRowTop, const T* sourceRowBottom, T* targetRow, const unsigned int elements, const float factorBottom);
842
843 /**
844 * Applies a (horizontal) linear interpolation for one row with arbitrary data type.
845 * This function does not apply any SIMD instructions.
846 * @param extendedSourceRow The source row extended with a copy of the last pixel for which the interpolation will be applied, must be valid
847 * @param targetRow The target row receiving the interpolation result, must be valid
848 * @param targetWidth The width of the target row in pixel, with range [8, infinity)
849 * @param channels The number of frame channels, must be identical with 'tChannels', possible values are 1, 4
850 * @param interpolationLocations The successive locations within the source row defining the location of the left pixels to be interpolated (specified in elements !not! in pixels - e.g., interpolationLocations[0] = firstInterpolationPixel * channels), one for each target pixel, with range [0, (targetWidth - 1) * channels]
851 * @param interpolationFactorsRight The right interpolation factors for each right source pixel, with range [0, 1]
852 * @tparam T The data type of each element, should be 'float'
853 * @tparam tChannels The number of frame channels this function can handle, should be 1
854 * @see interpolateRowHorizontal8BitPerChannel7BitPrecision().
855 */
856 template <typename T, unsigned int tChannels>
857 static void interpolateRowHorizontal(const T* extendedSourceRow, T* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const float* interpolationFactorsRight);
858
859#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
860
861 /**
862 * Applies a (vertical) linear interpolation between two rows with 8 bit per channel.
863 * This function applies NEON instructions and uses interpolation factors with 7 bit precision.<br>
864 * The length of both source rows is identical with the length of the target row.
865 * @param sourceRowTop The top source row to be used for interpolation, must be valid
866 * @param sourceRowBottom The bottom source row to be used for interpolation, must be valid
867 * @param targetRow The target row receiving the interpolation result, must be valid
868 * @param elements The number of elements in the row (width * channels), with range [16, infinity)
869 * @param factorBottom The interpolation factor for all elements of the bottom row, with factorTop = 128 - factorBottom, with range [0, 128]
870 */
871 static void interpolateRowVertical8BitPerChannel7BitPrecisionNEON(const uint8_t* sourceRowTop, const uint8_t* sourceRowBottom, uint8_t* targetRow, const unsigned int elements, const unsigned int factorBottom);
872
873 /**
874 * Applies a (vertical) linear interpolation between two rows with arbitrary data types.
875 * This function applies NEON instructions.<br>
876 * The length of both source rows is identical with the length of the target row.
877 * @param sourceRowTop The top source row to be used for interpolation, must be valid
878 * @param sourceRowBottom The bottom source row to be used for interpolation, must be valid
879 * @param targetRow The target row receiving the interpolation result, must be valid
880 * @param elements The number of elements in the row (width * channels), with range [16, infinity)
881 * @param factorBottom The interpolation factor for all elements of the bottom row, with factorTop = 1 - factorBottom, with range [0, 1]
882 * @tparam T The data type of each element, should be 'float'
883 */
884 template <typename T>
885 static void interpolateRowVerticalNEON(const T* sourceRowTop, const T* sourceRowBottom, T* targetRow, const unsigned int elements, const float factorBottom);
886
887 /**
888 * Applies a (horizontal) linear interpolation for one row with 8 bit per channel.
889 * This function applies NEON instructions and uses interpolation factors with 7 bit precision.
890 * @param extendedSourceRow The source row extended with a copy of the last pixel for which the interpolation will be applied, must be valid
891 * @param targetRow The target row receiving the interpolation result, must be valid
892 * @param targetWidth The width of the target row in pixel, with range [8, infinity)
893 * @param channels The number of frame channels, must be identical with 'tChannels', possible values are 1, 4
894 * @param interpolationLocations The successive locations within the source row defining the location of the left pixels to be interpolated (specified in elements !not! in pixels - e.g., interpolationLocations[0] = firstInterpolationPixel * channels), one for each target pixel, with range [0, (targetWidth - 1) * channels]
895 * @param interpolationFactors The two successive (left !and! right) interpolation factors for each left and right source pixel, with range [0, 128]
896 * @tparam tChannels The number of frame channels this function can handle, possible values are 1, 4
897 * @see interpolateRowHorizontal8BitPerChannel7BitPrecision().
898 */
899 template <unsigned int tChannels>
900 static void interpolateRowHorizontal8BitPerChannel7BitPrecisionNEON(const uint8_t* extendedSourceRow, uint8_t* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const uint8_t* interpolationFactors);
901
902 /**
903 * Applies a (horizontal) linear interpolation for one row with arbitrary data type.
904 * This function applies NEON instructions.
905 * @param extendedSourceRow The source row extended with a copy of the last pixel for which the interpolation will be applied, must be valid
906 * @param targetRow The target row receiving the interpolation result, must be valid
907 * @param targetWidth The width of the target row in pixel, with range [8, infinity)
908 * @param channels The number of frame channels, must be identical with 'tChannels', possible values are 1, 4
909 * @param interpolationLocations The successive locations within the source row defining the location of the left pixels to be interpolated (specified in elements !not! in pixels - e.g., interpolationLocations[0] = firstInterpolationPixel * channels), one for each target pixel, with range [0, (targetWidth - 1) * channels]
910 * @param interpolationFactorsRight The right interpolation factors for each right source pixel, with range [0, 1]
911 * @tparam T The data type of each element, should be 'float'
912 * @tparam tChannels The number of frame channels this function can handle, should be 1
913 * @see interpolateRowHorizontal8BitPerChannel7BitPrecision().
914 */
915 template <typename T, unsigned int tChannels>
916 static void interpolateRowHorizontalNEON(const T* extendedSourceRow, T* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const float* interpolationFactorsRight);
917
918 /**
919 * Rescales a subset of a given frame with 8 bit per channel by a bilinear interpolation.
920 * This function applies NEON instructions and uses interpolation factors with 7 bit precision.
921 * @param source The image data of the source frame to be resized, must be valid
922 * @param target The target frame buffer receiving the interpolated (resized) source frame, must be valid
923 * @param sourceWidth Width of the source frame in pixel, with range [2, 65535]
924 * @param sourceHeight Height of the source frame in pixel, with range [1, 65535]
925 * @param targetWidth Width of the target frame in pixel, with range [tMinimalTargetWidth, 65535]
926 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
927 * @param channels The number of channels both frames have, with range [1, infinity)
928 * @param sourceX_s_targetX The horizontal scale factor converting a location in the target frame to a location in the source frame (xSource = sourceX_s_targetX * xTarget), with range (0, sourceWidth/targetWidth]
929 * @param sourceY_s_targetY The vertical scale factor converting a location in the target frame to a location in the source frame (ySource = sourceY_s_targetY * yTarget), with range (0, sourceHeight/targetHeight]
930 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
931 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
932 * @param firstTargetRow The first target row to be handled, with range [0, targetHeight)
933 * @param numberTargetRows The number of target rows to be handled, with range [1, targetHeight - firstTargetRow]
934 * @see interpolateRowVertical8BitPerChannel7BitPrecisionNEON(), interpolateRowHorizontal8BitPerChannel7BitPrecisionNEON().
935 */
936 static void scale8BitPerChannelSubset7BitPrecisionNEON(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int channels, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows);
937
938#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
939
940 /**
941 * Resizes a subset of a given frame with arbitrary data type by a bilinear interpolation.
942 * @param source The image data of the source frame to be resized, must be valid
943 * @param target The target frame buffer receiving the interpolated (resized) source frame, must be valid
944 * @param sourceWidth Width of the source frame in pixel, with range [1, infinity)
945 * @param sourceHeight Height of the source frame in pixel, with range [1, infinity)
946 * @param targetWidth Width of the target frame in pixel, with range [1, infinity)
947 * @param targetHeight Height of the target frame in pixel, with range [1, infinity)
948 * @param sourceX_s_targetX The horizontal scale factor converting a location in the target frame to a location in the source frame (xSource = sourceX_s_targetX * xTarget), with range (0, sourceWidth/targetWidth]
949 * @param sourceY_s_targetY The vertical scale factor converting a location in the target frame to a location in the source frame (ySource = sourceY_s_targetY * yTarget), with range (0, sourceHeight/targetHeight]
950 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
951 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
952 * @param firstTargetRow The first target row to be handled, with range [0, targetHeight)
953 * @param numberTargetRows The number of target rows to be handled, with range [1, targetHeight - firstTargetRow]
954 * @tparam T The data type of each pixel channel, e.g., float, double, int, short, ...
955 * @tparam TScale The data type of the internal scaling factors to be used, should be 'float' or 'double'
956 * @tparam tChannels Number of frame channels, with range [1, infinity)
957 */
958 template <typename T, typename TScale, unsigned int tChannels>
959 static void scaleSubset(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows);
960
961 /**
962 * Rotates a subset of a given frame by a bilinear interpolation.
963 * @param source The source frame to be rotated, must be valid
964 * @param target The target frame which will receive the rotated image, with same frame type as the source frame, must be valid
965 * @param width The width of the source and target frame in pixel, with range [1, infinity)
966 * @param height The height of the source and target frame in pixel, with range [1, infinity)
967 * @param horizontalAnchorPosition Position of the rotation anchor in the horizontal direction, with range (-infinity, infinity)
968 * @param verticalAnchorPosition Position of the rotation anchor in the vertical direction, with range (-infinity, infinity)
969 * @param angle The counter clockwise rotation angle in radian, with range [0, 2PI)
970 * @param borderColor The color of border pixels for which no visual content exists, provide one value for each channel, nullptr to use 0x00 for each channel
971 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
972 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
973 * @param firstTargetRow The first row of the target frame to be handled, with range [0, height)
974 * @param numberTargetRows The number of rows in the target frame to be handled, with range [1, height - firstTargetRow]
975 * @tparam tChannels Number of frame channels, with range [1, infinity)
976 */
977 template <unsigned int tChannels>
978 static void rotate8BitPerChannelSubset(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const uint8_t* borderColor, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows);
979
980 /**
981 * Subset function to apply an affine transform to an N-channel, 8-bit unsigned image.
982 * The affine transform must be provided in the following form: `sourcePoint = source_A_target * targetPoint`
983 * This function does not apply SIMD instructions and can be used for any frame dimensions.
984 * Please note that here the affine transformation is specified as a 3-by-3 matrix (in contrast to the more commonly used 2-by-3 matrix) and should be of the form:
985 * <pre>
986 * a c e
987 * b d f
988 * 0 0 1
989 * </pre>
990 * However, this function disregards the last row completely and only uses the top two rows, i.e., the elements a through f.
991 * @param source Input frame that will be transformed
992 * @param sourceWidth Width of the source image in pixel, with range [1, infinity)
993 * @param sourceHeight Height of the source image in pixel, with range [1, infinity)
994 * @param source_A_target Affine transformation which is applied to the source frame.
995 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
996 * @param target Output frame using the given affine transform
997 * @param targetWidth The width of the target image in pixel, with range [1, infinity)
998 * @param targetHeight The height of the target image in pixel, with range [1, infinity)
999 * @param firstTargetRow The first target row to be handled
1000 * @param numberTargetRows Number of target rows to be handled
1001 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
1002 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1003 * @tparam tChannels Number of frame channels, with range [1, infinity)
1004 * @see affine8BitPerChannelSSESubset(), affine8BitPerChannelNEONSubset()
1005 */
1006 template <unsigned int tChannels>
1007 static inline void affine8BitPerChannelSubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements);
1008
1009 /**
1010 * Transforms an 8 bit per channel frame using the given homography.
1011 * The homography must provide the following transformation: inputPoint = input_H_output * outputPoint
1012 * This function does not apply SIMD instructions and can be used for any frame dimensions.
1013 * @param input The input frame that will be transformed
1014 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1015 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1016 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
1017 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1018 * @param output The output frame using the given homography
1019 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
1020 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1021 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
1022 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
1023 * @param firstOutputRow The first output row to be handled, with range [0, outputHeight - 1]
1024 * @param numberOutputRows Number of output rows to be handled, with range [1, outputHeight - firstOutputRow]
1025 * @tparam tChannels Number of frame channels, with range [1, infinity)
1026 * @see homography8BitPerChannelSSESubset(), homography8BitPerChannelNEONSubset()
1027 */
1028 template <unsigned int tChannels>
1029 static inline void homography8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1030
1031 /**
1032 * Transforms a frame with (almost) arbitrary pixel format using the given homography.
1033 * This function does not apply SIMD instructions and can be used for any frame dimensions.
1034 * @param input The input frame that will be transformed
1035 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1036 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1037 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
1038 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1039 * @param output The output frame using the given homography
1040 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
1041 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1042 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
1043 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
1044 * @param firstOutputRow The first output row to be handled, with range [0, outputHeight - 1]
1045 * @param numberOutputRows Number of output rows to be handled, with range [1, outputHeight - firstOutputRow]
1046 * @tparam T Data type of each pixel channel, e.g., float, double, int
1047 * @tparam tChannels Number of frame channels, with range [1, infinity)
1048 * @see homography8BitPerChannelSSESubset().
1049 */
1050 template <typename T, unsigned int tChannels>
1051 static inline void homographySubset(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const T* borderColor, T* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1052
1053#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1054
1055 /**
1056 * Subset function to apply an affine transform to an N-channel, 8-bit unsigned image (using SSE).
1057 * This function applies SSE instructions.<br>
1058 * Beware: The target width 'targetWidth' must be >= 4, use affine8BitPerChannelSubset for small target frames
1059 * This function has the property: sourcePoint = source_A_target * targetPoint
1060 * Please note that here the affine transformation is specified as a 3-by-3 matrix (in contrast to the more commonly used 2-by-3 matrix) and should take the form:
1061 * <pre>
1062 * a c e
1063 * b d f
1064 * 0 0 1
1065 * </pre>
1066 * However, this function disregards the last row completely and only uses the top two rows, i.e., the elements a through f.
1067 * @param source Input frame that will be transformed
1068 * @param sourceWidth Width of the source image in pixel, with range [1, infinity)
1069 * @param sourceHeight Height of the source image in pixel, with range [1, infinity)
1070 * @param source_A_target Affine transformation mapping target points to source points, see the equation above
1071 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1072 * @param target The target frame where the result of the transformation will be stored
1073 * @param targetWidth The width of the target image in pixel, with range [4, infinity)
1074 * @param targetHeight The height of the target image in pixel, with range [1, infinity)
1075 * @param firstTargetRow The first target row to be handled
1076 * @param numberTargetRows Number of target rows to be handled
1077 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
1078 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1079 * @tparam tChannels Number of frame channels
1080 * @see affine8BitPerChannelSubset(), affine8BitPerChannelNEONSubset().
1081 */
1082 template <unsigned int tChannels>
1083 static inline void affine8BitPerChannelSSESubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements);
1084
1085 /**
1086 * Transforms an 8 bit per channel frame using the given homography.
1087 * This function applies SSE instructions.<br>
1088 * Beware: The output width 'outputWidth' must be >= 4, use homography8BitPerChannelSubset for small output frames
1089 * @param input The input frame that will be transformed, must be valid
1090 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1091 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1092 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
1093 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1094 * @param output The output frame using the given homography, must be valid
1095 * @param outputWidth The width of the output image in pixel, with range [4, infinity)
1096 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1097 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
1098 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
1099 * @param firstOutputRow The first output row to be handled, with range [0, outputHeight - 1]
1100 * @param numberOutputRows Number of output rows to be handled, with range [1, outputHeight - firstOutputRow]
1101 * @tparam tChannels Number of frame channels, with range [1, infinity)
1102 * @see homography8BitPerChannelSubset().
1103 */
1104 template <unsigned int tChannels>
1105 static inline void homography8BitPerChannelSSESubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1106
1107 /**
1108 * Interpolates 4 independent pixels concurrently based on already known locations (top-left, top-right, bottom-left, and bottom-right) and interpolation factors for the source pixels.
1109 * This function also supports interpolating the pixel values for a subset of the four pixels only, valid pixels will be interpolated, invalid pixels receive a defined border color.
1110 * @param source The source image in which the four independent pixels are located, must be valid
1111 * @param offsetsTopLeft The four offsets within the source image for the four top-left pixels used for the interpolation, with range [0, (width * tChannels + sourcePaddingElements) * (height - 1) + width * tChannels), or the corresponding validPixel information is 0x00000000
1112 * @param offsetsTopRight The four offsets within the source image for the four top-right pixels used for the interpolation, with range [0, (width * tChannels + sourcePaddingElements) * (height - 1) + width * tChannels), or the corresponding validPixel information is 0x00000000
1113 * @param offsetsBottomLeft The four offsets within the source image for the four bottom-left pixels used for the interpolation, with range [0, (width * tChannels + sourcePaddingElements) * (height - 1) + width * tChannels), or the corresponding validPixel information is 0x00000000
1114 * @param offsetsBottomRight The four offsets within the source image for the four bottom-right pixels used for the interpolation, with range [0, (width * tChannels + sourcePaddingElements) * (height - 1) + width * tChannels), or the corresponding validPixel information is 0x00000000
1115 * @param validPixels Four boolean states specifying which of the given four pixels will be interpolated and which ones will receive the defined border color, 0x00000000 for invalid pixels, everything else for valid pixels (e.g., 0xFFFFFFFF)
1116 * @param borderColor The border color that will be assigned to each resulting pixel that is invalid (for which the corresponding validPixel information is 0x00000000)
1117 * @param m128_factorsRight The horizontal interpolation factors for right pixels, with range [0, 128], 128 to use the color information of the right pixels only, 0 to use the color information of the left pixels only
1118 * @param m128_factorsBottom The vertical interpolation factors for bottom pixels, with range [0, 128], 128 to use the color information of the bottom pixels only, 0 to use the color information of the top pixels only
1119 * @param targetPositionPixels The buffer that will receive the interpolated color values, must be valid
1120 * @tparam tChannels The number of frame channels, with range [1, infinity)
1121 */
1122 template <unsigned int tChannels>
1123 static OCEAN_FORCE_INLINE void interpolate4Pixels8BitPerChannelSSE(const uint8_t* source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const typename DataType<uint8_t, tChannels>::Type& borderColor, const __m128i& m128_factorsRight, const __m128i& m128_factorsBottom, typename DataType<uint8_t, tChannels>::Type* targetPositionPixels);
1124
1125 /**
1126 * Interpolates 4 independent pixels concurrently based on already gathered pixel values (top-left, top-right, bottom-left, and bottom-right) and interpolation factors for the source pixels.
1127 * This function also supports to interpolate the pixel values for a subset of the four pixels only, valid pixels will be interpolated, invalid pixels receive a defined border color. NOTE(review): this overload has neither a validPixels nor a borderColor parameter, so this sentence may be a leftover from the offset-based overload - confirm.
1128 * @param m128_sourcesTopLeft The pixel values of the four top left pixels, starting at the first byte, may contain unused bytes at the end, e.g., RGBARGBARGBARGBA or YUVYUVYUVYUV----
1129 * @param m128_sourcesTopRight The pixel values of the four top right pixels, starting at the first byte, may contain unused bytes at the end
1130 * @param m128_sourcesBottomLeft The pixel values of the four bottom left pixels, starting at the first byte, may contain unused bytes at the end
1131 * @param m128_sourcesBottomRight The pixel values of the four bottom right pixels, starting at the first byte, may contain unused bytes at the end
1132 * @param m128_factorsTopLeft The four interpolation factors of the four top left pixels, with ranges [0, 128 * 128], so that (m128_factorsTopLeft + m128_factorsTopRight + m128_factorsBottomLeft + m128_factorsBottomRight) == (128 * 128)
1133 * @param m128_factorsTopRight The four interpolation factors of the four top right pixels, with ranges [0, 128 * 128], so that (m128_factorsTopLeft + m128_factorsTopRight + m128_factorsBottomLeft + m128_factorsBottomRight) == (128 * 128)
1134 * @param m128_factorsBottomLeft The four interpolation factors of the four bottom left pixels, with ranges [0, 128 * 128], so that (m128_factorsTopLeft + m128_factorsTopRight + m128_factorsBottomLeft + m128_factorsBottomRight) == (128 * 128)
1135 * @param m128_factorsBottomRight The four interpolation factors of the four bottom right pixels, with ranges [0, 128 * 128], so that (m128_factorsTopLeft + m128_factorsTopRight + m128_factorsBottomLeft + m128_factorsBottomRight) == (128 * 128)
1136 * @return The resulting interpolated pixel values, starting at the first byte, may contain unused bytes at the end, e.g., RGBARGBARGBARGBA or YUVYUVYUVYUV----
1137 * @tparam tChannels The number of frame channels, with range [3, 4]
1138 */
1139 template <unsigned int tChannels>
1140 static OCEAN_FORCE_INLINE __m128i interpolate4Pixels8BitPerChannelSSE(const __m128i& m128_sourcesTopLeft, const __m128i& m128_sourcesTopRight, const __m128i& m128_sourcesBottomLeft, const __m128i& m128_sourcesBottomRight, const __m128i& m128_factorsTopLeft, const __m128i& m128_factorsTopRight, const __m128i& m128_factorsBottomLeft, const __m128i& m128_factorsBottomRight);
1141
1142#endif // OCEAN_HARDWARE_SSE_VERSION
1143
1144#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1145
1146 /**
1147 * Subset function to apply an affine transform to an N-channel, 8-bit unsigned image (using NEON).
1148 * This function applies NEON instructions.<br>
1149 * This one has the property: sourcePoint = source_A_target * targetPoint
1150 * Beware: The target width 'targetWidth' must be >= 4, use affine8BitPerChannelSubset for small target frames
1151 * Please note that here the affine transformation is specified as a 3-by-3 matrix (in contrast to the more commonly used 2-by-3 matrix) and should take the form:
1152 * <pre>
1153 * a c e
1154 * b d f
1155 * 0 0 1
1156 * </pre>
1157 * However, this function disregards the last row completely and only uses the top two rows, i.e., the elements a through f.
1158 * @param source The source frame that will be transformed
1159 * @param sourceWidth Width of the source image in pixel, with range [1, infinity)
1160 * @param sourceHeight Height of the source image in pixel, with range [1, infinity)
1161 * @param source_A_target Affine transformation mapping target points to source points, see the equation above
1162 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1163 * @param target The target frame receiving the result of the affine transformation
1164 * @param targetWidth The width of the target image in pixel, with range [4, infinity)
1165 * @param targetHeight The height of the target image in pixel, with range [1, infinity)
1166 * @param firstTargetRow The first target row to be handled
1167 * @param numberTargetRows Number of target rows to be handled
1168 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
1169 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
1170 * @tparam tChannels Number of frame channels, with range [1, infinity)
1171 * @see affine8BitPerChannelSubset(), affine8BitPerChannelSSESubset().
1172 */
1173 template <unsigned int tChannels>
1174 static inline void affine8BitPerChannelNEONSubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements);
1175
1176 /**
1177 * Transforms an 8 bit per channel frame using the given homography.
1178 * This function applies NEON instructions.<br>
1179 * Beware: The output width 'outputWidth' must be >= 4, use homography8BitPerChannelSubset for small output frames.
1180 * @param input The input frame that will be transformed
1181 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1182 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1183 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
1184 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1185 * @param output The output frame using the given homography
1186 * @param outputWidth The width of the output image in pixel, with range [4, infinity)
1187 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1188 * @param inputPaddingElements The number of padding elements at the end of each input frame row, in elements, with range [0, infinity)
1189 * @param outputPaddingElements The number of padding elements at the end of each output frame row, in elements, with range [0, infinity)
1190 * @param firstOutputRow The first output row to be handled, with range [0, outputHeight - 1]
1191 * @param numberOutputRows Number of output rows to be handled, with range [1, outputHeight - firstOutputRow]
1192 * @tparam tChannels Number of frame channels, with range [1, infinity)
1193 * @see homography8BitPerChannelSubset().
1194 */
1195 template <unsigned int tChannels>
1196 static inline void homography8BitPerChannelNEONSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1197
1198 /**
1199 * Interpolates 4 independent pixels concurrently based on already known locations (top-left, top-right, bottom-left, and bottom-right) and interpolation factors for the source pixels.
1200 * This function also supports interpolating the pixel values for a subset of the four pixels only, valid pixels will be interpolated, invalid pixels receive a defined border color.
1201 * @param source The source image in which the four independent pixels are located, must be valid
1202 * @param offsetsTopLeftElements The four offsets within the source image for the four top-left pixels used for the interpolation, in elements, with ranges [0, strideElements * height), or the corresponding validPixel information is 0x00000000
1203 * @param offsetsTopRightElements The four offsets within the source image for the four top-right pixels used for the interpolation, in elements, with ranges [0, strideElements * height), or the corresponding validPixel information is 0x00000000
1204 * @param offsetsBottomLeftElements The four offsets within the source image for the four bottom-left pixels used for the interpolation, in elements, with ranges [0, strideElements * height), or the corresponding validPixel information is 0x00000000
1205 * @param offsetsBottomRightElements The four offsets within the source image for the four bottom-right pixels used for the interpolation, in elements, with ranges [0, strideElements * height), or the corresponding validPixel information is 0x00000000
1206 * @param validPixels Four boolean states specifying which of the given four pixels will be interpolated and which ones will receive the defined border color, 0x00000000 for invalid pixels, everything else for valid pixels (e.g., 0xFFFFFFFF)
1207 * @param borderColor The border color that will be assigned to each resulting pixel that is invalid (for which the corresponding validPixel information is 0x00000000)
1208 * @param m128_factorsRight The horizontal interpolation factors for right pixels, with range [0, 128], 128 to use the color information of the right pixels only, 0 to use the color information of the left pixels only
1209 * @param m128_factorsBottom The vertical interpolation factors for bottom pixels, with range [0, 128], 128 to use the color information of the bottom pixels only, 0 to use the color information of the top pixels only
1210 * @param targetPositionPixels The buffer that will receive the interpolated color values, must be valid
1211 * @tparam tChannels The number of frame channels, with range [1, infinity)
1212 */
1213 template <unsigned int tChannels>
1214 static OCEAN_FORCE_INLINE void interpolate4Pixels8BitPerChannelNEON(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const typename DataType<uint8_t, tChannels>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, tChannels>::Type* targetPositionPixels);
1215
1216 /**
1217 * Interpolates 8 independent pixels of a 1 channel frame concurrently, the pixel values of the four neighbors of each pixel (top-left, top-right, bottom-left, and bottom-right) must be known already, further the interpolation factors must be known already.
1218 * @param topLeft_u_8x8 The 8 top left pixel values to be used for interpolation
1219 * @param topRight_u_8x8 The 8 top right pixel values to be used for interpolation
1220 * @param bottomLeft_u_8x8 The 8 bottom left pixel values to be used for interpolation
1221 * @param bottomRight_u_8x8 The 8 bottom right pixel values to be used for interpolation
1222 * @param factorsRight_factorsBottom_128_u_8x16 The eight horizontal interpolation factors for right pixels, and the eight vertical interpolation factors for the bottom pixels, with range [0, 128], 128 to use the color information of the right (respectively bottom) pixels only, 0 to use the color information of the left (respectively top) pixels only
1223 * @param targetPositionPixels The buffer that will receive the interpolated color values, must be valid
1224 */
1225 static OCEAN_FORCE_INLINE void interpolate8Pixels1Channel8BitNEON(const uint8x8_t& topLeft_u_8x8, const uint8x8_t& topRight_u_8x8, const uint8x8_t& bottomLeft_u_8x8, const uint8x8_t& bottomRight_u_8x8, const uint8x16_t& factorsRight_factorsBottom_128_u_8x16, uint8_t* targetPositionPixels);
1226
1227 /**
1228 * Interpolates 4 independent 4-channel pixels using widening byte multiplies.
1229 * Accepts pre-gathered pixel data in NEON registers, eliminating the
1230 * NEON->stack->scalar->stack->NEON roundtrip of the offset-based overload.
1231 * Uses the vmull_u8/vmlal_u8 widening pattern (~22 NEON ops vs ~56 in the u32 decomposition path).
1232 * @param topLeftPixels_u8x16 The 4 top-left pixels packed as 16 bytes [R0G0B0A0 R1G1B1A1 R2G2B2A2 R3G3B3A3]
1233 * @param topRightPixels_u8x16 The 4 top-right pixels packed identically
1234 * @param bottomLeftPixels_u8x16 The 4 bottom-left pixels packed identically
1235 * @param bottomRightPixels_u8x16 The 4 bottom-right pixels packed identically
1236 * @param m128_factorsRight The horizontal interpolation factors, one per pixel, with range [0, 128]
1237 * @param m128_factorsBottom The vertical interpolation factors, one per pixel, with range [0, 128]
1238 * @param targetPositionPixels The buffer that will receive the interpolated color values, must be valid
1239 * @param useOptimizedNEONFactorReplication True, to use the optimized TBL-based factor replication (AArch64 only); False, to use the original narrow+zip factor replication
1240 */
1241 static OCEAN_FORCE_INLINE void interpolate4Pixels4Channel8BitPerChannelNEON(const uint8x16_t& topLeftPixels_u8x16, const uint8x16_t& topRightPixels_u8x16, const uint8x16_t& bottomLeftPixels_u8x16, const uint8x16_t& bottomRightPixels_u8x16, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 4u>::Type* targetPositionPixels, const bool useOptimizedNEONFactorReplication = false);
1242
1243#endif // OCEAN_HARDWARE_NEON_VERSION
1244
1245 /**
1246 * Transforms an 8 bit per channel frame using the given homographies.
1247 * @param input The input frame that will be transformed
1248 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1249 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1250 * @param homographies Homographies used to transform the given input frame
1251 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1252 * @param output The output frame using the given homographies
1253 * @param outputQuadrantCenterX The horizontal position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputWidth)
1254 * @param outputQuadrantCenterY The vertical position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputHeight)
1255 * @param outputOriginX The horizontal coordinate of the output frame's origin
1256 * @param outputOriginY The vertical coordinate of the output frame's origin
1257 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
1258 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1259 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
1260 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
1261 * @param firstOutputRow The first output row to be handled
1262 * @param numberOutputRows Number of output rows to be handled
1263 * @tparam tChannels Number of frame channels
1264 */
1265 template <unsigned int tChannels>
1266 static inline void homographies8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* homographies, const uint8_t* borderColor, uint8_t* output, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1267
1268 /**
1269 * Transforms an 8 bit per channel frame using the given homography and creates a mask for the output frame.
1270 * @param input The input frame that will be transformed, must be valid
1271 * @param inputWidth Width of the input image in pixel, with range [1, infinity)
1272 * @param inputHeight Height of the input image in pixel, with range [1, infinity)
1273 * @param input_H_output Homography used to transform the given input frame by following equation: inputPoint = input_H_output * outputPoint, must be valid
1274 * @param output The output frame resulting by application of the given homography, must be valid
1275 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
1276 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
1277 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
1278 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1279 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
1280 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
1281 * @param outputMaskPaddingElements The number of padding elements at the end of each output mask row, in elements, with range [0, infinity)
1282 * @param firstOutputRow The first output row to be handled
1283 * @param numberOutputRows Number of output rows to be handled
1284 * @tparam tChannels Number of frame channels, with range [1, infinity)
1285 */
1286 template <unsigned int tChannels>
1287 static inline void homographyMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1288
1289 /**
1290 * Transforms an 8 bit per channel frame using the given homographies.
1291 * @param input The input frame that will be transformed
1292 * @param inputWidth The width of the input frame in pixel, with range [1, infinity)
1293 * @param inputHeight The height of the input frame in pixel, with range [1, infinity)
1294 * @param homographies Homographies used to transform the given input frame
1295 * @param output The output frame resulting by application of the given homography
1296 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
1297 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
1298 * @param outputQuadrantCenterX The horizontal position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputWidth)
1299 * @param outputQuadrantCenterY The vertical position of the four quadrants in the output frame (the local center not respecting the optional outputOrigin parameter), with range [0, outputHeight)
1300 * @param outputOriginX The horizontal coordinate of the output frame's origin
1301 * @param outputOriginY The vertical coordinate of the output frame's origin
1302 * @param outputWidth The width of the output image in pixel, with range [1, infinity)
1303 * @param outputHeight The height of the output image in pixel, with range [1, infinity)
1304 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
1305 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
1306 * @param outputMaskPaddingElements The number of padding elements at the end of each row of the output mask, in elements, with range [0, infinity)
1307 * @param firstOutputRow The first output row to be handled
1308 * @param numberOutputRows Number of output rows to be handled
1309 * @tparam tChannels Number of frame channels
1310 */
1311 template <unsigned int tChannels>
1312 static inline void homographiesMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* homographies, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows);
1313
1314 /**
1315 * Transforms an 8 bit per channel frame using the given homography.
1316 * @param inputCamera The pinhole camera profile to be applied for the input frame
1317 * @param outputCamera The pinhole camera profile to be applied for the output frame
1318 * @param outputCameraDistortionLookup The distortion lookup table of the output camera
1319 * @param input The input frame that will be transformed
1320 * @param normalizedHomography The homography used to transform the given input frame specified in normalized camera coordinates
1321 * @param useDistortionParameters True, to apply the distortion parameters of the camera profile
1322 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1323 * @param output The output frame resulting by application of the given homography
1324 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
1325 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
1326 * @param firstRow The first row to be handled
1327 * @param numberRows Number of rows to be handled
1328 * @tparam tChannels Number of frame channels
1329 */
1330 template <unsigned int tChannels>
1331 static void homographyWithCamera8BitPerChannelSubset(const PinholeCamera* inputCamera, const PinholeCamera* outputCamera, const PinholeCamera::DistortionLookup* outputCameraDistortionLookup, const uint8_t* input, const SquareMatrix3* normalizedHomography, const bool useDistortionParameters, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1332
1333 /**
1334 * Transforms an 8 bit per channel frame using the given homography.
1335 * @param inputCamera The pinhole camera profile to be applied for the input frame
1336 * @param outputCamera The pinhole camera profile to be applied for the output frame
1337 * @param outputCameraDistortionLookup The distortion lookup table of the output camera
1338 * @param input The input frame that will be transformed, must be valid
1339 * @param inputPaddingElements The number of padding elements at the end of each input row, in elements, with range [0, infinity)
1340 * @param normalizedHomography The homography used to transform the given input frame specified in normalized camera coordinates
1341 * @param output The output frame resulting by application of the given homography
1342 * @param outputMask Mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
1343 * @param outputPaddingElements The number of padding elements at the end of each output row, in elements, with range [0, infinity)
1344 * @param outputMaskPaddingElements The number of padding elements at the end of each output mask row, in elements, with range [0, infinity)
1345 * @param maskValue 8 bit mask values for reference pixels lying inside the given camera frame, reference pixels lying outside the camera frame will be assigned with (0xFF - maskValue)
1346 * @param firstRow The first row to be handled
1347 * @param numberRows Number of rows to be handled
1348 * @tparam tChannels Number of frame channels
1349 */
1350 template <unsigned int tChannels>
1351 static void homographyWithCameraMask8BitPerChannelSubset(const PinholeCamera* inputCamera, const PinholeCamera* outputCamera, const PinholeCamera::DistortionLookup* outputCameraDistortionLookup, const uint8_t* input, const unsigned int inputPaddingElements, const SquareMatrix3* normalizedHomography, uint8_t* output, uint8_t* outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const uint8_t maskValue, const unsigned int firstRow, const unsigned int numberRows);
1352
1353 /**
1354 * Transforms a subset of a given input frame with uint8_t as element type into an output frame by application of an interpolation lookup table.
1355 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
1356 * @param input The input frame which will be transformed, must be valid
1357 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
1358 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
1359 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
1360 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
1361 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1362 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table
1363 * @param inputPaddingElements Number of padding elements at the end of each input row, in elements, with range [0, infinity)
1364 * @param outputPaddingElements Number of padding elements at the end of each output row, in elements, with range [0, infinity)
1365 * @param firstRow First row to be handled, with range [0, input_LT_output->sizeY())
1366 * @param numberRows Number of rows to be handled, with range [1, input_LT_output->sizeY() - firstRow]
1367 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
1368 */
1369 template <unsigned int tChannels>
1370 static void lookup8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1371
1372 /**
1373 * Transforms a subset of a given input frame with arbitrary element type into an output frame by application of an interpolation lookup table.
1374 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
1375 * @param input The input frame which will be transformed, must be valid
1376 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
1377 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
1378 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
1379 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
1380 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign T(0) to each channel
1381 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table, must be valid
1382 * @param inputPaddingElements Number of padding elements at the end of each input row, in elements, with range [0, infinity)
1383 * @param outputPaddingElements Number of padding elements at the end of each output row, in elements, with range [0, infinity)
1384 * @param firstRow First row to be handled, with range [0, input_LT_output->sizeY())
1385 * @param numberRows Number of rows to be handled, with range [1, input_LT_output->sizeY() - firstRow]
1386 * @tparam T Data type of each pixel channel, must not be 'uint8_t'
1387 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
1388 */
1389 template <typename T, unsigned int tChannels>
1390 static void lookupSubset(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const T* borderColor, T* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1391
1392#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1393
1394 /**
1395 * Transforms a subset of a given input frame into an output frame by application of an interpolation lookup table and uses NEON instructions.
1396 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
1397 * @param input The input frame which will be transformed, must be valid
1398 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
1399 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
1400 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), with table width >= 4, must be valid
1401 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
1402 * @param borderColor Color of undefined pixel positions, the size of the buffer must match to the number of channels, nullptr to assign 0x00 to each channel
1403 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table
1404 * @param inputPaddingElements Number of padding elements at the end of each input row, in elements, with range [0, infinity)
1405 * @param outputPaddingElements Number of padding elements at the end of each output row, in elements, with range [0, infinity)
1406 * @param firstRow First row to be handled, with range [0, input_LT_output->sizeY())
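1407 * @param numberRows Number of rows to be handled, with range [1, input_LT_output->sizeY() - firstRow]
 * @param useOptimizedNEON Optional flag, false by default; not documented in this header, presumably selects an alternative NEON code path (TODO confirm against the implementation)
 * @param useOptimizedBilinearValuesAndFactorCalculation Optional flag, false by default; not documented in this header, presumably selects an alternative calculation of the bilinear values and factors (TODO confirm against the implementation)
 * @param useOptimizedNEONFactorReplication Optional flag, false by default; not documented in this header, presumably selects an alternative NEON-based replication of the interpolation factors (TODO confirm against the implementation)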
1408 * @tparam tChannels Number of channels of the frame, with range [1, infinity)
1409 */
1410 template <unsigned int tChannels>
1411 static void lookup8BitPerChannelSubsetNEON(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows, const bool useOptimizedNEON = false, const bool useOptimizedBilinearValuesAndFactorCalculation = false, const bool useOptimizedNEONFactorReplication = false);
1412
1413#endif // defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1414
1415 /**
1416 * Transforms a given input frame into an output frame by application of an interpolation lookup table.
1417 * The output frame must have the same pixel format and pixel origin as the input frame.<br>
1418 * Input frame pixels lying outside the frame will be masked in the resulting output mask frame; furthermore, these pixels are left untouched in the output frame.<br>
1419 * @param input The input frame which will be transformed
1420 * @param inputWidth The width of the given input frame in pixel, with range [1, infinity)
1421 * @param inputHeight The height of the given input frame in pixel, with range [1, infinity)
1422 * @param input_LT_output The lookup table which defines the transformation from locations defined in the output frame to locations defined in the input frame (the lookup table stores the corresponding locations in the input frame), must be valid
1423 * @param offset True, if the lookup table stores local offsets; False, if the lookup table stores absolute positions
1424 * @param output Resulting output frame with frame dimension equal to the size of the given lookup table
1425 * @param outputMask Resulting mask frame with 8 bit per pixel defining whether an output frame pixel has a valid corresponding pixel in the input frame
1426 * @param maskValue 8 bit mask values for pixels lying inside the input frame, pixels lying outside the input frame will be assigned with (0xFF - maskValue)
1427 * @param inputPaddingElements The number of padding elements at the end of each row of `input`, in elements, with range [0, infinity)
1428 * @param outputPaddingElements The number of padding elements at the end of each row of `output`, in elements, with range [0, infinity)
1429 * @param outputMaskPaddingElements The number of padding elements at the end of each row of `outputMask`, in elements, with range [0, infinity)
1430 * @param firstRow First row to be handled
1431 * @param numberRows Number of rows to be handled
1432 * @tparam tChannels Number of channels of the frame
1433 */
1434 template <unsigned int tChannels>
1435 static void lookupMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
1436};
1437
// Comfort overload resizing 'frame' in place.
// A temporary target frame is created from FrameType(frame, width, height), filled via resize(frame, target, worker),
// given the timestamp of 'frame', and finally moved into 'frame'.
// Returns false if the underlying resize fails (in which case 'frame' is not overwritten by this function), true otherwise.
1438inline bool FrameInterpolatorBilinear::Comfort::resize(Frame& frame, const unsigned int width, const unsigned int height, Worker* worker)
1439{
1440 ocean_assert(frame.isValid());
1441 ocean_assert(width >= 1u && height >= 1u);
1442
1443 Frame target(FrameType(frame, width, height));
1444
1445 if (!resize(frame, target, worker))
1446 {
1447 return false;
1448 }
1449
1450 target.setTimestamp(frame.timestamp());
// NOTE(review): the embedded listing numbering jumps from 1450 to 1452 here, so one source line (1451) appears to be missing from this extract - confirm against the upstream header before relying on this body.
1452
// the resized frame replaces the caller's frame only after a successful resize
1453 frame = std::move(target);
1454 return true;
1455}
1456
1457template <typename TScalar>
1458bool FrameInterpolatorBilinear::Comfort::interpolatePixel8BitPerChannel(const uint8_t* frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2<TScalar>& position, uint8_t* result)
1459{
1460 ocean_assert(frame != nullptr);
1461 ocean_assert(channels >= 1u && channels <= 8u);
1462
1463 if (pixelCenter == PC_TOP_LEFT)
1464 {
1465 switch (channels)
1466 {
1467 case 1u:
1468 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<1u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1469 return true;
1470
1471 case 2u:
1472 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<2u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1473 return true;
1474
1475 case 3u:
1476 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<3u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1477 return true;
1478
1479 case 4u:
1480 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<4u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1481 return true;
1482
1483 case 5u:
1484 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<5u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1485 return true;
1486
1487 case 6u:
1488 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<6u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1489 return true;
1490
1491 case 7u:
1492 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<7u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1493 return true;
1494
1495 case 8u:
1496 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<8u, PC_TOP_LEFT, TScalar>(frame, width, height, framePaddingElements, position, result);
1497 return true;
1498
1499 default:
1500 break;
1501 }
1502 }
1503 else
1504 {
1505 ocean_assert(pixelCenter == PC_CENTER);
1506
1507 switch (channels)
1508 {
1509 case 1u:
1510 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<1u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1511 return true;
1512
1513 case 2u:
1514 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<2u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1515 return true;
1516
1517 case 3u:
1518 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<3u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1519 return true;
1520
1521 case 4u:
1522 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<4u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1523 return true;
1524
1525 case 5u:
1526 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<5u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1527 return true;
1528
1529 case 6u:
1530 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<6u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1531 return true;
1532
1533 case 7u:
1534 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<7u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1535 return true;
1536
1537 case 8u:
1538 FrameInterpolatorBilinear::interpolatePixel8BitPerChannel<8u, PC_CENTER, TScalar>(frame, width, height, framePaddingElements, position, result);
1539 return true;
1540
1541 default:
1542 break;
1543 }
1544 }
1545
1546 ocean_assert(false && "Invalid channel number");
1547 return false;
1548}
1549
1550template <typename TSource, typename TTarget, typename TScalar, typename TIntermediate>
1551bool FrameInterpolatorBilinear::Comfort::interpolatePixel(const TSource* frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2<TScalar>& position, TTarget* result, const TIntermediate& resultBias)
1552{
1553 ocean_assert(frame != nullptr);
1554 ocean_assert(channels >= 1u && channels <= 8u);
1555
1556 if (pixelCenter == PC_TOP_LEFT)
1557 {
1558 switch (channels)
1559 {
1560 case 1u:
1561 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 1u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1562 return true;
1563
1564 case 2u:
1565 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 2u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1566 return true;
1567
1568 case 3u:
1569 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 3u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1570 return true;
1571
1572 case 4u:
1573 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 4u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1574 return true;
1575
1576 case 5u:
1577 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 5u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1578 return true;
1579
1580 case 6u:
1581 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 6u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1582 return true;
1583
1584 case 7u:
1585 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 7u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1586 return true;
1587
1588 case 8u:
1589 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 8u, PC_TOP_LEFT, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1590 return true;
1591
1592 default:
1593 break;
1594 }
1595 }
1596 else
1597 {
1598 ocean_assert(pixelCenter == PC_CENTER);
1599
1600 switch (channels)
1601 {
1602 case 1u:
1603 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 1u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1604 return true;
1605
1606 case 2u:
1607 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 2u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1608 return true;
1609
1610 case 3u:
1611 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 3u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1612 return true;
1613
1614 case 4u:
1615 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 4u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1616 return true;
1617
1618 case 5u:
1619 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 5u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1620 return true;
1621
1622 case 6u:
1623 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 6u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1624 return true;
1625
1626 case 7u:
1627 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 7u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1628 return true;
1629
1630 case 8u:
1631 FrameInterpolatorBilinear::interpolatePixel<TSource, TTarget, 8u, PC_CENTER, TScalar, TIntermediate>(frame, width, height, framePaddingElements, position, result, resultBias);
1632 return true;
1633
1634 default:
1635 break;
1636 }
1637 }
1638
1639 ocean_assert(false && "Invalid channel number");
1640 return false;
1641}
1642
1643template <typename T, unsigned int tChannels>
1644inline void FrameInterpolatorBilinear::resize(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
1645{
1646 ocean_assert(source != nullptr && target != nullptr);
1647 ocean_assert(sourceWidth >= 1u && sourceHeight >= 1u);
1648 ocean_assert(targetWidth >= 1u && targetHeight >= 1u);
1649
1650 const double sourceX_s_targetX = double(sourceWidth) / double(targetWidth);
1651 const double sourceY_s_targetY = double(sourceHeight) / double(targetHeight);
1652
1653 scale<T, tChannels>(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, worker);
1654}
1655
1656template <typename T, unsigned int tChannels>
1657inline void FrameInterpolatorBilinear::scale(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
1658{
1659 ocean_assert(source != nullptr && target != nullptr);
1660 ocean_assert(sourceWidth >= 1u && sourceHeight >= 1u);
1661 ocean_assert(targetWidth >= 1u && targetHeight >= 1u);
1662 ocean_assert(sourceX_s_targetX > 0.0);
1663 ocean_assert(sourceY_s_targetY > 0.0);
1664
1665 if (sourceWidth == targetWidth && sourceHeight == targetHeight)
1666 {
1667 FrameConverter::subFrame<T>(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, tChannels, 0u, 0u, 0u, 0u, sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements);
1668 return;
1669 }
1670
1671 if (std::is_same<T, uint8_t>::value)
1672 {
1673 // we have a SIMD-based optimized version for 'uint8_t' data types
1674
1675 scale8BitPerChannel<tChannels>((const uint8_t*)source, (uint8_t*)target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, worker);
1676 }
1677 else
1678 {
1679 using TScale = typename FloatTyper<T>::Type;
1680
1681 if (worker)
1682 {
1683 worker->executeFunction(Worker::Function::createStatic(&scaleSubset<T, TScale, tChannels>, source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, targetHeight);
1684 }
1685 else
1686 {
1687 scaleSubset<T, TScale, tChannels>(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, targetHeight);
1688 }
1689 }
1690}
1691
1692template <unsigned int tChannels>
1693inline void FrameInterpolatorBilinear::affine8BitPerChannel(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3& source_A_target, const uint8_t* borderColor, uint8_t* target, const CV::PixelPositionI& targetOrigin, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
1694{
1695 // If applicable, apply an additional translation to the affine transformation.
1696 const SquareMatrix3 adjustedAffineTransform = source_A_target * SquareMatrix3(Vector3(1, 0, 0), Vector3(0, 1, 0), Vector3(Scalar(targetOrigin.x()), Scalar(targetOrigin.y()), 1));
1697
1698 if (worker)
1699 {
1700 if (targetWidth >= 4u)
1701 {
1702#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1703 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::affine8BitPerChannelSSESubset<tChannels>, source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, 0u, sourcePaddingElements, targetPaddingElements), 0, targetHeight, 8u, 9u, 20u);
1704 return;
1705#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1706 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::affine8BitPerChannelNEONSubset<tChannels>, source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, 0u, sourcePaddingElements, targetPaddingElements), 0, targetHeight, 8u, 9u, 20u);
1707 return;
1708#endif
1709 }
1710
1711 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::affine8BitPerChannelSubset<tChannels>, source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, 0u, sourcePaddingElements, targetPaddingElements), 0, targetHeight, 8u, 9u, 20u);
1712 }
1713 else
1714 {
1715 if (targetWidth >= 4u)
1716 {
1717#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1718 affine8BitPerChannelSSESubset<tChannels>(source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, targetHeight, sourcePaddingElements, targetPaddingElements);
1719 return;
1720#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1721 affine8BitPerChannelNEONSubset<tChannels>(source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, targetHeight, sourcePaddingElements, targetPaddingElements);
1722 return;
1723#endif
1724 }
1725
1726 affine8BitPerChannelSubset<tChannels>(source, sourceWidth, sourceHeight, &adjustedAffineTransform, borderColor, target, targetWidth, targetHeight, 0u, targetHeight, sourcePaddingElements, targetPaddingElements);
1727 }
1728}
1729
1730template <unsigned int tChannels>
1731inline void FrameInterpolatorBilinear::homography8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, const uint8_t* borderColor, uint8_t* output, const CV::PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker)
1732{
1733 // we adjust the homography to address 'outputOrigin'
1734 const SquareMatrix3 input_H_shiftedOutput = input_H_output * SquareMatrix3(Vector3(1, 0, 0), Vector3(0, 1, 0), Vector3(Scalar(outputOrigin.x()), Scalar(outputOrigin.y()), 1));
1735
1736 if (worker)
1737 {
1738 if (outputWidth >= 4u)
1739 {
1740#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1741 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homography8BitPerChannelSSESubset<tChannels>, input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputHeight, 10u, 11u, 20u);
1742 return;
1743#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1744 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homography8BitPerChannelNEONSubset<tChannels>, input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputHeight, 10u, 11u, 20u);
1745 return;
1746#endif
1747 }
1748
1749 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homography8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputHeight, 10u, 11u, 20u);
1750 }
1751 else
1752 {
1753 if (outputWidth >= 4u)
1754 {
1755#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
1756 homography8BitPerChannelSSESubset<tChannels>(input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, outputHeight);
1757 return;
1758#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1759 homography8BitPerChannelNEONSubset<tChannels>(input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, outputHeight);
1760 return;
1761#endif
1762 }
1763
1764 homography8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, outputHeight);
1765 }
1766}
1767
1768template <typename T, unsigned int tChannels>
1769inline void FrameInterpolatorBilinear::homography(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, const T* borderColor, T* output, const CV::PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker)
1770{
1771 if (std::is_same<T, uint8_t>::value)
1772 {
1773 homography8BitPerChannel<tChannels>((const uint8_t*)input, inputWidth, inputHeight, input_H_output, (const uint8_t*)borderColor, (uint8_t*)output, outputOrigin, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, worker);
1774 return;
1775 }
1776 else
1777 {
1778 // we adjust the homography to address 'outputOrigin'
1779 const SquareMatrix3 input_H_shiftedOutput = input_H_output * SquareMatrix3(Vector3(1, 0, 0), Vector3(0, 1, 0), Vector3(Scalar(outputOrigin.x()), Scalar(outputOrigin.y()), 1));
1780
1781 if (worker)
1782 {
1783 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographySubset<T, tChannels>, input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputHeight, 10u, 11u, 20u);
1784 }
1785 else
1786 {
1787 homographySubset<T, tChannels>(input, inputWidth, inputHeight, &input_H_shiftedOutput, borderColor, output, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, outputHeight);
1788 }
1789 }
1790}
1791
1792template <unsigned int tChannels>
1793inline void FrameInterpolatorBilinear::homographies8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], const uint8_t* borderColor, uint8_t* output, const Vector2& outputQuadrantCenter, const PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker)
1794{
1795 if (worker)
1796 {
1797 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographies8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, homographies, borderColor, output, outputQuadrantCenter.x(), outputQuadrantCenter.y(), outputOrigin.x(), outputOrigin.y(), outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputHeight, 14u, 15u, 20u);
1798 }
1799 else
1800 {
1801 homographies8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, homographies, borderColor, output, outputQuadrantCenter.x(), outputQuadrantCenter.y(), outputOrigin.x(), outputOrigin.y(), outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, 0u, outputHeight);
1802 }
1803}
1804
1805template <unsigned int tChannels>
1806inline void FrameInterpolatorBilinear::homographyMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3& input_H_output, uint8_t* output, uint8_t* outputMask, const CV::PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const uint8_t maskValue, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker)
1807{
1808 // we adjust the homography to address 'outputOrigin'
1809 const SquareMatrix3 input_H_shiftedOutput = input_H_output * SquareMatrix3(Vector3(1, 0, 0), Vector3(0, 1, 0), Vector3(Scalar(outputOrigin.x()), Scalar(outputOrigin.y()), 1));
1810
1811 if (worker)
1812 {
1813 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographyMask8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, &input_H_shiftedOutput, output, outputMask, maskValue, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, 0u), 0, outputHeight, 12u, 13u, 20u);
1814 }
1815 else
1816 {
1817 homographyMask8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, &input_H_shiftedOutput, output, outputMask, maskValue, outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, outputHeight);
1818 }
1819}
1820
1821template <unsigned int tChannels>
1822inline void FrameInterpolatorBilinear::homographiesMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], uint8_t* output, uint8_t* outputMask, const Vector2& outputQuadrantCenter, const CV::PixelPositionI& outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker, const uint8_t maskValue)
1823{
1824 if (worker)
1825 {
1826 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographiesMask8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, homographies, output, outputMask, maskValue, outputQuadrantCenter.x(), outputQuadrantCenter.y(), outputOrigin.x(), outputOrigin.y(), outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, 0u), 0, outputHeight);
1827 }
1828 else
1829 {
1830 homographiesMask8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, homographies, output, outputMask, maskValue, outputQuadrantCenter.x(), outputQuadrantCenter.y(), outputOrigin.x(), outputOrigin.y(), outputWidth, outputHeight, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, outputHeight);
1831 }
1832}
1833
1834template <unsigned int tChannels>
1835inline void FrameInterpolatorBilinear::homographyWithCamera8BitPerChannel(const PinholeCamera& inputCamera, const PinholeCamera& outputCamera, const uint8_t* input, const SquareMatrix3& homography, const bool useDistortionParameters, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker)
1836{
1837 const SquareMatrix3 normalizedHomography(inputCamera.invertedIntrinsic() * homography * outputCamera.intrinsic());
1838
1839 const PinholeCamera::DistortionLookup outputCameraDistortionLookup(outputCamera, 10u);
1840
1841 if (worker)
1842 {
1843 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographyWithCamera8BitPerChannelSubset<tChannels>, &inputCamera, &outputCamera, &outputCameraDistortionLookup, input, &normalizedHomography, useDistortionParameters, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, 0u), 0, outputCamera.height());
1844 }
1845 else
1846 {
1847 homographyWithCamera8BitPerChannelSubset<tChannels>(&inputCamera, &outputCamera, &outputCameraDistortionLookup, input, &normalizedHomography, useDistortionParameters, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, outputCamera.height());
1848 }
1849}
1850
1851template <unsigned int tChannels>
1852inline void FrameInterpolatorBilinear::homographyWithCameraMask8BitPerChannel(const PinholeCamera& inputCamera, const PinholeCamera& outputCamera, const uint8_t* input, const unsigned int inputPaddingElements, const SquareMatrix3& homography, uint8_t* output, uint8_t* outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker, const uint8_t maskValue)
1853{
1854 const SquareMatrix3 normalizedHomography(inputCamera.invertedIntrinsic() * homography * outputCamera.intrinsic());
1855
1856 const PinholeCamera::DistortionLookup outputCameraDistortionLookup(outputCamera, 10u);
1857
1858 if (worker)
1859 {
1860 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::homographyWithCameraMask8BitPerChannelSubset<tChannels>, &inputCamera, &outputCamera, &outputCameraDistortionLookup, input, inputPaddingElements, &normalizedHomography, output, outputMask, outputPaddingElements, outputMaskPaddingElements, maskValue, 0u, 0u), 0, outputCamera.height(), 11u, 12u, 10u);
1861 }
1862 else
1863 {
1864 homographyWithCameraMask8BitPerChannelSubset<tChannels>(&inputCamera, &outputCamera, &outputCameraDistortionLookup, input, inputPaddingElements, &normalizedHomography, output, outputMask, outputPaddingElements, outputMaskPaddingElements, maskValue, 0u, outputCamera.height());
1865 }
1866}
1867
1868template <typename T, unsigned int tChannels>
1869inline void FrameInterpolatorBilinear::lookup(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable& input_LT_output, const bool offset, const T* borderColor, T* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker* worker, const bool useOptimizedNEON, const bool useOptimizedBilinearValuesAndFactorCalculation, [[maybe_unused]] const bool useOptimizedNEONFactorReplication)
1870{
1871 if constexpr (std::is_same<T, uint8_t>::value)
1872 {
1873#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1874 if ((tChannels >= 1u && input_LT_output.sizeX() >= 8) || (tChannels >= 2u && input_LT_output.sizeX() >= 4))
1875 {
1876 // NEON implementation for 1 channel: min width 8; for 2+ channels: min width 4
1877
1878 if (worker)
1879 {
1880 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::lookup8BitPerChannelSubsetNEON<tChannels>, input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, 0u, useOptimizedNEON, useOptimizedBilinearValuesAndFactorCalculation, useOptimizedNEONFactorReplication), 0u, (unsigned int)(input_LT_output.sizeY()), 9u, 10u, 20u);
1881 }
1882 else
1883 {
1884 lookup8BitPerChannelSubsetNEON<tChannels>(input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, (unsigned int)(input_LT_output.sizeY()), useOptimizedNEON, useOptimizedBilinearValuesAndFactorCalculation, useOptimizedNEONFactorReplication);
1885 }
1886
1887 return;
1888 }
1889#endif // defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1890
1891 if (worker)
1892 {
1893 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::lookup8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, 0u), 0u, (unsigned int)input_LT_output.sizeY(), 9u, 10u, 20u);
1894 }
1895 else
1896 {
1897 lookup8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, (unsigned int)(input_LT_output.sizeY()));
1898 }
1899 }
1900 else
1901 {
1902 ocean_assert((!std::is_same<T, uint8_t>::value));
1903
1904 if (worker)
1905 {
1906 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::lookupSubset<T, tChannels>, input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, 0u), 0u, (unsigned int)(input_LT_output.sizeY()), 9u, 10u, 20u);
1907 }
1908 else
1909 {
1910 lookupSubset<T, tChannels>(input, inputWidth, inputHeight, &input_LT_output, offset, borderColor, output, inputPaddingElements, outputPaddingElements, 0u, (unsigned int)(input_LT_output.sizeY()));
1911 }
1912 }
1913}
1914
1915template <unsigned int tChannels>
1916inline void FrameInterpolatorBilinear::lookupMask8BitPerChannel(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable& input_LT_output, const bool offset, uint8_t* output, uint8_t* outputMask, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker* worker, const uint8_t maskValue)
1917{
1918 if (worker)
1919 {
1920 worker->executeFunction(Worker::Function::createStatic(&FrameInterpolatorBilinear::lookupMask8BitPerChannelSubset<tChannels>, input, inputWidth, inputHeight, &input_LT_output, offset, output, outputMask, maskValue, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, 0u), 0u, (unsigned int)(input_LT_output.sizeY()), 11u, 12u, 20u);
1921 }
1922 else
1923 {
1924 lookupMask8BitPerChannelSubset<tChannels>(input, inputWidth, inputHeight, &input_LT_output, offset, output, outputMask, maskValue, inputPaddingElements, outputPaddingElements, outputMaskPaddingElements, 0u, (unsigned int)(input_LT_output.sizeY()));
1925 }
1926}
1927
1928template <typename T, unsigned int tChannels>
1929void FrameInterpolatorBilinear::resampleCameraImage(const T* sourceFrame, const AnyCamera& sourceCamera, const SquareMatrix3& source_R_target, const AnyCamera& targetCamera, T* targetFrame, const unsigned int sourceFramePaddingElements, const unsigned int targetFramePaddingElements, LookupCorner2<Vector2>* source_OLT_target, Worker* worker, const unsigned int binSizeInPixel, const T* borderColor)
1930{
1931 static_assert(tChannels >= 1u, "Invalid channel number!");
1932
1933 ocean_assert(sourceFrame != nullptr);
1934 ocean_assert(sourceCamera.isValid());
1935 ocean_assert(source_R_target.isOrthonormal());
1936 ocean_assert(targetCamera.isValid());
1937 ocean_assert(targetFrame != nullptr);
1938 ocean_assert(binSizeInPixel >= 1u);
1939
1940 const size_t binsX = std::max(1u, targetCamera.width() / binSizeInPixel);
1941 const size_t binsY = std::max(1u, targetCamera.height() / binSizeInPixel);
1942 CV::FrameInterpolatorBilinear::LookupTable lookupTable(targetCamera.width(), targetCamera.height(), binsX, binsY);
1943
1944 for (size_t yBin = 0; yBin <= lookupTable.binsY(); ++yBin)
1945 {
1946 for (size_t xBin = 0; xBin <= lookupTable.binsX(); ++xBin)
1947 {
1948 const Vector2 cornerPosition = lookupTable.binTopLeftCornerPosition(xBin, yBin);
1949
1950 constexpr bool makeUnitVector = false; // we don't need a unit/normalized vector as we project the vector into the camera again
1951
1952 const Vector3 rayI = source_R_target * targetCamera.vector(cornerPosition, makeUnitVector);
1953 const Vector3 rayIF = Vector3(rayI.x(), -rayI.y(), -rayI.z());
1954
1955 if (rayIF.z() > Numeric::eps())
1956 {
1957 const Vector2 projectedPoint = sourceCamera.projectToImageIF(rayIF);
1958
1959 lookupTable.setBinTopLeftCornerValue(xBin, yBin, projectedPoint - cornerPosition);
1960 }
1961 else
1962 {
1963 // simply a coordinate far outside the input
1964 lookupTable.setBinTopLeftCornerValue(xBin, yBin, Vector2(Scalar(sourceCamera.width() * 10u), Scalar(sourceCamera.height() * 10u)));
1965 }
1966 }
1967 }
1968
1969 lookup<T, tChannels>(sourceFrame, sourceCamera.width(), sourceCamera.height(), lookupTable, true /*offset*/, borderColor, targetFrame, sourceFramePaddingElements, targetFramePaddingElements, worker);
1970
1971 if (source_OLT_target)
1972 {
1973 *source_OLT_target = std::move(lookupTable);
1974 }
1975}
1976
1977template <unsigned int tChannels>
1978void FrameInterpolatorBilinear::rotate8BitPerChannel(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker, const uint8_t* borderColor)
1979{
1980 static_assert(tChannels != 0u, "Invalid channel number!");
1981
1982 ocean_assert(source != nullptr && target != nullptr);
1983 ocean_assert(width >= 1u && height >= 1u);
1984
1985 if (worker)
1986 {
1987 worker->executeFunction(Worker::Function::createStatic(&rotate8BitPerChannelSubset<tChannels>, source, target, width, height, horizontalAnchorPosition, verticalAnchorPosition, angle, borderColor, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, height);
1988 }
1989 else
1990 {
1991 rotate8BitPerChannelSubset<tChannels>(source, target, width, height, horizontalAnchorPosition, verticalAnchorPosition, angle, borderColor, sourcePaddingElements, targetPaddingElements, 0u, height);
1992 }
1993}
1994
1995template <unsigned int tChannels, PixelCenter tPixelCenter, typename TScalar>
1996inline void FrameInterpolatorBilinear::interpolatePixel8BitPerChannel(const uint8_t* frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2<TScalar>& position, uint8_t* result)
1997{
1998 static_assert(tChannels != 0u, "Invalid channel number!");
1999 static_assert(tPixelCenter == PC_TOP_LEFT || tPixelCenter == PC_CENTER, "Invalid pixel center!");
2000
2001 ocean_assert(frame != nullptr && result != nullptr);
2002 ocean_assert(width != 0u && height != 0u);
2003
2004 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
2005
2006 ocean_assert(position.x() >= TScalar(0));
2007 ocean_assert(position.y() >= TScalar(0));
2008
2009 if constexpr (tPixelCenter == PC_TOP_LEFT)
2010 {
2011 ocean_assert(position.x() <= TScalar(width - 1u));
2012 ocean_assert(position.y() <= TScalar(height - 1u));
2013
2014 const unsigned int left = (unsigned int)(position.x());
2015 const unsigned int top = (unsigned int)(position.y());
2016 ocean_assert(left < width && top < height);
2017
2018 const TScalar tx = position.x() - TScalar(left);
2019 ocean_assert(tx >= 0 && tx <= 1);
2020 const unsigned int txi = (unsigned int)(tx * TScalar(128) + TScalar(0.5));
2021 const unsigned int txi_ = 128u - txi;
2022
2023 const TScalar ty = position.y() - TScalar(top);
2024 ocean_assert(ty >= 0 && ty <= 1);
2025 const unsigned int tyi = (unsigned int)(ty * TScalar(128) + TScalar(0.5));
2026 const unsigned int tyi_ = 128u - tyi;
2027
2028 const unsigned int rightOffset = left + 1u < width ? tChannels : 0u;
2029 const unsigned int bottomOffset = top + 1u < height ? frameStrideElements : 0u;
2030
2031 const uint8_t* const topLeft = frame + top * frameStrideElements + tChannels * left;
2032
2033 const unsigned int txty = txi * tyi;
2034 const unsigned int txty_ = txi * tyi_;
2035 const unsigned int tx_ty = txi_ * tyi;
2036 const unsigned int tx_ty_ = txi_ * tyi_;
2037
2038 for (unsigned int n = 0u; n < tChannels; ++n)
2039 {
2040 result[n] = uint8_t((topLeft[n] * tx_ty_ + topLeft[rightOffset + n] * txty_ + topLeft[bottomOffset + n] * tx_ty + topLeft[bottomOffset + rightOffset + n] * txty + 8192u) >> 14u);
2041 }
2042 }
2043 else
2044 {
2045 ocean_assert(tPixelCenter == PC_CENTER);
2046
2047 ocean_assert(position.x() <= TScalar(width));
2048 ocean_assert(position.y() <= TScalar(height));
2049
2050 const TScalar xShifted = std::max(TScalar(0.0), position.x() - TScalar(0.5));
2051 const TScalar yShifted = std::max(TScalar(0.0), position.y() - TScalar(0.5));
2052
2053 const unsigned int left = (unsigned int)(xShifted);
2054 const unsigned int top = (unsigned int)(yShifted);
2055
2056 ocean_assert(left < width);
2057 ocean_assert(top < height);
2058
2059 const TScalar tx = xShifted - TScalar(left);
2060 const TScalar ty = yShifted - TScalar(top);
2061
2062 ocean_assert(tx >= 0 && tx <= 1);
2063 ocean_assert(ty >= 0 && ty <= 1);
2064
2065 const unsigned int txi = (unsigned int)(tx * TScalar(128) + TScalar(0.5));
2066 const unsigned int txi_ = 128u - txi;
2067
2068 const unsigned int tyi = (unsigned int)(ty * TScalar(128) + TScalar(0.5));
2069 const unsigned int tyi_ = 128u - tyi;
2070
2071 const unsigned int rightOffset = left + 1u < width ? tChannels : 0u;
2072 const unsigned int bottomOffset = top + 1u < height ? frameStrideElements : 0u;
2073
2074 const uint8_t* const topLeft = frame + top * frameStrideElements + left * tChannels;
2075
2076 const unsigned int txty = txi * tyi;
2077 const unsigned int txty_ = txi * tyi_;
2078 const unsigned int tx_ty = txi_ * tyi;
2079 const unsigned int tx_ty_ = txi_ * tyi_;
2080
2081 for (unsigned int n = 0u; n < tChannels; ++n)
2082 {
2083 result[n] = uint8_t((topLeft[n] * tx_ty_ + topLeft[rightOffset + n] * txty_ + topLeft[bottomOffset + n] * tx_ty + topLeft[bottomOffset + rightOffset + n] * txty + 8192u) >> 14u);
2084 }
2085 }
2086}
2087
2088template <typename TSource, typename TTarget, unsigned int tChannels, PixelCenter tPixelCenter, typename TScalar, typename TIntermediate>
2089inline void FrameInterpolatorBilinear::interpolatePixel(const TSource* frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2<TScalar>& position, TTarget* result, const TIntermediate& resultBias)
2090{
2091 static_assert(tChannels != 0u, "Invalid channel number!");
2092 static_assert(tPixelCenter == PC_TOP_LEFT || tPixelCenter == PC_CENTER, "Invalid pixel center!");
2093
2094 ocean_assert(frame != nullptr && result != nullptr);
2095 ocean_assert(width != 0u && height != 0u);
2096
2097 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
2098
2099 ocean_assert(position.x() >= TScalar(0));
2100 ocean_assert(position.y() >= TScalar(0));
2101
2102 if constexpr (tPixelCenter == PC_TOP_LEFT)
2103 {
2104 ocean_assert(position.x() <= TScalar(width - 1u));
2105 ocean_assert(position.y() <= TScalar(height - 1u));
2106
2107 const unsigned int left = (unsigned int)(position.x());
2108 const unsigned int top = (unsigned int)(position.y());
2109
2110 const TScalar tx = position.x() - TScalar(left);
2111 ocean_assert(tx >= 0 && tx <= 1);
2112
2113 const TScalar ty = position.y() - TScalar(top);
2114 ocean_assert(ty >= 0 && ty <= 1);
2115
2116 const unsigned int rightOffset = left + 1u < width ? tChannels : 0u;
2117 const unsigned int bottomOffset = top + 1u < height ? frameStrideElements : 0u;
2118
2119 const TSource* const topLeft = frame + top * frameStrideElements + tChannels * left;
2120
2121 const TIntermediate txty = TIntermediate(tx) * TIntermediate(ty);
2122 const TIntermediate txty_ = TIntermediate(tx) * (TIntermediate(1) - TIntermediate(ty));
2123 const TIntermediate tx_ty = (TIntermediate(1) - TIntermediate(tx)) * TIntermediate(ty);
2124 const TIntermediate tx_ty_ = (TIntermediate(1) - TIntermediate(tx)) * (TIntermediate(1) - TIntermediate(ty));
2125
2126 ocean_assert_accuracy(NumericT<TIntermediate>::isEqual(txty + txty_ + tx_ty + tx_ty_, TIntermediate(1)));
2127
2128 for (unsigned int n = 0u; n < tChannels; ++n)
2129 {
2130 result[n] = TTarget(TIntermediate(topLeft[n]) * tx_ty_ + TIntermediate(topLeft[rightOffset + n]) * txty_ + TIntermediate(topLeft[bottomOffset + n]) * tx_ty + TIntermediate(topLeft[bottomOffset + rightOffset + n]) * txty + resultBias);
2131 }
2132 }
2133 else
2134 {
2135 ocean_assert(tPixelCenter == PC_CENTER);
2136
2137 ocean_assert(position.x() <= TScalar(width));
2138 ocean_assert(position.y() <= TScalar(height));
2139
2140 const TScalar xShifted = std::max(TScalar(0.0), position.x() - TScalar(0.5));
2141 const TScalar yShifted = std::max(TScalar(0.0), position.y() - TScalar(0.5));
2142
2143 const unsigned int left = (unsigned int)(xShifted);
2144 const unsigned int top = (unsigned int)(yShifted);
2145
2146 ocean_assert(left < width);
2147 ocean_assert(top < height);
2148
2149 const TScalar tx = xShifted - TScalar(left);
2150 const TScalar ty = yShifted - TScalar(top);
2151
2152 ocean_assert(tx >= 0 && tx <= 1);
2153 ocean_assert(ty >= 0 && ty <= 1);
2154
2155 const unsigned int rightOffset = left + 1u < width ? tChannels : 0u;
2156 const unsigned int bottomOffset = top + 1u < height ? frameStrideElements : 0u;
2157
2158 const TSource* const topLeft = frame + top * frameStrideElements + tChannels * left;
2159
2160 const TIntermediate txty = TIntermediate(tx) * TIntermediate(ty);
2161 const TIntermediate txty_ = TIntermediate(tx) * (TIntermediate(1) - TIntermediate(ty));
2162 const TIntermediate tx_ty = (TIntermediate(1) - TIntermediate(tx)) * TIntermediate(ty);
2163 const TIntermediate tx_ty_ = (TIntermediate(1) - TIntermediate(tx)) * (TIntermediate(1) - TIntermediate(ty));
2164
2165 ocean_assert_accuracy(NumericT<TIntermediate>::isEqual(txty + txty_ + tx_ty + tx_ty_, TIntermediate(1)));
2166
2167 for (unsigned int n = 0u; n < tChannels; ++n)
2168 {
2169 result[n] = TTarget(TIntermediate(topLeft[n]) * tx_ty_ + TIntermediate(topLeft[rightOffset + n]) * txty_ + TIntermediate(topLeft[bottomOffset + n]) * tx_ty + TIntermediate(topLeft[bottomOffset + rightOffset + n]) * txty + resultBias);
2170 }
2171 }
2172}
2173
2174template <unsigned int tChannels, bool tAlphaAtFront, bool tTransparentIs0xFF>
2175inline void FrameInterpolatorBilinear::interpolate1PixelFullAlphaBorder8BitPerChannel(const uint8_t* frame, const unsigned int width, const unsigned int height, const Vector2& position, uint8_t* result, const unsigned int framePaddingElements)
2176{
2177 static_assert(tChannels != 0u, "Invalid channel number!");
2178
2179 ocean_assert(frame && result);
2180
2181 const Vector2 pos(position.x() - Scalar(0.5), position.y() - Scalar(0.5));
2182
2183 // check whether the position is outside the frame and will therefore be 100% transparent
2184 if (pos.x() <= Scalar(-1) || pos.y() <= Scalar(-1) || pos.x() >= Scalar(width) || pos.y() >= Scalar(height))
2185 {
2186 for (unsigned int n = 0u; n < tChannels - 1u; ++n)
2187 {
2189 }
2190
2191 result[FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] = FrameBlender::fullTransparent8Bit<tTransparentIs0xFF>();
2192
2193 return;
2194 }
2195
2196 const unsigned int frameStrideElements = width * tChannels + framePaddingElements;
2197
2198 const int left = int(Numeric::floor(pos.x()));
2199 const int top = int(Numeric::floor(pos.y()));
2200
2201 ocean_assert(left >= -1 && left < int(width));
2202 ocean_assert(top >= -1 && top < int(height));
2203
2204 if ((unsigned int)left < width - 1u && (unsigned int)top < height - 1u)
2205 {
2206 // we have a valid pixel position for the left, top, right and bottom pixel
2207
2208 const unsigned int txi = (unsigned int)((pos.x() - Scalar(left)) * Scalar(128) + Scalar(0.5));
2209 const unsigned int txi_ = 128u - txi;
2210
2211 const unsigned int tyi = (unsigned int)((pos.y() - Scalar(top)) * Scalar(128) + Scalar(0.5));
2212 const unsigned int tyi_ = 128u - tyi;
2213
2214 const uint8_t* const topLeft = frame + top * frameStrideElements + left * tChannels;
2215
2216 const unsigned int txty = txi * tyi;
2217 const unsigned int txty_ = txi * tyi_;
2218 const unsigned int tx_ty = txi_ * tyi;
2219 const unsigned int tx_ty_ = txi_ * tyi_;
2220
2221 for (unsigned int n = 0u; n < tChannels; ++n)
2222 {
2223 result[n] = (topLeft[n] * tx_ty_ + topLeft[tChannels + n] * txty_
2224 + topLeft[frameStrideElements + n] * tx_ty + topLeft[frameStrideElements + tChannels + n] * txty + 8192u) >> 14u;
2225 }
2226 }
2227 else
2228 {
2229 // we do not have a valid pixel for all 4-neighborhood pixels
2230
2231 const unsigned int txi = (unsigned int)((pos.x() - Scalar(left)) * Scalar(128) + Scalar(0.5));
2232 const unsigned int txi_ = 128u - txi;
2233
2234 const unsigned int tyi = (unsigned int)((pos.y() - Scalar(top)) * Scalar(128) + Scalar(0.5));
2235 const unsigned int tyi_ = 128u - tyi;
2236
2237 const unsigned int rightOffset = (left >= 0 && left + 1u < width) ? tChannels : 0u;
2238 const unsigned int bottomOffset = (top >= 0 && top + 1u < height) ? frameStrideElements : 0u;
2239
2240 ocean_assert(left < int(width) && top < int(height));
2241 const uint8_t* const topLeft = frame + max(0, top) * frameStrideElements + max(0, left) * tChannels;
2242
2243 const unsigned int txty = txi * tyi;
2244 const unsigned int txty_ = txi * tyi_;
2245 const unsigned int tx_ty = txi_ * tyi;
2246 const unsigned int tx_ty_ = txi_ * tyi_;
2247
2248 for (unsigned int n = FrameBlender::SourceOffset<tAlphaAtFront>::data(); n < tChannels + FrameBlender::SourceOffset<tAlphaAtFront>::data() - 1u; ++n)
2249 {
2250 result[n] = (topLeft[n] * tx_ty_ + topLeft[rightOffset + n] * txty_
2251 + topLeft[bottomOffset + n] * tx_ty + topLeft[bottomOffset + rightOffset + n] * txty + 8192u) >> 14u;
2252 }
2253
2254 const uint8_t alphaTopLeft = (left >= 0 && top >= 0) ? topLeft[FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] : FrameBlender::fullTransparent8Bit<tTransparentIs0xFF>();
2255 const uint8_t alphaTopRight = (left + 1u < width && top >= 0) ? topLeft[rightOffset + FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] : FrameBlender::fullTransparent8Bit<tTransparentIs0xFF>();
2256 const uint8_t alphaBottomLeft = (left >= 0 && top + 1u < height) ? topLeft[bottomOffset + FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] : FrameBlender::fullTransparent8Bit<tTransparentIs0xFF>();
2257 const uint8_t alphaBottomRight = (left + 1u < width && top + 1u < height) ? topLeft[bottomOffset + rightOffset + FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] : FrameBlender::fullTransparent8Bit<tTransparentIs0xFF>();
2258
2259 result[FrameBlender::SourceOffset<tAlphaAtFront>::template alpha<tChannels>()] = (alphaTopLeft * tx_ty_ + alphaTopRight * txty_ + alphaBottomLeft * tx_ty + alphaBottomRight * txty + 8192u) >> 14u;
2260 }
2261}
2262
2263template <unsigned int tChannels>
2264void FrameInterpolatorBilinear::affine8BitPerChannelSubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberOutputRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
2265{
2266 static_assert(tChannels >= 1u, "Invalid channel number!");
2267
2268 ocean_assert(source != nullptr && target != nullptr);
2269 ocean_assert(sourceWidth > 0u && sourceHeight > 0u);
2270 ocean_assert_and_suppress_unused(targetWidth > 0u && targetHeight > 0u, targetHeight);
2271 ocean_assert(source_A_target);
2272 ocean_assert(!source_A_target->isNull() && Numeric::isEqualEps((*source_A_target)[2]) && Numeric::isEqualEps((*source_A_target)[5]));
2273
2274 ocean_assert(firstTargetRow + numberOutputRows <= targetHeight);
2275
2276 const unsigned int targetStrideElements = tChannels * targetWidth + targetPaddingElements;
2277
2278 const Scalar scalarSourceWidth_1 = Scalar(sourceWidth - 1u);
2279 const Scalar scalarSourceHeight_1 = Scalar(sourceHeight - 1u);
2280
2281 using PixelType = typename DataType<uint8_t, tChannels>::Type;
2282
2283 uint8_t zeroColor[tChannels] = {uint8_t(0)};
2284 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
2285
2286 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberOutputRows; ++y)
2287 {
2288 PixelType* targetRow = (PixelType*)(target + y * targetStrideElements);
2289
2290 /*
2291 * We can slightly optimize the 3x3 matrix multiplication:
2292 *
2293 * | X0 Y0 Z0 | | x |
2294 * | X1 Y1 Z1 | * | y |
2295 * | 0 0 1 | | 1 |
2296 *
2297 * | xx | | X0 * x | | Y0 * y + Z0 |
2298 * | yy | = | X1 * x | + | Y1 * y + Z1 |
2299 *
2300 * As y is constant within the inner loop, the two terms on the right side in the above equations can be pre-calculated:
2301 *
2302 * C0 = Y0 * y + Z0
2303 * C1 = Y1 * y + Z1
2304 *
2305 * So the computation becomes:
2306 *
2307 * | x' | | X0 * x | | C0 |
2308 * | y' | = | X1 * x | + | C1 |
2309 */
2310
2311 const Vector2 X(source_A_target->data() + 0);
2312 const Vector2 c(Vector2(source_A_target->data() + 3) * Scalar(y) + Vector2(source_A_target->data() + 6));
2313
2314 for (unsigned int x = 0u; x < targetWidth; ++x)
2315 {
2316 const Vector2 sourcePosition = X * Scalar(x) + c;
2317
2318#ifdef OCEAN_DEBUG
2319 const Scalar debugSourceX = (*source_A_target)[0] * Scalar(x) + (*source_A_target)[3] * Scalar(y) + (*source_A_target)[6];
2320 const Scalar debugSourceY = (*source_A_target)[1] * Scalar(x) + (*source_A_target)[4] * Scalar(y) + (*source_A_target)[7];
2321 ocean_assert(sourcePosition.isEqual(Vector2(debugSourceX, debugSourceY), Scalar(0.01)));
2322#endif
2323
2324 if (sourcePosition.x() < Scalar(0) || sourcePosition.x() > scalarSourceWidth_1 || sourcePosition.y() < Scalar(0) || sourcePosition.y() > scalarSourceHeight_1)
2325 {
2326 *targetRow = *bColor;
2327 }
2328 else
2329 {
2330 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(source, sourceWidth, sourceHeight, sourcePaddingElements, sourcePosition, (uint8_t*)(targetRow));
2331 }
2332
2333 targetRow++;
2334 }
2335 }
2336}
2337
2338template <unsigned int tChannels>
2339void FrameInterpolatorBilinear::homography8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
2340{
2341 static_assert(tChannels >= 1u, "Invalid channel number!");
2342
2343 ocean_assert(input != nullptr && output != nullptr);
2344 ocean_assert(inputWidth > 0u && inputHeight > 0u);
2345 ocean_assert(outputWidth > 0u && outputHeight > 0u);
2346 ocean_assert(input_H_output != nullptr);
2347
2348 ocean_assert_and_suppress_unused(firstOutputRow + numberOutputRows <= outputHeight, outputHeight);
2349
2350 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
2351
2352 const Scalar scalarInputWidth_1 = Scalar(inputWidth - 1u);
2353 const Scalar scalarInputHeight_1 = Scalar(inputHeight - 1u);
2354
2355 using PixelType = typename DataType<uint8_t, tChannels>::Type;
2356
2357 uint8_t zeroColor[tChannels] = {uint8_t(0)};
2358 const PixelType bColor = borderColor ? *(PixelType*)borderColor : *(PixelType*)zeroColor;
2359
2360 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
2361 {
2362 /*
2363 * We can slightly optimize the 3x3 matrix multiplication:
2364 *
2365 * | X0 Y0 Z0 | | x |
2366 * | X1 Y1 Z1 | * | y |
2367 * | X2 Y2 Z2 | | 1 |
2368 *
2369 * | xx | | X0 * x | | Y0 * y + Z0 |
2370 * | yy | = | X1 * x | + | Y1 * y + Z1 |
2371 * | zz | | X2 * x | | Y2 * y + Z2 |
2372 *
2373 * | xx | | X0 * x | | C0 |
2374 * | yy | = | X1 * x | + | C1 |
2375 * | zz | | X2 * x | | C2 |
2376 *
2377 * As y is constant within the inner loop, we can pre-calculate the following terms:
2378 *
2379 * | x' | | (X0 * x + C0) / (X2 * x + C2) |
2380 * | y' | = | (X1 * x + C1) / (X2 * x + C2) |
2381 */
2382
2383 const Vector2 X(input_H_output->data() + 0);
2384 const Vector2 c(Vector2(input_H_output->data() + 3) * Scalar(y) + Vector2(input_H_output->data() + 6));
2385
2386 const Scalar X2 = (*input_H_output)(2, 0);
2387 const Scalar constValue2 = (*input_H_output)(2, 1) * Scalar(y) + (*input_H_output)(2, 2);
2388
2389 PixelType* outputRowPixel = (PixelType*)(output + y * outputStrideElements);
2390
2391 for (unsigned int x = 0u; x < outputWidth; ++x)
2392 {
2393 ocean_assert_accuracy(Numeric::isNotEqualEps((X2 * Scalar(x) + constValue2)));
2394 const Vector2 inputPosition((X * Scalar(x) + c) / (X2 * Scalar(x) + constValue2));
2395
2396#ifdef OCEAN_DEBUG
2397 const Vector2 debugInputPosition(*input_H_output * Vector2(Scalar(x), Scalar(y)));
2398 ocean_assert(inputPosition.isEqual(debugInputPosition, Scalar(0.01)));
2399#endif
2400
2401 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
2402 {
2403 *outputRowPixel = bColor;
2404 }
2405 else
2406 {
2407 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (uint8_t*)(outputRowPixel));
2408 }
2409
2410 ++outputRowPixel;
2411 }
2412 }
2413}
2414
2415template <typename T, unsigned int tChannels>
2416void FrameInterpolatorBilinear::homographySubset(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const T* borderColor, T* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
2417{
2418 static_assert(tChannels >= 1u, "Invalid channel number!");
2419
2420 ocean_assert(input != nullptr && output != nullptr);
2421 ocean_assert(inputWidth > 0u && inputHeight > 0u);
2422 ocean_assert_and_suppress_unused(outputWidth > 0u && outputHeight > 0u, outputHeight);
2423 ocean_assert(input_H_output != nullptr);
2424
2425 ocean_assert(firstOutputRow + numberOutputRows <= outputHeight);
2426
2427 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
2428
2429 const Scalar scalarInputWidth1 = Scalar(inputWidth - 1u);
2430 const Scalar scalarInputHeight1 = Scalar(inputHeight - 1u);
2431
2432 // we need to find a best matching floating point data type for the intermediate interpolation results
2433 using TIntermediate = typename FloatTyper<T>::Type;
2434
2435 using PixelType = typename DataType<T, tChannels>::Type;
2436
2437 constexpr T zeroColor[tChannels] = {T(0)};
2438 const PixelType* const bColor = borderColor ? (PixelType*)(borderColor) : (PixelType*)(zeroColor);
2439
2440 constexpr TIntermediate bias = TIntermediate(0);
2441
2442 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
2443 {
2444 /*
2445 * We can slightly optimize the 3x3 matrix multiplication:
2446 *
2447 * | X0 Y0 Z0 | | x |
2448 * | X1 Y1 Z1 | * | y |
2449 * | X2 Y2 Z2 | | 1 |
2450 *
2451 * | xx | | X0 * x | | Y0 * y + Z0 |
2452 * | yy | = | X1 * x | + | Y1 * y + Z1 |
2453 * | zz | | X2 * x | | Y2 * y + Z2 |
2454 *
2455 * | xx | | X0 * x | | C0 |
2456 * | yy | = | X1 * x | + | C1 |
2457 * | zz | | X2 * x | | C3 |
2458 *
2459 * As y is constant within the inner loop, we can pre-calculate the following terms:
2460 *
2461 * | x' | | (X0 * x + C0) / (X2 * x + C2) |
2462 * | y' | = | (X1 * x + C1) / (X2 * x + C2) |
2463 */
2464
2465 const Vector2 X(input_H_output->data() + 0);
2466 const Vector2 c(Vector2(input_H_output->data() + 3) * Scalar(y) + Vector2(input_H_output->data() + 6));
2467
2468 const Scalar X2 = (*input_H_output)(2, 0);
2469 const Scalar constValue2 = (*input_H_output)(2, 1) * Scalar(y) + (*input_H_output)(2, 2);
2470
2471 PixelType* outputRowPixel = (PixelType*)(output + y * outputStrideElements);
2472
2473 for (unsigned int x = 0u; x < outputWidth; ++x)
2474 {
2475 ocean_assert_accuracy(Numeric::isNotEqualEps((X2 * Scalar(x) + constValue2)));
2476 const Vector2 inputPosition((X * Scalar(x) + c) / (X2 * Scalar(x) + constValue2));
2477
2478#ifdef OCEAN_DEBUG
2479 const Vector2 debugInputPosition(*input_H_output * Vector2(Scalar(x), Scalar(y)));
2480 ocean_assert((std::is_same<float, Scalar>::value) || inputPosition.isEqual(debugInputPosition, Scalar(0.01)));
2481#endif
2482
2483 if (inputPosition.x() >= Scalar(0) && inputPosition.x() <= scalarInputWidth1 && inputPosition.y() >= Scalar(0) && inputPosition.y() <= scalarInputHeight1)
2484 {
2485 interpolatePixel<T, T, tChannels, CV::PC_TOP_LEFT, Scalar, TIntermediate>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (T*)(outputRowPixel), bias);
2486 }
2487 else
2488 {
2489 *outputRowPixel = *bColor;
2490 }
2491
2492 ++outputRowPixel;
2493 }
2494 }
2495}
2496
2497#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 41
2498
2499template <unsigned int tChannels>
2500inline void FrameInterpolatorBilinear::affine8BitPerChannelSSESubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
2501{
2502 static_assert(tChannels >= 1u, "Invalid channel number!");
2503
2504 ocean_assert(source && target);
2505 ocean_assert(sourceWidth > 0u && sourceHeight > 0u);
2506 ocean_assert(targetWidth >= 4u && targetHeight > 0u);
2507 ocean_assert(source_A_target);
2508 ocean_assert(!source_A_target->isNull() && Numeric::isEqualEps((*source_A_target)[2]) && Numeric::isEqualEps((*source_A_target)[5]));
2509
2510 ocean_assert_and_suppress_unused(firstTargetRow + numberTargetRows <= targetHeight, targetHeight);
2511
2512 const unsigned int sourceStrideElements = tChannels * sourceWidth + sourcePaddingElements;
2513 const unsigned int targetStrideElements = tChannels * targetWidth + targetPaddingElements;
2514
2515 using PixelType = typename DataType<uint8_t, tChannels>::Type;
2516
2517 uint8_t zeroColor[tChannels] = {uint8_t(0)};
2518 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
2519
2520 OCEAN_ALIGN_DATA(16) unsigned int validPixels[4];
2521
2522 OCEAN_ALIGN_DATA(16) unsigned int topLeftOffsets[4];
2523 OCEAN_ALIGN_DATA(16) unsigned int topRightOffsets[4];
2524 OCEAN_ALIGN_DATA(16) unsigned int bottomLeftOffsets[4];
2525 OCEAN_ALIGN_DATA(16) unsigned int bottomRightOffsets[4];
2526
2527 // we store 4 floats: [X0, X0, X0, X0], and same with X1 and X2
2528 const __m128 m128_f_X0 = _mm_set_ps1(float((*source_A_target)(0, 0)));
2529 const __m128 m128_f_X1 = _mm_set_ps1(float((*source_A_target)(1, 0)));
2530
2531 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
2532 {
2533 PixelType* targetRow = (PixelType*)(target + y * targetStrideElements);
2534
2535 /*
2536 * We can slightly optimize the 3x3 matrix multiplication:
2537 *
2538 * | X0 Y0 Z0 | | x |
2539 * | X1 Y1 Z1 | * | y |
2540 * | 0 0 1 | | 1 |
2541 *
2542 * | xx | | X0 * x | | Y0 * y + Z0 |
2543 * | yy | = | X1 * x | + | Y1 * y + Z1 |
2544 *
2545 * As y is constant within the inner loop, the two terms on the right side in the above equations can be pre-calculated:
2546 *
2547 * C0 = Y0 * y + Z0
2548 * C1 = Y1 * y + Z1
2549 *
2550 * So the computation becomes:
2551 *
2552 * | x' | | X0 * x | | C0 |
2553 * | y' | = | X1 * x | + | C1 |
2554 */
2555
2556 // we store 4 floats: [C0, C0, C0, C0], and same with C1 and C2
2557 const __m128 m128_f_C0 = _mm_set_ps1(float((*source_A_target)(0, 1) * Scalar(y) + (*source_A_target)(0, 2)));
2558 const __m128 m128_f_C1 = _mm_set_ps1(float((*source_A_target)(1, 1) * Scalar(y) + (*source_A_target)(1, 2)));
2559
2560 // we store 4 floats: [0.0f, 0.0f, 0.0f, 0.0f]
2561 const __m128 m128_f_zero = _mm_setzero_ps();
2562
2563 // we store 4 integers: [tChannels, tChannels, tChannels, tChannels]
2564 const __m128i m128_i_channels = _mm_set1_epi32(tChannels);
2565
2566 // we store 4 integers: [sourceStrideElements, sourceStrideElements, sourceStrideElements, sourceStrideElements]
2567 const __m128i m128_i_sourceStrideElements = _mm_set1_epi32(sourceStrideElements);
2568
2569 // we store 4 integers: [inputWidth - 1, inputWidth - 1, inputWidth - 1, inputWidth - 1], and same with inputHeight
2570 const __m128i m128_i_sourceWidth_1 = _mm_set1_epi32(int(sourceWidth) - 1);
2571 const __m128i m128_i_sourceHeight_1 = _mm_set1_epi32(int(sourceHeight) - 1);
2572
2573 // we store 4 floats: [inputWidth - 1, inputWidth - 1, inputWidth - 1, inputWidth - 1], and same with inputHeight
2574 const __m128 m128_f_sourceWidth_1 = _mm_set_ps1(float(sourceWidth - 1u));
2575 const __m128 m128_f_sourceHeight_1 = _mm_set_ps1(float(sourceHeight - 1u));
2576
2577 for (unsigned int x = 0u; x < targetWidth; x += 4u)
2578 {
2579 if (x + 4u > targetWidth)
2580 {
2581 // the last iteration will not fit into the output frame,
2582 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
2583
2584 ocean_assert(x >= 4u && targetWidth > 4u);
2585 const unsigned int newX = targetWidth - 4u;
2586
2587 ocean_assert(x > newX);
2588 targetRow -= x - newX;
2589
2590 x = newX;
2591
2592 // the for loop will stop after this iteration
2593 ocean_assert(!(x + 4u < targetWidth));
2594 }
2595
2596
2597 // we need four successive x coordinate floats:
2598 // [x + 3.0f, x + 2.0f, x + 1.0f; x + 0.0f]
2599 const __m128 m128_f_x_0123 = _mm_set_ps(float(x + 3u), float(x + 2u), float(x + 1u), float(x + 0u));
2600
2601 // we calculate xx and yy for [x + 3.0f, x + 2.0f, x + 1.0f, x + 0.0f]
2602 const __m128 m128_f_sourceX = _mm_add_ps(_mm_mul_ps(m128_f_X0, m128_f_x_0123), m128_f_C0);
2603 const __m128 m128_f_sourceY = _mm_add_ps(_mm_mul_ps(m128_f_X1, m128_f_x_0123), m128_f_C1);
2604
2605 // now we check whether we are inside the input frame
2606 const __m128 m128_f_validPixelX = _mm_and_ps(_mm_cmple_ps(m128_f_sourceX, m128_f_sourceWidth_1), _mm_cmpge_ps(m128_f_sourceX, m128_f_zero)); // inputPosition.x() <= (inputWidth - 1) && inputPosition.x() >= 0 ? 0xFFFFFFFF : 0x00000000
2607 const __m128 m128_f_validPixelY = _mm_and_ps(_mm_cmple_ps(m128_f_sourceY, m128_f_sourceHeight_1), _mm_cmpge_ps(m128_f_sourceY, m128_f_zero)); // inputPosition.y() <= (inputHeight - 1) && inputPosition.y() >= 0 ? 0xFFFFFFFF : 0x00000000
2608
2609 const __m128i m128_i_validPixel = _mm_castps_si128(_mm_and_ps(m128_f_validPixelX, m128_f_validPixelY)); // is_inside_input_frame(inputPosition) ? 0xFFFFFFFF : 0x00000000
2610
2611 // we can stop here if all pixels are invalid
2612 if (_mm_test_all_zeros(m128_i_validPixel, _mm_set1_epi32(0xFFFFFFFF)))
2613 {
2614#ifdef OCEAN_DEBUG
2615 OCEAN_ALIGN_DATA(16) unsigned int debugValidPixels[4];
2616 _mm_store_si128((__m128i*)debugValidPixels, m128_i_validPixel);
2617 ocean_assert(!(debugValidPixels[0] || debugValidPixels[1] || debugValidPixels[2] || debugValidPixels[3]));
2618#endif
2619
2620 targetRow[0] = *bColor;
2621 targetRow[1] = *bColor;
2622 targetRow[2] = *bColor;
2623 targetRow[3] = *bColor;
2624
2625 targetRow += 4;
2626
2627 continue;
2628 }
2629
2630 // we store the result
2631 _mm_store_si128((__m128i*)validPixels, m128_i_validPixel);
2632 ocean_assert(validPixels[0] || validPixels[1] || validPixels[2] || validPixels[3]);
2633
2634
2635 // now we determine the left, top, right and bottom pixel used for the interpolation
2636 const __m128 m128_f_tx_floor = _mm_floor_ps(m128_f_sourceX);
2637 const __m128 m128_f_ty_floor = _mm_floor_ps(m128_f_sourceY);
2638
2639 // left = floor(x); top = floor(y)
2640 const __m128i m128_i_left = _mm_cvtps_epi32(m128_f_tx_floor);
2641 const __m128i m128_i_top = _mm_cvtps_epi32(m128_f_ty_floor);
2642
2643 // right = min(left + 1, width - 1); bottom = min(top + 1; height - 1)
2644 const __m128i m128_i_right = _mm_min_epu32(_mm_add_epi32(m128_i_left, _mm_set1_epi32(1)), m128_i_sourceWidth_1);
2645 const __m128i m128_i_bottom = _mm_min_epu32(_mm_add_epi32(m128_i_top, _mm_set1_epi32(1)), m128_i_sourceHeight_1);
2646
2647 // offset = (y * sourceStrideElements + tChannels * x)
2648 const __m128i m128_i_topLeftOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_top, m128_i_sourceStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_left)); // topleftOffset = (top * sourceStrideElements + tChannels * left)
2649 const __m128i m128_i_topRightOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_top, m128_i_sourceStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_right)); // toprightOffset = (top * sourceStrideElements + tChannels * right)
2650 const __m128i m128_i_bottomLeftOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_bottom, m128_i_sourceStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_left)); // ...
2651 const __m128i m128_i_bottomRightOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_bottom, m128_i_sourceStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_right));
2652
2653 // we store the offsets
2654 _mm_store_si128((__m128i*)topLeftOffsets, m128_i_topLeftOffset);
2655 _mm_store_si128((__m128i*)topRightOffsets, m128_i_topRightOffset);
2656 _mm_store_si128((__m128i*)bottomLeftOffsets, m128_i_bottomLeftOffset);
2657 _mm_store_si128((__m128i*)bottomRightOffsets, m128_i_bottomRightOffset);
2658
2659
2660 // now we need to determine the interpolation factors tx, tx_ and ty, ty_: (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
2661
2662 // we determine the fractional portions of the x' and y':
2663 // e.g., [43.1231, -12.5543, -34.123, 99.2]
2664 // [ 0.1231, 0.4457, 0.877, 0.2] // note the result for negative value - but we will not process negative values anyway due to 'validPixel'
2665 __m128 m128_f_tx = _mm_sub_ps(m128_f_sourceX, m128_f_tx_floor);
2666 __m128 m128_f_ty = _mm_sub_ps(m128_f_sourceY, m128_f_ty_floor);
2667
2668 // we use integer interpolation [0.0, 1.0] -> [0, 128]
2669 m128_f_tx = _mm_mul_ps(m128_f_tx, _mm_set_ps1(128.0f));
2670 m128_f_ty = _mm_mul_ps(m128_f_ty, _mm_set_ps1(128.0f));
2671
2672 m128_f_tx = _mm_round_ps(m128_f_tx, _MM_FROUND_TO_NEAREST_INT);
2673 m128_f_ty = _mm_round_ps(m128_f_ty, _MM_FROUND_TO_NEAREST_INT);
2674
2675 const __m128i m128_i_tx = _mm_cvtps_epi32(m128_f_tx);
2676 const __m128i m128_i_ty = _mm_cvtps_epi32(m128_f_ty);
2677
2678 interpolate4Pixels8BitPerChannelSSE<tChannels>(source, topLeftOffsets, topRightOffsets, bottomLeftOffsets, bottomRightOffsets, validPixels, *bColor, m128_i_tx, m128_i_ty, targetRow);
2679 targetRow += 4;
2680 }
2681 }
2682}
2683
2684template <unsigned int tChannels>
2685inline void FrameInterpolatorBilinear::homography8BitPerChannelSSESubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
2686{
2687 static_assert(tChannels >= 1u, "Invalid channel number!");
2688
2689 ocean_assert(input != nullptr && output != nullptr);
2690 ocean_assert(inputWidth > 0u && inputHeight > 0u);
2691 ocean_assert(outputWidth >= 4u && outputHeight > 0u);
2692 ocean_assert(input_H_output != nullptr);
2693
2694 ocean_assert_and_suppress_unused(firstOutputRow + numberOutputRows <= outputHeight, outputHeight);
2695
2696 const unsigned int inputStrideElements = inputWidth * tChannels + inputPaddingElements;
2697 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
2698
2699 using PixelType = typename DataType<uint8_t, tChannels>::Type;
2700
2701 uint8_t zeroColor[tChannels] = {uint8_t(0)};
2702 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
2703
2704 OCEAN_ALIGN_DATA(16) unsigned int validPixels[4];
2705
2706 OCEAN_ALIGN_DATA(16) unsigned int topLeftOffsets[4];
2707 OCEAN_ALIGN_DATA(16) unsigned int topRightOffsets[4];
2708 OCEAN_ALIGN_DATA(16) unsigned int bottomLeftOffsets[4];
2709 OCEAN_ALIGN_DATA(16) unsigned int bottomRightOffsets[4];
2710
2711 // we store 4 floats: [X0, X0, X0, X0], and same with X1 and X2
2712 const __m128 m128_f_X0 = _mm_set_ps1(float((*input_H_output)(0, 0)));
2713 const __m128 m128_f_X1 = _mm_set_ps1(float((*input_H_output)(1, 0)));
2714 const __m128 m128_f_X2 = _mm_set_ps1(float((*input_H_output)(2, 0)));
2715
2716 // we store 4 floats: [0.0f, 0.0f, 0.0f, 0.0f]
2717 const __m128 m128_f_zero = _mm_setzero_ps();
2718
2719 // we store 4 integers: [tChannels, tChannels, tChannels, tChannels]
2720 const __m128i m128_i_channels = _mm_set1_epi32(tChannels);
2721
2722 // we store 4 integers: [inputStrideElements, inputStrideElements, inputStrideElements, inputStrideElements]
2723 const __m128i m128_i_inputStrideElements = _mm_set1_epi32(inputStrideElements);
2724
2725 // we store 4 integers: [inputWidth - 1, inputWidth - 1, inputWidth -1, inputWidth -1], and same with inputHeight
2726 const __m128i m128_i_inputWidth_1 = _mm_set1_epi32(int(inputWidth) - 1);
2727 const __m128i m128_i_inputHeight_1 = _mm_set1_epi32(int(inputHeight) - 1);
2728
2729 // we store 4 floats: [inputWidth - 1, inputWidth - 1, inputWidth - 1, inputWidth - 1], and same with inputHeight
2730 const __m128 m128_f_inputWidth_1 = _mm_set_ps1(float(inputWidth - 1u));
2731 const __m128 m128_f_inputHeight_1 = _mm_set_ps1(float(inputHeight - 1u));
2732
2733 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
2734 {
2735 PixelType* outputPixelData = (PixelType*)(output + y * outputStrideElements);
2736
2737 /*
2738 * We can slightly optimize the 3x3 matrix multiplication:
2739 *
2740 * | X0 Y0 Z0 | | x |
2741 * | X1 Y1 Z1 | * | y |
2742 * | X2 Y2 Z2 | | 1 |
2743 *
2744 * | xx | | X0 * x | | Y0 * y + Z0 |
2745 * | yy | = | X1 * x | + | Y1 * y + Z1 |
2746 * | zz | | X2 * x | | Y2 * y + Z2 |
2747 *
2748 * | xx | | X0 * x | | C0 |
2749 * | yy | = | X1 * x | + | C1 |
2750 * | zz | | X2 * x | | C2 |
2751 *
2752 * As y is constant within the inner loop, we can pre-calculate the following terms:
2753 *
2754 * | x' | | (X0 * x + C0) / (X2 * x + C2) |
2755 * | y' | = | (X1 * x + C1) / (X2 * x + C2) |
2756 */
2757
2758 // we store 4 floats: [C0, C0, C0, C0], and same with C1 and C2
2759 const __m128 m128_f_C0 = _mm_set_ps1(float((*input_H_output)(0, 1) * Scalar(y) + (*input_H_output)(0, 2)));
2760 const __m128 m128_f_C1 = _mm_set_ps1(float((*input_H_output)(1, 1) * Scalar(y) + (*input_H_output)(1, 2)));
2761 const __m128 m128_f_C2 = _mm_set_ps1(float((*input_H_output)(2, 1) * Scalar(y) + (*input_H_output)(2, 2)));
2762
2763 for (unsigned int x = 0u; x < outputWidth; x += 4u)
2764 {
2765 if (x + 4u > outputWidth)
2766 {
2767 // the last iteration will not fit into the output frame,
2768 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
2769
2770 ocean_assert(x >= 4u && outputWidth > 4u);
2771 const unsigned int newX = outputWidth - 4u;
2772
2773 ocean_assert(x > newX);
2774 outputPixelData -= x - newX;
2775
2776 x = newX;
2777
2778 // the for loop will stop after this iteration
2779 ocean_assert(!(x + 4u < outputWidth));
2780 }
2781
2782
2783 // we need four successive x coordinate floats:
2784 // [x + 3.0f, x + 2.0f, x + 1.0f; x + 0.0f]
2785 const __m128 m128_f_x_0123 = _mm_set_ps(float(x + 3u), float(x + 2u), float(x + 1u), float(x + 0u));
2786
2787 // we calculate xx and yy and zz for [x + 3.0f, x + 2.0f, x + 1.0f, x + 0.0f]
2788 const __m128 m128_f_xx = _mm_add_ps(_mm_mul_ps(m128_f_X0, m128_f_x_0123), m128_f_C0);
2789 const __m128 m128_f_yy = _mm_add_ps(_mm_mul_ps(m128_f_X1, m128_f_x_0123), m128_f_C1);
2790 const __m128 m128_f_zz = _mm_add_ps(_mm_mul_ps(m128_f_X2, m128_f_x_0123), m128_f_C2);
2791
2792#ifdef USE_APPROXIMATED_INVERSE_OF_ZZ // (not defined by default)
2793
2794 // we calculate the (approximated) inverse of zz,
2795 // the overall performance will be approx. 5% better while the accuracy will be slightly worse:
2796 // [1/zz3, 1/zz2, 1/zz1, 1/zz0]
2797 const __m128 inv_zz_128 = _mm_rcp_ps(m128_f_zz);
2798
2799 // we determine the normalized coordinates x' and y' for for x + 3.0f, x + 2.0f, ...)
2800 const __m128 m128_f_inputX = _mm_mul_ps(m128_f_xx, inv_zz_128);
2801 const __m128 m128_f_inputY = _mm_mul_ps(m128_f_yy, inv_zz_128);
2802
2803#else
2804
2805 // we determine the normalized coordinates x' and y' for for x + 3.0f, x + 2.0f, ...)
2806 const __m128 m128_f_inputX = _mm_div_ps(m128_f_xx, m128_f_zz);
2807 const __m128 m128_f_inputY = _mm_div_ps(m128_f_yy, m128_f_zz);
2808
2809#endif // USE_APPROXIMATED_INVERSE_OF_ZZ
2810
2811
2812 // now we check whether we are inside the input frame
2813 const __m128 m128_f_validPixelX = _mm_and_ps(_mm_cmple_ps (m128_f_inputX, m128_f_inputWidth_1), _mm_cmpge_ps(m128_f_inputX, m128_f_zero)); // inputPosition.x() <= (inputWidth-1) && inputPosition.x() >= 0 ? 0xFFFFFF : 0x000000
2814 const __m128 m128_f_validPixelY = _mm_and_ps(_mm_cmple_ps (m128_f_inputY, m128_f_inputHeight_1), _mm_cmpge_ps(m128_f_inputY, m128_f_zero)); // inputPosition.y() <= (inputHeight-1) && inputPosition.y() >= 0 ? 0xFFFFFF : 0x000000
2815
2816 const __m128i m128_i_validPixel = _mm_castps_si128(_mm_and_ps(m128_f_validPixelX, m128_f_validPixelY)); // is_inside_input_frame(inputPosition) ? 0xFFFFFF : 0x000000
2817
2818 // we can stop here if all pixels are invalid
2819 if (_mm_test_all_zeros(m128_i_validPixel, _mm_set1_epi32(0xFFFFFFFF)))
2820 {
2821#ifdef OCEAN_DEBUG
2822 OCEAN_ALIGN_DATA(16) unsigned int debugValidPixels[4];
2823 _mm_store_si128((__m128i*)debugValidPixels, m128_i_validPixel);
2824 ocean_assert(!(debugValidPixels[0] || debugValidPixels[1] || debugValidPixels[2] || debugValidPixels[3]));
2825#endif
2826
2827 outputPixelData[0] = *bColor;
2828 outputPixelData[1] = *bColor;
2829 outputPixelData[2] = *bColor;
2830 outputPixelData[3] = *bColor;
2831
2832 outputPixelData += 4;
2833
2834 continue;
2835 }
2836
2837 // we store the result
2838 _mm_store_si128((__m128i*)validPixels, m128_i_validPixel);
2839 ocean_assert(validPixels[0] || validPixels[1] || validPixels[2] || validPixels[3]);
2840
2841
2842 // now we determine the left, top, right and bottom pixel used for the interpolation
2843 const __m128 m128_f_tx_floor = _mm_floor_ps(m128_f_inputX);
2844 const __m128 m128_f_ty_floor = _mm_floor_ps(m128_f_inputY);
2845
2846 // left = floor(x); top = floor(y)
2847 const __m128i m128_i_left = _mm_cvtps_epi32(m128_f_tx_floor);
2848 const __m128i m128_i_top = _mm_cvtps_epi32(m128_f_ty_floor);
2849
2850 // right = min(left + 1, width - 1); bottom = min(top + 1; height - 1)
2851 const __m128i m128_i_right = _mm_min_epu32(_mm_add_epi32(m128_i_left, _mm_set1_epi32(1)), m128_i_inputWidth_1);
2852 const __m128i m128_i_bottom = _mm_min_epu32(_mm_add_epi32(m128_i_top, _mm_set1_epi32(1)), m128_i_inputHeight_1);
2853
2854 // offset = (y * inputStrideElements + tChannels * x)
2855 const __m128i m128_i_topLeftOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_top, m128_i_inputStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_left)); // topleftOffset = (top * inputStrideElements + tChannels * left)
2856 const __m128i m128_i_topRightOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_top, m128_i_inputStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_right)); // toprightOffset = (top * inputStrideElements + tChannels * right)
2857 const __m128i m128_i_bottomLeftOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_bottom, m128_i_inputStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_left)); // ...
2858 const __m128i m128_i_bottomRightOffset = _mm_add_epi32(_mm_mullo_epi32(m128_i_bottom, m128_i_inputStrideElements), _mm_mullo_epi32(m128_i_channels, m128_i_right));
2859
2860 // we store the offsets
2861 _mm_store_si128((__m128i*)topLeftOffsets, m128_i_topLeftOffset);
2862 _mm_store_si128((__m128i*)topRightOffsets, m128_i_topRightOffset);
2863 _mm_store_si128((__m128i*)bottomLeftOffsets, m128_i_bottomLeftOffset);
2864 _mm_store_si128((__m128i*)bottomRightOffsets, m128_i_bottomRightOffset);
2865
2866
2867 // now we need to determine the interpolation factors tx, tx_ and ty, ty_: (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
2868
2869 // we determine the fractional portions of the x' and y':
2870 // e.g., [43.1231, -12.5543, -34.123, 99.2]
2871 // [ 0.1231, 0.4457, 0.877, 0.2] // note the result for negative value - but we will not process negative values anyway due to 'validPixel'
2872 __m128 m128_f_tx = _mm_sub_ps(m128_f_inputX, m128_f_tx_floor);
2873 __m128 m128_f_ty = _mm_sub_ps(m128_f_inputY, m128_f_ty_floor);
2874
2875 // we use integer interpolation [0.0, 1.0] -> [0, 128]
2876 m128_f_tx = _mm_mul_ps(m128_f_tx, _mm_set_ps1(128.0f));
2877 m128_f_ty = _mm_mul_ps(m128_f_ty, _mm_set_ps1(128.0f));
2878
2879 m128_f_tx = _mm_round_ps(m128_f_tx, _MM_FROUND_TO_NEAREST_INT);
2880 m128_f_ty = _mm_round_ps(m128_f_ty, _MM_FROUND_TO_NEAREST_INT);
2881
2882 const __m128i m128_i_tx = _mm_cvtps_epi32(m128_f_tx);
2883 const __m128i m128_i_ty = _mm_cvtps_epi32(m128_f_ty);
2884
2885 interpolate4Pixels8BitPerChannelSSE<tChannels>(input, topLeftOffsets, topRightOffsets, bottomLeftOffsets, bottomRightOffsets, validPixels, *bColor, m128_i_tx, m128_i_ty, outputPixelData);
2886 outputPixelData += 4;
2887 }
2888 }
2889}
2890
2891template <>
2892OCEAN_FORCE_INLINE __m128i FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE<3u>(const __m128i& sourcesTopLeft, const __m128i& sourcesTopRight, const __m128i& sourcesBottomLeft, const __m128i& sourcesBottomRight, const __m128i& factorsTopLeft, const __m128i& factorsTopRight, const __m128i& factorsBottomLeft, const __m128i& factorsBottomRight)
2893{
2894 // sourcesTopLeft stores the three color values of 4 (independent) pixels (the upper left pixels):
2895 // FEDC BA98 7654 3210
2896 // ---- VUYV UYVU YVUY
2897 // sourcesTopRight, sourcesBottomLeft, sourcesBottomRight have the same pattern
2898
2899 // factorsTopLeft stores the 32 bit interpolation values for 4 pixels:
2900 // FEDC BA98 7654 3210
2901 // 3 2 1 0 (32 bit interpolation values, fitting into 16 bit)
2902
2903
2904 // we will simply extract each channel from the source pixels,
2905 // each extracted channel will be multiplied by the corresponding interpolation factor
2906 // and all interpolation results will be accumulated afterwards
2907
2908 // FEDC BA98 7654 3210
2909 const __m128i mask32_Channel0 = SSE::set128i(0xFFFFFF09FFFFFF06ull, 0xFFFFFF03FFFFFF00ull); // ---9 ---6 ---3 ---0
2910 const __m128i mask32_Channel1 = SSE::set128i(0xFFFFFF0AFFFFFF07ull, 0xFFFFFF04FFFFFF01ull); // ---A ---7 ---4 ---1
2911 const __m128i mask32_Channel2 = SSE::set128i(0xFFFFFF0BFFFFFF08ull, 0xFFFFFF05FFFFFF02ull); // ---B ---8 ---5 ---2
2912
2913
2914 // we extract the first channel from the top left pixel values and multiply the channel with the interpolation factors
2915 // FEDC BA98 7654 3210
2916 // ---9 ---6 ---3 ---0
2917 // *
2918 // FTL3 FTL2 FTL1 FTL0
2919 __m128i multiplication_channel0 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel0));
2920
2921 // we the same multiplication for the second channel
2922 __m128i multiplication_channel1 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel1));
2923
2924 // and third channel
2925 __m128i multiplication_channel2 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel2));
2926
2927
2928 // now we repeat the process for the top right pixel values
2929 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel0)));
2930 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel1)));
2931 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel2)));
2932
2933
2934 // and for the bottom left pixel values
2935 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel0)));
2936 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel1)));
2937 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel2)));
2938
2939
2940 // and for the bottom right pixel values
2941 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel0)));
2942 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel1)));
2943 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel2)));
2944
2945
2946 const __m128i m128_i_8192 = _mm_set1_epi32(8192);
2947
2948 // we add 8192 for rounding and shift the result by 14 bits (division by 128*128) // TODO if using 256 we should be able to avoid the shifting by 14 bits (simply by using shuffle operations)
2949
2950 // in addition to rounding and shifting, we need to move the interpolation results to the correct channel:
2951 // target data: ---9 ---6 ---3 ---0
2952 // shufflet target: ---- --9- -6-- 3--0
2953 // mask location: ---C ---8 ---4 ---0
2954 // mask: ---- --C- -8-- 4--0
2955 __m128i interpolation_channel0 = _mm_shuffle_epi8(_mm_srli_epi32(_mm_add_epi32(multiplication_channel0, m128_i_8192), 14), SSE::set128i(0xFFFFFFFFFFFF0CFFull, 0xFF08FFFF04FFFF00ull));
2956
2957 // target data: ---A ---7 ---4 ---1
2958 // shufflet target: ---- -A-- 7--4 --1-
2959 // mask location: ---C ---8 ---4 ---0
2960 // mask: ---- -C-- 8--4 --0-
2961 __m128i interpolation_channel1 = _mm_shuffle_epi8(_mm_srli_epi32(_mm_add_epi32(multiplication_channel1, m128_i_8192), 14), SSE::set128i(0xFFFFFFFFFF0CFFFFull, 0x08FFFF04FFFF00FFull));
2962
2963 // target data: ---B ---8 ---5 ---2
2964 // shufflet target: ---- B--8 --5- -2--
2965 // mask location: ---C ---8 ---4 ---0
2966 // mask: ---- C--8 --4- -0--
2967 __m128i interpolation_channel2 = _mm_shuffle_epi8(_mm_srli_epi32(_mm_add_epi32(multiplication_channel2, m128_i_8192), 14), SSE::set128i(0xFFFFFFFF0CFFFF08ull, 0xFFFF04FFFF00FFFFull));
2968
2969
2970 // finally, we simply blend all interpolation results together
2971
2972 return _mm_or_si128(_mm_or_si128(interpolation_channel0, interpolation_channel1), interpolation_channel2);
2973}
2974
2975template <>
2976OCEAN_FORCE_INLINE __m128i FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE<4u>(const __m128i& sourcesTopLeft, const __m128i& sourcesTopRight, const __m128i& sourcesBottomLeft, const __m128i& sourcesBottomRight, const __m128i& factorsTopLeft, const __m128i& factorsTopRight, const __m128i& factorsBottomLeft, const __m128i& factorsBottomRight)
2977{
2978 // sourcesTopLeft stores the four color values of 4 (independent) pixels (the upper left pixels):
2979 // FEDC BA98 7654 3210
2980 // AVUY AVUY AVUY AVUY
2981 // sourcesTopRight, sourcesBottomLeft, sourcesBottomRight have the same pattern
2982
2983 // factorsTopLeft stores the 32 bit interpolation values for 4 pixels:
2984 // FEDC BA98 7654 3210
2985 // 3 2 1 0 (32 bit interpolation values, fitting into 16 bit)
2986
2987
2988 // we will simply extract each channel from the source pixels,
2989 // each extracted channel will be multiplied by the corresponding interpolation factor
2990 // and all interpolation results will be accumulated afterwards
2991
2992 // FEDC BA98 7654 3210
2993 const __m128i mask32_Channel0 = SSE::set128i(0xA0A0A00CA0A0A008ull, 0xA0A0A004A0A0A000ull); // ---C ---8 ---4 ---0
2994 const __m128i mask32_Channel1 = SSE::set128i(0xA0A0A00DA0A0A009ull, 0xA0A0A005A0A0A001ull); // ---D ---9 ---5 ---1
2995 const __m128i mask32_Channel2 = SSE::set128i(0xA0A0A00EA0A0A00Aull, 0xA0A0A006A0A0A002ull); // ---E ---A ---6 ---2
2996 const __m128i mask32_Channel3 = SSE::set128i(0xA0A0A00FA0A0A00Bull, 0xA0A0A007A0A0A003ull); // ---F ---B ---7 ---3
2997
2998
2999 // we extract the first channel from the top left pixel values and multiply the channel with the interpolation factors
3000 // FEDC BA98 7654 3210
3001 // ---C ---8 ---4 ---0
3002 // *
3003 // FTL3 FTL2 FTL1 FTL0
3004 __m128i multiplication_channel0 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel0));
3005
3006 // we the same multiplication for the second channel
3007 __m128i multiplication_channel1 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel1));
3008
3009 // and third channel
3010 __m128i multiplication_channel2 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel2));
3011
3012 // and last channel
3013 __m128i multiplication_channel3 = _mm_mullo_epi32(factorsTopLeft, _mm_shuffle_epi8(sourcesTopLeft, mask32_Channel3));
3014
3015
3016 // now we repeat the process for the top right pixel values
3017 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel0)));
3018 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel1)));
3019 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel2)));
3020 multiplication_channel3 = _mm_add_epi32(multiplication_channel3, _mm_mullo_epi32(factorsTopRight, _mm_shuffle_epi8(sourcesTopRight, mask32_Channel3)));
3021
3022
3023 // and for the bottom left pixel values
3024 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel0)));
3025 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel1)));
3026 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel2)));
3027 multiplication_channel3 = _mm_add_epi32(multiplication_channel3, _mm_mullo_epi32(factorsBottomLeft, _mm_shuffle_epi8(sourcesBottomLeft, mask32_Channel3)));
3028
3029
3030 // and for the bottom right pixel values
3031 multiplication_channel0 = _mm_add_epi32(multiplication_channel0, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel0)));
3032 multiplication_channel1 = _mm_add_epi32(multiplication_channel1, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel1)));
3033 multiplication_channel2 = _mm_add_epi32(multiplication_channel2, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel2)));
3034 multiplication_channel3 = _mm_add_epi32(multiplication_channel3, _mm_mullo_epi32(factorsBottomRight, _mm_shuffle_epi8(sourcesBottomRight, mask32_Channel3)));
3035
3036
3037 const __m128i m128_i_8192 = _mm_set1_epi32(8192);
3038
3039 // we add 8192 for rounding and shift the result by 14 bits (division by 128*128)
3040
3041 // in addition to rounding and shifting, we need to move the interpolation results to the correct channel:
3042 // ---C ---8 ---4 ---0
3043 // ---C ---9 ---4 ---0
3044 __m128i interpolation_channel0 = _mm_srli_epi32(_mm_add_epi32(multiplication_channel0, m128_i_8192), 14);
3045
3046 // in addition to rounding and shifting, we need to move the interpolation results to the correct channel:
3047 // ---D ---9 ---5 ---1
3048 // --D- --9- --5- --1-
3049 __m128i interpolation_channel1 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(multiplication_channel1, m128_i_8192), 14), 8);
3050
3051 // ---E ---A ---6 ---2
3052 // -E-- -A-- -6-- -2--
3053 __m128i interpolation_channel2 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(multiplication_channel2, m128_i_8192), 14), 16);
3054
3055 // ---F ---B ---7 ---3
3056 // F--- B--- 7--- 3---
3057 __m128i interpolation_channel3 = _mm_slli_epi32(_mm_srli_epi32(_mm_add_epi32(multiplication_channel3, m128_i_8192), 14), 24);
3058
3059
3060 // finally, we simply blend all interpolation results together
3061
3062 return _mm_or_si128(_mm_or_si128(interpolation_channel0, interpolation_channel1), _mm_or_si128(interpolation_channel2, interpolation_channel3));
3063}
3064
3065#ifdef OCEAN_COMPILER_MSC
3066
// we see a significant performance decrease with non-VS compilers/platforms,
// so the specialized 1-channel and 3-channel versions below are only compiled for the Microsoft compiler
3069
3070template <>
3071OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE<1u>(const uint8_t* source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const DataType<uint8_t, 1u>::Type& borderColor, const __m128i& m128_factorsRight, const __m128i& m128_factorsBottom, typename DataType<uint8_t, 1u>::Type* targetPositionPixels)
3072{
3073 ocean_assert(source != nullptr);
3074 ocean_assert(targetPositionPixels != nullptr);
3075
3076 using PixelType = typename DataType<uint8_t, 1u>::Type;
3077
3078 // as we do not initialize the following intermediate data,
3079 // we hopefully will not allocate memory on the stack each time this function is called
3080 OCEAN_ALIGN_DATA(16) PixelType pixels[16];
3081
3082 // we gather the individual source pixel values from the source image,
3083 // based on the calculated pixel locations
3084 for (unsigned int i = 0u; i < 4u; ++i)
3085 {
3086 if (validPixels[i])
3087 {
3088 pixels[i * 4u + 0u] = *((PixelType*)(source + offsetsTopLeft[i]));
3089 pixels[i * 4u + 1u] = *((PixelType*)(source + offsetsTopRight[i]));
3090 pixels[i * 4u + 2u] = *((PixelType*)(source + offsetsBottomLeft[i]));
3091 pixels[i * 4u + 3u] = *((PixelType*)(source + offsetsBottomRight[i]));
3092 }
3093 else
3094 {
3095 pixels[i * 4u + 0u] = borderColor;
3096 pixels[i * 4u + 1u] = borderColor;
3097 pixels[i * 4u + 2u] = borderColor;
3098 pixels[i * 4u + 3u] = borderColor;
3099 }
3100 }
3101
3102 static_assert(sizeof(__m128i) == sizeof(pixels), "Invalid data type!");
3103
3104 const __m128i m128_pixels = _mm_load_si128((const __m128i*)pixels);
3105
3106
3107 // factorLeft = 128 - factorRight
3108 // factorTop = 128 - factorBottom
3109
3110 const __m128i m128_factorsLeft = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsRight);
3111 const __m128i m128_factorsTop = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsBottom);
3112
3113 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
3114 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
3115
3116 const __m128i m128_factorsTopLeft = _mm_mullo_epi32(m128_factorsTop, m128_factorsLeft);
3117 const __m128i m128_factorsTopRight = _mm_mullo_epi32(m128_factorsTop, m128_factorsRight);
3118 const __m128i m128_factorsBottomLeft = _mm_mullo_epi32(m128_factorsBottom, m128_factorsLeft);
3119 const __m128i m128_factorsBottomRight = _mm_mullo_epi32(m128_factorsBottom, m128_factorsRight);
3120
3121 // pixels stores the four interpolation grascale pixel values (top left, top right, bottom left, bottom right) for 4 (independent) pixels:
3122 // F E D C B A 9 8 7 6 5 4 3 2 1 0
3123 // BR BL TR TL BR BL TR TL BR BL TR TL BR BL TR TL
3124
3125 // factorsTopLeft stores the 32 bit interpolation values for 4 pixels:
3126 // FEDC BA98 7654 3210
3127 // 3 2 1 0 (32 bit interpolation values, fitting into 16 bit)
3128
3129
3130 // we will simply extract each channel from the source pixels,
3131 // each extracted channel will be multiplied by the corresponding interpolation factor
3132 // and all interpolation results will be accumulated afterwards
3133
3134 // FEDC BA98 7654 3210
3135 const __m128i mask32_topLeft = SSE::set128i(0xFFFFFF0CFFFFFF08ull, 0xFFFFFF04FFFFFF00ull); // ---C ---8 ---4 ---0
3136 const __m128i mask32_topRight = SSE::set128i(0xFFFFFF0DFFFFFF09ull, 0xFFFFFF05FFFFFF01ull); // ---D ---9 ---5 ---1
3137 const __m128i mask32_bottomLeft = SSE::set128i(0xFFFFFF0EFFFFFF0Aull, 0xFFFFFF06FFFFFF02ull); // ---E ---A ---6 ---2
3138 const __m128i mask32_bottomRight = SSE::set128i(0xFFFFFF0FFFFFFF0Bull, 0xFFFFFF07FFFFFF03ull); // ---F ---B ---7 ---3
3139
3140
3141 // we extract the top left values and multiply them with the interpolation factors
3142 // FEDC BA98 7654 3210
3143 // ---C ---8 ---4 ---0
3144 // *
3145 // FTL3 FTL2 FTL1 FTL0
3146 __m128i multiplicationA = _mm_mullo_epi32(m128_factorsTopLeft, _mm_shuffle_epi8(m128_pixels, mask32_topLeft));
3147 __m128i multiplicationB = _mm_mullo_epi32(m128_factorsTopRight, _mm_shuffle_epi8(m128_pixels, mask32_topRight));
3148
3149 multiplicationA = _mm_add_epi32(multiplicationA, _mm_mullo_epi32(m128_factorsBottomLeft, _mm_shuffle_epi8(m128_pixels, mask32_bottomLeft)));
3150 multiplicationB = _mm_add_epi32(multiplicationB, _mm_mullo_epi32(m128_factorsBottomRight, _mm_shuffle_epi8(m128_pixels, mask32_bottomRight)));
3151
3152 __m128i multiplication = _mm_add_epi32(multiplicationA, multiplicationB);
3153
3154 const __m128i m128_i_8192 = _mm_set1_epi32(8192);
3155
3156 // we add 8192 for rounding and shift the result by 14 bits (division by 128*128) // TODO if using 256 we should be able to avoid the shifting by 14 bits (simply by using shuffle operations)
3157 // additionally, we shuffle the individual results together
3158
3159 const __m128i result = _mm_shuffle_epi8(_mm_srli_epi32(_mm_add_epi32(multiplication, m128_i_8192), 14), SSE::set128i(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF0C080400ull));
3160
3161 *((unsigned int*)targetPositionPixels) = _mm_extract_epi32(result, 0);
3162}
3163
3164template <>
3165OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE<3u>(const uint8_t* source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const DataType<uint8_t, 3u>::Type& borderColor, const __m128i& m128_factorsRight, const __m128i& m128_factorsBottom, typename DataType<uint8_t, 3u>::Type* targetPositionPixels)
3166{
3167 ocean_assert(source != nullptr);
3168 ocean_assert(targetPositionPixels != nullptr);
3169
3170 using PixelType = typename DataType<uint8_t, 3u>::Type;
3171
3172 // as we do not initialize the following intermediate data,
3173 // we hopefully will not allocate memory on the stack each time this function is called
3174 OCEAN_ALIGN_DATA(16) PixelType topLeftPixels[6];
3175 OCEAN_ALIGN_DATA(16) PixelType topRightPixels[6];
3176 OCEAN_ALIGN_DATA(16) PixelType bottomLeftPixels[6];
3177 OCEAN_ALIGN_DATA(16) PixelType bottomRightPixels[6];
3178
3179 // we gather the individual source pixel values from the source image,
3180 // based on the calculated pixel locations
3181 for (unsigned int i = 0u; i < 4u; ++i)
3182 {
3183 if (validPixels[i])
3184 {
3185 topLeftPixels[i] = *((PixelType*)(source + offsetsTopLeft[i]));
3186 topRightPixels[i] = *((PixelType*)(source + offsetsTopRight[i]));
3187 bottomLeftPixels[i] = *((PixelType*)(source + offsetsBottomLeft[i]));
3188 bottomRightPixels[i] = *((PixelType*)(source + offsetsBottomRight[i]));
3189 }
3190 else
3191 {
3192 topLeftPixels[i] = borderColor;
3193 topRightPixels[i] = borderColor;
3194 bottomLeftPixels[i] = borderColor;
3195 bottomRightPixels[i] = borderColor;
3196 }
3197 }
3198
3199 static_assert(sizeof(__m128i) <= sizeof(topLeftPixels), "Invalid data type!");
3200
3201 const __m128i m128_topLeftPixels = _mm_load_si128((const __m128i*)topLeftPixels);
3202 const __m128i m128_topRightPixels = _mm_load_si128((const __m128i*)topRightPixels);
3203 const __m128i m128_bottomLeftPixels = _mm_load_si128((const __m128i*)bottomLeftPixels);
3204 const __m128i m128_bottomRightPixels = _mm_load_si128((const __m128i*)bottomRightPixels);
3205
3206
3207 // factorLeft = 128 - factorRight
3208 // factorTop = 128 - factorBottom
3209
3210 const __m128i m128_factorsLeft = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsRight);
3211 const __m128i m128_factorsTop = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsBottom);
3212
3213 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
3214 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
3215
3216 const __m128i m128_factorsTopLeft = _mm_mullo_epi32(m128_factorsTop, m128_factorsLeft);
3217 const __m128i m128_factorsTopRight = _mm_mullo_epi32(m128_factorsTop, m128_factorsRight);
3218 const __m128i m128_factorsBottomLeft = _mm_mullo_epi32(m128_factorsBottom, m128_factorsLeft);
3219 const __m128i m128_factorsBottomRight = _mm_mullo_epi32(m128_factorsBottom, m128_factorsRight);
3220
3221
3222 const __m128i m128_interpolationResult = interpolate4Pixels8BitPerChannelSSE<3u>(m128_topLeftPixels, m128_topRightPixels, m128_bottomLeftPixels, m128_bottomRightPixels, m128_factorsTopLeft, m128_factorsTopRight, m128_factorsBottomLeft, m128_factorsBottomRight);
3223
3224 // we copy the first 12 bytes
3225 memcpy(targetPositionPixels, &m128_interpolationResult, 12u);
3226}
3227
3228#endif // OCEAN_COMPILER_MSC
3229
3230template <>
3231OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE<4u>(const uint8_t* source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const DataType<uint8_t, 4u>::Type& borderColor, const __m128i& m128_factorsRight, const __m128i& m128_factorsBottom, typename DataType<uint8_t, 4u>::Type* targetPositionPixels)
3232{
3233 ocean_assert(source != nullptr);
3234 ocean_assert(targetPositionPixels != nullptr);
3235
3236 using PixelType = typename DataType<uint8_t, 4u>::Type;
3237
3238 // as we do not initialize the following intermediate data,
3239 // we hopefully will not allocate memory on the stack each time this function is called
3240 OCEAN_ALIGN_DATA(16) PixelType topLeftPixels[4];
3241 OCEAN_ALIGN_DATA(16) PixelType topRightPixels[4];
3242 OCEAN_ALIGN_DATA(16) PixelType bottomLeftPixels[4];
3243 OCEAN_ALIGN_DATA(16) PixelType bottomRightPixels[4];
3244
3245 // we gather the individual source pixel values from the source image,
3246 // based on the calculated pixel locations
3247
3248 for (unsigned int i = 0u; i < 4u; ++i)
3249 {
3250 if (validPixels[i])
3251 {
3252 topLeftPixels[i] = *((PixelType*)(source + offsetsTopLeft[i]));
3253 topRightPixels[i] = *((PixelType*)(source + offsetsTopRight[i]));
3254 bottomLeftPixels[i] = *((PixelType*)(source + offsetsBottomLeft[i]));
3255 bottomRightPixels[i] = *((PixelType*)(source + offsetsBottomRight[i]));
3256 }
3257 else
3258 {
3259 topLeftPixels[i] = borderColor;
3260 topRightPixels[i] = borderColor;
3261 bottomLeftPixels[i] = borderColor;
3262 bottomRightPixels[i] = borderColor;
3263 }
3264 }
3265
3266 static_assert(sizeof(__m128i) == sizeof(topLeftPixels), "Invalid data type!");
3267
3268 const __m128i m128_topLeftPixels = _mm_load_si128((const __m128i*)topLeftPixels);
3269 const __m128i m128_topRightPixels = _mm_load_si128((const __m128i*)topRightPixels);
3270 const __m128i m128_bottomLeftPixels = _mm_load_si128((const __m128i*)bottomLeftPixels);
3271 const __m128i m128_bottomRightPixels = _mm_load_si128((const __m128i*)bottomRightPixels);
3272
3273
3274 // factorLeft = 128 - factorRight
3275 // factorTop = 128 - factorBottom
3276
3277 const __m128i m128_factorsLeft = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsRight);
3278 const __m128i m128_factorsTop = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsBottom);
3279
3280 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
3281 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
3282
3283 const __m128i m128_factorsTopLeft = _mm_mullo_epi32(m128_factorsTop, m128_factorsLeft);
3284 const __m128i m128_factorsTopRight = _mm_mullo_epi32(m128_factorsTop, m128_factorsRight);
3285 const __m128i m128_factorsBottomLeft = _mm_mullo_epi32(m128_factorsBottom, m128_factorsLeft);
3286 const __m128i m128_factorsBottomRight = _mm_mullo_epi32(m128_factorsBottom, m128_factorsRight);
3287
3288
3289 const __m128i m128_interpolationResult = interpolate4Pixels8BitPerChannelSSE<4u>(m128_topLeftPixels, m128_topRightPixels, m128_bottomLeftPixels, m128_bottomRightPixels, m128_factorsTopLeft, m128_factorsTopRight, m128_factorsBottomLeft, m128_factorsBottomRight);
3290
3291 _mm_storeu_si128((__m128i*)targetPositionPixels, m128_interpolationResult);
3292}
3293
3294template <unsigned int tChannels>
3295OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelSSE(const uint8_t* source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const typename DataType<uint8_t, tChannels>::Type& borderColor, const __m128i& m128_factorsRight, const __m128i& m128_factorsBottom, typename DataType<uint8_t, tChannels>::Type* targetPositionPixels)
3296{
3297 ocean_assert(source != nullptr);
3298 ocean_assert(targetPositionPixels != nullptr);
3299
3300 // as we do not initialize the following intermediate data,
3301 // we hopefully will not allocate memory on the stack each time this function is called
3302 OCEAN_ALIGN_DATA(16) unsigned int factorsTopLeft[4];
3303 OCEAN_ALIGN_DATA(16) unsigned int factorsTopRight[4];
3304 OCEAN_ALIGN_DATA(16) unsigned int factorsBottomLeft[4];
3305 OCEAN_ALIGN_DATA(16) unsigned int factorsBottomRight[4];
3306
3307
3308 // factorLeft = 128 - factorRight
3309 // factorTop = 128 - factorBottom
3310
3311 const __m128i m128_factorsLeft = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsRight);
3312 const __m128i m128_factorsTop = _mm_sub_epi32(_mm_set1_epi32(128), m128_factorsBottom);
3313
3314 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
3315 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
3316
3317 const __m128i m128_factorsTopLeft = _mm_mullo_epi32(m128_factorsTop, m128_factorsLeft);
3318 const __m128i m128_factorsTopRight = _mm_mullo_epi32(m128_factorsTop, m128_factorsRight);
3319 const __m128i m128_factorsBottomLeft = _mm_mullo_epi32(m128_factorsBottom, m128_factorsLeft);
3320 const __m128i m128_factorsBottomRight = _mm_mullo_epi32(m128_factorsBottom, m128_factorsRight);
3321
3322
3323 // we store the interpolation factors
3324 _mm_store_si128((__m128i*)factorsTopLeft, m128_factorsTopLeft);
3325 _mm_store_si128((__m128i*)factorsTopRight, m128_factorsTopRight);
3326 _mm_store_si128((__m128i*)factorsBottomLeft, m128_factorsBottomLeft);
3327 _mm_store_si128((__m128i*)factorsBottomRight, m128_factorsBottomRight);
3328
3329 for (unsigned int i = 0u; i < 4u; ++i)
3330 {
3331 if (validPixels[i])
3332 {
3333 const uint8_t* topLeft = source + offsetsTopLeft[i];
3334 const uint8_t* topRight = source + offsetsTopRight[i];
3335
3336 const uint8_t* bottomLeft = source + offsetsBottomLeft[i];
3337 const uint8_t* bottomRight = source + offsetsBottomRight[i];
3338
3339 const unsigned int& factorTopLeft = factorsTopLeft[i];
3340 const unsigned int& factorTopRight = factorsTopRight[i];
3341 const unsigned int& factorBottomLeft = factorsBottomLeft[i];
3342 const unsigned int& factorBottomRight = factorsBottomRight[i];
3343
3344 for (unsigned int n = 0u; n < tChannels; ++n)
3345 {
3346 ((uint8_t*)targetPositionPixels)[n] = (uint8_t)((topLeft[n] * factorTopLeft + topRight[n] * factorTopRight + bottomLeft[n] * factorBottomLeft + bottomRight[n] * factorBottomRight + 8192u) >> 14u);
3347 }
3348 }
3349 else
3350 {
3351 *targetPositionPixels = borderColor;
3352 }
3353
3354 targetPositionPixels++;
3355 }
3356}
3357
3358#endif // OCEAN_HARDWARE_SSE_VERSION
3359
3360#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3361
3362template <unsigned int tChannels>
3363void FrameInterpolatorBilinear::affine8BitPerChannelNEONSubset(const uint8_t* source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3* source_A_target, const uint8_t* borderColor, uint8_t* target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
3364{
3365 static_assert(tChannels >= 1u, "Invalid channel number!");
3366
3367 ocean_assert(source && target);
3368 ocean_assert(sourceWidth > 0u && sourceHeight > 0u);
3369 ocean_assert_and_suppress_unused(targetWidth >= 4u && targetHeight > 0u, targetHeight);
3370 ocean_assert(source_A_target);
3371 ocean_assert(!source_A_target->isNull() && Numeric::isEqualEps((*source_A_target)[2]) && Numeric::isEqualEps((*source_A_target)[5]));
3372
3373 ocean_assert(firstTargetRow + numberTargetRows <= targetHeight);
3374
3375 const unsigned int sourceStrideElements = sourceWidth * tChannels + sourcePaddingElements;
3376 const unsigned int targetStrideElements = targetWidth * tChannels + targetPaddingElements;
3377
3378 using PixelType = typename DataType<uint8_t, tChannels>::Type;
3379
3380 uint8_t zeroColor[tChannels] = {uint8_t(0)};
3381 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
3382
3383 unsigned int validPixels[4];
3384
3385 unsigned int topLeftOffsetsElements[4];
3386 unsigned int topRightOffsetsElements[4];
3387 unsigned int bottomLeftOffsetsElements[4];
3388 unsigned int bottomRightOffsetsElements[4];
3389
3390 const uint32x4_t constantChannels_u_32x4 = vdupq_n_u32(tChannels);
3391
3392 // we store 4 floats: [X0, X0, X0, X0], and same with X1 and X2
3393 const float32x4_t m128_f_X0 = vdupq_n_f32(float((*source_A_target)(0, 0)));
3394 const float32x4_t m128_f_X1 = vdupq_n_f32(float((*source_A_target)(1, 0)));
3395
3396 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
3397 {
3398 PixelType* targetRow = (PixelType*)(target + y * targetStrideElements);
3399
3400 /*
3401 * We can slightly optimize the 3x3 matrix multiplication:
3402 *
3403 * | X0 Y0 Z0 | | x |
3404 * | X1 Y1 Z1 | * | y |
3405 * | 0 0 1 | | 1 |
3406 *
3407 * | xx | | X0 * x | | Y0 * y + Z0 |
3408 * | yy | = | X1 * x | + | Y1 * y + Z1 |
3409 *
3410 * As y is constant within the inner loop, the two terms on the right side in the above equations can be pre-calculated:
3411 *
3412 * C0 = Y0 * y + Z0
3413 * C1 = Y1 * y + Z1
3414 *
3415 * So the computation becomes:
3416 *
3417 * | x' | | X0 * x | | C0 |
3418 * | y' | = | X1 * x | + | C1 |
3419 */
3420
3421 // we store 4 floats: [C0, C0, C0, C0], and same with C1 and C2
3422 const float32x4_t m128_f_C0 = vdupq_n_f32(float((*source_A_target)(0, 1) * Scalar(y) + (*source_A_target)(0, 2)));
3423 const float32x4_t m128_f_C1 = vdupq_n_f32(float((*source_A_target)(1, 1) * Scalar(y) + (*source_A_target)(1, 2)));
3424
3425 // we store 4 floats: [0.0f, 0.0f, 0.0f, 0.0f]
3426 const float32x4_t m128_f_zero = vdupq_n_f32(0.0f);
3427
3428 // we store 4 integers: [sourceStrideElements, sourceStrideElements, sourceStrideElements, sourceStrideElements]
3429 const uint32x4_t m128_u_sourceStrideElements = vdupq_n_u32(sourceStrideElements);
3430
3431 // we store 4 integers: [sourceWidth - 1, sourceWidth - 1, sourceWidth - 1, sourceWidth - 1], and same with sourceHeight
3432 const uint32x4_t m128_u_sourceWidth_1 = vdupq_n_u32(sourceWidth - 1u);
3433 const uint32x4_t m128_u_sourceHeight_1 = vdupq_n_u32(sourceHeight - 1u);
3434
3435 // we store 4 floats: [sourceWidth - 1, sourceWidth - 1, sourceWidth - 1, sourceWidth - 1], and same with sourceHeight
3436 const float32x4_t m128_f_sourceWidth_1 = vdupq_n_f32(float(sourceWidth - 1u));
3437 const float32x4_t m128_f_sourceHeight_1 = vdupq_n_f32(float(sourceHeight - 1u));
3438
3439 for (unsigned int x = 0u; x < targetWidth; x += 4u)
3440 {
3441 if (x + 4u > targetWidth)
3442 {
3443 // the last iteration will not fit into the target frame,
3444 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
3445
3446 ocean_assert(x >= 4u && targetWidth > 4u);
3447 const unsigned int newX = targetWidth - 4u;
3448
3449 ocean_assert(x > newX);
3450 targetRow -= x - newX;
3451
3452 x = newX;
3453
3454 // the for loop will stop after this iteration
3455 ocean_assert(!(x + 4u < targetWidth));
3456 }
3457
3458
3459 // we need four successive x coordinate floats:
3460 // [x + 3.0f, x + 2.0f, x + 1.0f; x + 0.0f]
3461 float x_0123[4] = {float(x + 0u), float(x + 1u), float(x + 2u), float(x + 3u)};
3462 const float32x4_t m128_f_x_0123 = vld1q_f32(x_0123);
3463
3464 // we calculate xx and yy and zz for [x + 3.0f, x + 2.0f, x + 1.0f, x + 0.0f]
3465 const float32x4_t m128_f_sourceX = vmlaq_f32(m128_f_C0, m128_f_X0, m128_f_x_0123);
3466 const float32x4_t m128_f_sourceY = vmlaq_f32(m128_f_C1, m128_f_X1, m128_f_x_0123);
3467
3468
3469 // now we check whether we are inside the source frame
3470 const uint32x4_t m128_u_validPixelX = vandq_u32(vcleq_f32(m128_f_sourceX, m128_f_sourceWidth_1), vcgeq_f32(m128_f_sourceX, m128_f_zero)); // sourcePosition.x() <= (sourceWidth - 1) && sourcePosition.x() >= 0 ? 0xFFFFFFFF : 0x00000000
3471 const uint32x4_t m128_u_validPixelY = vandq_u32(vcleq_f32(m128_f_sourceY, m128_f_sourceHeight_1), vcgeq_f32(m128_f_sourceY, m128_f_zero)); // sourcePosition.y() <= (sourceHeight - 1) && sourcePosition.y() >= 0 ? 0xFFFFFFFF : 0x00000000
3472
3473 const uint32x4_t m128_u_validPixel = vandq_u32(m128_u_validPixelX, m128_u_validPixelY); // is_inside_source_frame(sourcePosition) ? 0xFFFFFFFF : 0x00000000
3474
3475
3476 // we can stop here if all pixels are invalid
3477 const uint32x2_t m64_u_validPixel = vorr_u32(vget_low_u32(m128_u_validPixel), vget_high_u32(m128_u_validPixel));
3478 if ((vget_lane_u32(m64_u_validPixel, 0) | vget_lane_u32(m64_u_validPixel, 1)) == 0x00000000u)
3479 {
3480#ifdef OCEAN_DEBUG
3481 OCEAN_ALIGN_DATA(16) unsigned int debugValidPixels[4];
3482 vst1q_u32(debugValidPixels, m128_u_validPixel);
3483 ocean_assert(!(debugValidPixels[0] || debugValidPixels[1] || debugValidPixels[2] || debugValidPixels[3]));
3484#endif
3485
3486 targetRow[0] = *bColor;
3487 targetRow[1] = *bColor;
3488 targetRow[2] = *bColor;
3489 targetRow[3] = *bColor;
3490
3491 targetRow += 4;
3492
3493 continue;
3494 }
3495
3496
3497 // we store the result
3498 vst1q_u32(validPixels, m128_u_validPixel);
3499 ocean_assert(validPixels[0] || validPixels[1] || validPixels[2] || validPixels[3]);
3500
3501
3502 // now we determine the left, top, right and bottom pixel used for the interpolation
3503 // left = floor(x); top = floor(y)
3504 const uint32x4_t m128_u_left = vcvtq_u32_f32(m128_f_sourceX);
3505 const uint32x4_t m128_u_top = vcvtq_u32_f32(m128_f_sourceY);
3506
3507 // right = min(left + 1, width - 1); bottom = min(top + 1; height - 1)
3508 const uint32x4_t m128_u_right = vminq_u32(vaddq_u32(m128_u_left, vdupq_n_u32(1u)), m128_u_sourceWidth_1);
3509 const uint32x4_t m128_u_bottom = vminq_u32(vaddq_u32(m128_u_top, vdupq_n_u32(1u)), m128_u_sourceHeight_1);
3510
3511 // offset = y * stride + x * channels
3512 const uint32x4_t m128_u_topLeftOffsetElements = vmlaq_u32(vmulq_u32(m128_u_left, constantChannels_u_32x4), m128_u_top, m128_u_sourceStrideElements); // topLeftOffset = top * strideElements + left * channels
3513 const uint32x4_t m128_u_topRightOffsetElements = vmlaq_u32(vmulq_u32(m128_u_right, constantChannels_u_32x4), m128_u_top, m128_u_sourceStrideElements); // topRightOffset = top * strideElements + right * channels
3514 const uint32x4_t m128_u_bottomLeftOffsetElements = vmlaq_u32(vmulq_u32(m128_u_left, constantChannels_u_32x4), m128_u_bottom, m128_u_sourceStrideElements); // ...
3515 const uint32x4_t m128_u_bottomRightOffsetElements = vmlaq_u32(vmulq_u32(m128_u_right, constantChannels_u_32x4), m128_u_bottom, m128_u_sourceStrideElements);
3516
3517 // we store the offsets
3518 vst1q_u32(topLeftOffsetsElements, m128_u_topLeftOffsetElements);
3519 vst1q_u32(topRightOffsetsElements, m128_u_topRightOffsetElements);
3520 vst1q_u32(bottomLeftOffsetsElements, m128_u_bottomLeftOffsetElements);
3521 vst1q_u32(bottomRightOffsetsElements, m128_u_bottomRightOffsetElements);
3522
3523
3524 // now we need to determine the interpolation factors tx, tx_ and ty, ty_: (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
3525
3526 // we determine the fractional portions of the x' and y':
3527 float32x4_t m128_f_tx = vsubq_f32(m128_f_sourceX, vcvtq_f32_u32(m128_u_left));
3528 float32x4_t m128_f_ty = vsubq_f32(m128_f_sourceY, vcvtq_f32_u32(m128_u_top));
3529
3530 // we use integer interpolation [0.0, 1.0] -> [0, 128]
3531 m128_f_tx = vmulq_f32(m128_f_tx, vdupq_n_f32(128.0f));
3532 m128_f_ty = vmulq_f32(m128_f_ty, vdupq_n_f32(128.0f));
3533
3534 const uint32x4_t m128_u_tx = vcvtq_u32_f32(vaddq_f32(m128_f_tx, vdupq_n_f32(0.5)));
3535 const uint32x4_t m128_u_ty = vcvtq_u32_f32(vaddq_f32(m128_f_ty, vdupq_n_f32(0.5)));
3536
3537 if constexpr (tChannels > 4u)
3538 {
3539 // normally we would simply call instead of copying the code of the function to this location
3540 // however, if calling the function instead of applying the code here directly
3541 // clang ends with code approx. 20% slower
3542 // thus we make a copy of the code and keep the function for demonstration purposes
3543
3544 //interpolate4Pixels8BitPerChannelNEON<tChannels>(source, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, m128_u_tx, m128_u_ty, targetPixelData);
3545 //targetPixelData += 4;
3546
3547 const uint32x4_t m128_u_tx_ = vsubq_u32(vdupq_n_u32(128u), m128_u_tx);
3548 const uint32x4_t m128_u_ty_ = vsubq_u32(vdupq_n_u32(128u), m128_u_ty);
3549
3550 // (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
3551 // == top_left * tx_ty_ + top_right * txty_ + bottom_left * tx_ty + bottom_right * txty
3552 const uint32x4_t m128_u_tx_ty_ = vmulq_u32(m128_u_tx_, m128_u_ty_);
3553 const uint32x4_t m128_u_txty_ = vmulq_u32(m128_u_tx, m128_u_ty_);
3554 const uint32x4_t m128_u_tx_ty = vmulq_u32(m128_u_tx_, m128_u_ty);
3555 const uint32x4_t m128_u_txty = vmulq_u32(m128_u_tx, m128_u_ty);
3556
3557 unsigned int tx_ty_s[4];
3558 unsigned int txty_s[4];
3559 unsigned int tx_tys[4];
3560 unsigned int txtys[4];
3561
3562 // we store the interpolation factors
3563 vst1q_u32(tx_ty_s, m128_u_tx_ty_);
3564 vst1q_u32(txty_s, m128_u_txty_);
3565 vst1q_u32(tx_tys, m128_u_tx_ty);
3566 vst1q_u32(txtys, m128_u_txty);
3567
3568 for (unsigned int i = 0u; i < 4u; ++i)
3569 {
3570 if (validPixels[i])
3571 {
3572 ocean_assert(topLeftOffsetsElements[i] < sourceStrideElements * sourceHeight);
3573 ocean_assert(topRightOffsetsElements[i] < sourceStrideElements * sourceHeight);
3574 ocean_assert(bottomLeftOffsetsElements[i] < sourceStrideElements * sourceHeight);
3575 ocean_assert(bottomRightOffsetsElements[i] < sourceStrideElements * sourceHeight);
3576
3577 const uint8_t* topLeft = source + topLeftOffsetsElements[i];
3578 const uint8_t* topRight = source + topRightOffsetsElements[i];
3579
3580 const uint8_t* bottomLeft = source + bottomLeftOffsetsElements[i];
3581 const uint8_t* bottomRight = source + bottomRightOffsetsElements[i];
3582
3583 const unsigned int tx_ty_ = tx_ty_s[i];
3584 const unsigned int txty_ = txty_s[i];
3585 const unsigned int tx_ty = tx_tys[i];
3586 const unsigned int txty = txtys[i];
3587
3588 ocean_assert(tx_ty_ + txty_ + tx_ty + txty == 128u * 128u);
3589
3590 for (unsigned int n = 0u; n < tChannels; ++n)
3591 {
3592 ((uint8_t*)targetRow)[n] = uint8_t((topLeft[n] * tx_ty_ + topRight[n] * txty_ + bottomLeft[n] * tx_ty + bottomRight[n] * txty + 8192u) >> 14u);
3593 }
3594 }
3595 else
3596 {
3597 *targetRow = *bColor;
3598 }
3599
3600 targetRow++;
3601 }
3602 }
3603 else
3604 {
3605 interpolate4Pixels8BitPerChannelNEON<tChannels>(source, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, m128_u_tx, m128_u_ty, targetRow);
3606 targetRow += 4;
3607 }
3608 }
3609 }
3610}
3611
/**
 * Warps a subset of the rows of an 8 bit per channel output frame by applying a homography with bilinear interpolation, using NEON instructions.
 * For every output pixel (x, y) in the rows [firstOutputRow, firstOutputRow + numberOutputRows) the corresponding input position is determined via
 * (x', y') = ((X0 * x + Y0 * y + Z0) / zz, (X1 * x + Y1 * y + Z1) / zz) with zz = X2 * x + Y2 * y + Z2, where X*, Y*, Z* are the columns of input_H_output.
 * Four neighboring output pixels are handled per iteration; the last iteration of a row is shifted to the left so that it ends exactly at the row's last pixel.
 * Output pixels whose input position lies outside of [0, inputWidth - 1]x[0, inputHeight - 1] receive the border color.
 * @param input The input frame data, must be valid
 * @param inputWidth Width of the input frame in pixels, with range [1, infinity)
 * @param inputHeight Height of the input frame in pixels, with range [1, infinity)
 * @param input_H_output The 3x3 matrix which is applied to output pixel positions to obtain input positions, must be valid
 * @param borderColor The color (tChannels values) written to output pixels without valid input position, nullptr to use 0x00 for every channel
 * @param output The output frame data receiving the result, must be valid
 * @param outputWidth Width of the output frame in pixels, with range [4, infinity) (asserted)
 * @param outputHeight Height of the output frame in pixels, with range [1, infinity); used for assertions only
 * @param inputPaddingElements Number of padding elements at the end of each input row, in elements
 * @param outputPaddingElements Number of padding elements at the end of each output row, in elements
 * @param firstOutputRow The first output row to be handled, with firstOutputRow + numberOutputRows <= outputHeight
 * @param numberOutputRows The number of output rows to be handled
 * @tparam tChannels The number of frame channels, with range [1, infinity)
 */
3612template <unsigned int tChannels>
3613void FrameInterpolatorBilinear::homography8BitPerChannelNEONSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, const uint8_t* borderColor, uint8_t* output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
3614{
3615 static_assert(tChannels >= 1u, "Invalid channel number!");
3616
3617 ocean_assert(input != nullptr && output != nullptr);
3618 ocean_assert(inputWidth > 0u && inputHeight > 0u);
// outputHeight is used in assertions only, the macro avoids an unused-parameter warning when assertions are disabled
3619 ocean_assert_and_suppress_unused(outputWidth >= 4u && outputHeight > 0u, outputHeight);
3620 ocean_assert(input_H_output != nullptr);
3621
3622 ocean_assert(firstOutputRow + numberOutputRows <= outputHeight);
3623
// the row strides are measured in elements (uint8_t values), payload plus padding
3624 const unsigned int inputStrideElements = inputWidth * tChannels + inputPaddingElements;
3625 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
3626
// PixelType allows to copy one entire pixel with one assignment
// NOTE(review): presumably a type with sizeof == tChannels bytes, defined in DataType.h - confirm
3627 using PixelType = typename DataType<uint8_t, tChannels>::Type;
3628
// fallback border color (all channels zero) in case no explicit border color is provided
3629 uint8_t zeroColor[tChannels] = {uint8_t(0)};
3630 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
3631
// intermediate buffers which are re-used for each block of four pixels
3632 unsigned int validPixels[4];
3633
3634 unsigned int topLeftOffsetsElements[4];
3635 unsigned int topRightOffsetsElements[4];
3636 unsigned int bottomLeftOffsetsElements[4];
3637 unsigned int bottomRightOffsetsElements[4];
3638
3639 const uint32x4_t constantChannels_u_32x4 = vdupq_n_u32(tChannels);
3640
3641 // we store 4 floats: [X0, X0, X0, X0], and same with X1 and X2
3642 const float32x4_t m128_f_X0 = vdupq_n_f32(float((*input_H_output)(0, 0)));
3643 const float32x4_t m128_f_X1 = vdupq_n_f32(float((*input_H_output)(1, 0)));
3644 const float32x4_t m128_f_X2 = vdupq_n_f32(float((*input_H_output)(2, 0)));
3645
3646 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
3647 {
3648 PixelType* outputPixelData = (PixelType*)(output + y * outputStrideElements);
3649
3650 /*
3651 * We can slightly optimize the 3x3 matrix multiplication:
3652 *
3653 * | X0 Y0 Z0 | | x |
3654 * | X1 Y1 Z1 | * | y |
3655 * | X2 Y2 Z2 | | 1 |
3656 *
3657 * | xx | | X0 * x | | Y0 * y + Z0 |
3658 * | yy | = | X1 * x | + | Y1 * y + Z1 |
3659 * | zz | | X2 * x | | Y2 * y + Z2 |
3660 *
3661 * | xx | | X0 * x | | C0 |
3662 * | yy | = | X1 * x | + | C1 |
3663 * | zz | | X2 * x | | C2 |
3664 *
3665 * As y is constant within the inner loop, we can pre-calculate the following terms:
3666 *
3667 * | x' | | (X0 * x + C0) / (X2 * x + C2) |
3668 * | y' | = | (X1 * x + C1) / (X2 * x + C2) |
3669 */
3670
3671 // we store 4 floats: [C0, C0, C0, C0], and same with C1 and C2
3672 const float32x4_t m128_f_C0 = vdupq_n_f32(float((*input_H_output)(0, 1) * Scalar(y) + (*input_H_output)(0, 2)));
3673 const float32x4_t m128_f_C1 = vdupq_n_f32(float((*input_H_output)(1, 1) * Scalar(y) + (*input_H_output)(1, 2)));
3674 const float32x4_t m128_f_C2 = vdupq_n_f32(float((*input_H_output)(2, 1) * Scalar(y) + (*input_H_output)(2, 2)));
3675
3676 // we store 4 floats: [0.0f, 0.0f, 0.0f, 0.0f]
3677 const float32x4_t m128_f_zero = vdupq_n_f32(0.0f);
3678
3679 // we store 4 integers: [inputStrideElements, inputStrideElements, inputStrideElements, inputStrideElements]
3680 const uint32x4_t m128_u_inputStrideElements = vdupq_n_u32(inputStrideElements);
3681
3682 // we store 4 integers: [inputWidth - 1, inputWidth - 1, inputWidth - 1, inputWidth - 1], and same with inputHeight
3683 const uint32x4_t m128_u_inputWidth_1 = vdupq_n_u32(inputWidth - 1u);
3684 const uint32x4_t m128_u_inputHeight_1 = vdupq_n_u32(inputHeight - 1u);
3685
3686 // we store 4 floats: [inputWidth - 1, inputWidth - 1, inputWidth - 1, inputWidth - 1], and same with inputHeight
3687 const float32x4_t m128_f_inputWidth_1 = vdupq_n_f32(float(inputWidth - 1u));
3688 const float32x4_t m128_f_inputHeight_1 = vdupq_n_f32(float(inputHeight - 1u));
3689
// each iteration handles four neighboring output pixels
3690 for (unsigned int x = 0u; x < outputWidth; x += 4u)
3691 {
3692 if (x + 4u > outputWidth)
3693 {
3694 // the last iteration will not fit into the output frame,
3695 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
3696
3697 ocean_assert(x >= 4u && outputWidth > 4u);
3698 const unsigned int newX = outputWidth - 4u;
3699
3700 ocean_assert(x > newX);
// the output pointer needs to be moved back by the same number of pixels
3701 outputPixelData -= x - newX;
3702
3703 x = newX;
3704
3705 // the for loop will stop after this iteration
3706 ocean_assert(!(x + 4u < outputWidth));
3707 }
3708
3709
3710 // we need four successive x coordinate floats:
3711 // [x + 3.0f, x + 2.0f, x + 1.0f, x + 0.0f]
3712 float x_0123[4] = {float(x + 0u), float(x + 1u), float(x + 2u), float(x + 3u)};
3713 const float32x4_t m128_f_x_0123 = vld1q_f32(x_0123);
3714
3715 // we calculate xx and yy and zz for [x + 3.0f, x + 2.0f, x + 1.0f, x + 0.0f]
3716 const float32x4_t m128_f_xx = vmlaq_f32(m128_f_C0, m128_f_X0, m128_f_x_0123);
3717 const float32x4_t m128_f_yy = vmlaq_f32(m128_f_C1, m128_f_X1, m128_f_x_0123);
3718 const float32x4_t m128_f_zz = vmlaq_f32(m128_f_C2, m128_f_X2, m128_f_x_0123);
3719
3720#ifdef USE_DIVISION_ARM64_ARCHITECTURE
3721
3722 // using the division available from ARM64 is more precise
3723 const float32x4_t m128_f_inputX = vdivq_f32(m128_f_xx, m128_f_zz);
3724 const float32x4_t m128_f_inputY = vdivq_f32(m128_f_yy, m128_f_zz);
3725
3726#else
3727
3728 // we calculate the (approximated) inverse of zz
3729 // [1/zz3, 1/zz2, 1/zz1, 1/zz0]
3730 float32x4_t inv_zz_128 = vrecpeq_f32(m128_f_zz);
3731 inv_zz_128 = vmulq_f32(vrecpsq_f32(m128_f_zz, inv_zz_128), inv_zz_128); // improving the accuracy of the approx. inverse by Newton/Raphson
3732
3733 // we determine the normalized coordinates x' and y' for (x + 3.0f, x + 2.0f, ...)
3734 const float32x4_t m128_f_inputX = vmulq_f32(m128_f_xx, inv_zz_128);
3735 const float32x4_t m128_f_inputY = vmulq_f32(m128_f_yy, inv_zz_128);
3736
3737#endif // USE_DIVISION_ARM64_ARCHITECTURE
3738
3739
3740 // now we check whether we are inside the input frame
3741 const uint32x4_t m128_u_validPixelX = vandq_u32(vcleq_f32(m128_f_inputX, m128_f_inputWidth_1), vcgeq_f32(m128_f_inputX, m128_f_zero)); // inputPosition.x() >= 0 && inputPosition.x() <= (inputWidth - 1) ? 0xFFFFFFFF : 0x00000000
3742 const uint32x4_t m128_u_validPixelY = vandq_u32(vcleq_f32(m128_f_inputY, m128_f_inputHeight_1), vcgeq_f32(m128_f_inputY, m128_f_zero)); // inputPosition.y() >= 0 && inputPosition.y() <= (inputHeight - 1) ? 0xFFFFFFFF : 0x00000000
3743
3744 const uint32x4_t m128_u_validPixel = vandq_u32(m128_u_validPixelX, m128_u_validPixelY); // is_inside_input_frame(inputPosition) ? 0xFFFFFFFF : 0x00000000
3745
3746
3747 // we can stop here if all pixels are invalid
// (the four masks are OR-ed together, a zero result means that no pixel has a valid input position)
3748 const uint32x2_t m64_u_validPixel = vorr_u32(vget_low_u32(m128_u_validPixel), vget_high_u32(m128_u_validPixel));
3749 if ((vget_lane_u32(m64_u_validPixel, 0) | vget_lane_u32(m64_u_validPixel, 1)) == 0x00000000u)
3750 {
3751#ifdef OCEAN_DEBUG
3752 OCEAN_ALIGN_DATA(16) unsigned int debugValidPixels[4];
3753 vst1q_u32(debugValidPixels, m128_u_validPixel);
3754 ocean_assert(!(debugValidPixels[0] || debugValidPixels[1] || debugValidPixels[2] || debugValidPixels[3]));
3755#endif
3756
// all four output pixels receive the border color
3757 outputPixelData[0] = *bColor;
3758 outputPixelData[1] = *bColor;
3759 outputPixelData[2] = *bColor;
3760 outputPixelData[3] = *bColor;
3761
3762 outputPixelData += 4;
3763
3764 continue;
3765 }
3766
3767
3768 // we store the result
3769 vst1q_u32(validPixels, m128_u_validPixel);
3770 ocean_assert(validPixels[0] || validPixels[1] || validPixels[2] || validPixels[3]);
3771
3772
3773 // now we determine the left, top, right and bottom pixel used for the interpolation
3774 // left = floor(x); top = floor(y)
3775 const uint32x4_t m128_u_left = vcvtq_u32_f32(m128_f_inputX);
3776 const uint32x4_t m128_u_top = vcvtq_u32_f32(m128_f_inputY);
3777
3778 // right = min(left + 1, width - 1); bottom = min(top + 1, height - 1)
3779 const uint32x4_t m128_u_right = vminq_u32(vaddq_u32(m128_u_left, vdupq_n_u32(1u)), m128_u_inputWidth_1);
3780 const uint32x4_t m128_u_bottom = vminq_u32(vaddq_u32(m128_u_top, vdupq_n_u32(1u)), m128_u_inputHeight_1);
3781
3782 // offset = y * stride + x * channels
3783 const uint32x4_t m128_u_topLeftOffsetElements = vmlaq_u32(vmulq_u32(m128_u_left, constantChannels_u_32x4), m128_u_top, m128_u_inputStrideElements); // topLeftOffset = top * strideElements + left * channels
3784 const uint32x4_t m128_u_topRightOffsetElements = vmlaq_u32(vmulq_u32(m128_u_right, constantChannels_u_32x4), m128_u_top, m128_u_inputStrideElements); // topRightOffset = top * strideElements + right * channels
3785 const uint32x4_t m128_u_bottomLeftOffsetElements = vmlaq_u32(vmulq_u32(m128_u_left, constantChannels_u_32x4), m128_u_bottom, m128_u_inputStrideElements); // ...
3786 const uint32x4_t m128_u_bottomRightOffsetElements = vmlaq_u32(vmulq_u32(m128_u_right, constantChannels_u_32x4), m128_u_bottom, m128_u_inputStrideElements);
3787
3788 // we store the offsets
3789 vst1q_u32(topLeftOffsetsElements, m128_u_topLeftOffsetElements);
3790 vst1q_u32(topRightOffsetsElements, m128_u_topRightOffsetElements);
3791 vst1q_u32(bottomLeftOffsetsElements, m128_u_bottomLeftOffsetElements);
3792 vst1q_u32(bottomRightOffsetsElements, m128_u_bottomRightOffsetElements);
3793
3794
3795 // now we need to determine the interpolation factors tx, tx_ and ty, ty_: (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
3796
3797 // we determine the fractional portions of the x' and y':
3798 float32x4_t m128_f_tx = vsubq_f32(m128_f_inputX, vcvtq_f32_u32(m128_u_left));
3799 float32x4_t m128_f_ty = vsubq_f32(m128_f_inputY, vcvtq_f32_u32(m128_u_top));
3800
3801 // we use integer interpolation [0.0, 1.0] -> [0, 128]
3802 m128_f_tx = vmulq_f32(m128_f_tx, vdupq_n_f32(128.0f));
3803 m128_f_ty = vmulq_f32(m128_f_ty, vdupq_n_f32(128.0f));
3804
// adding 0.5 before the conversion to integer rounds to the nearest integer factor
3805 const uint32x4_t m128_u_tx = vcvtq_u32_f32(vaddq_f32(m128_f_tx, vdupq_n_f32(0.5)));
3806 const uint32x4_t m128_u_ty = vcvtq_u32_f32(vaddq_f32(m128_f_ty, vdupq_n_f32(0.5)));
3807
// frames with more than four channels are interpolated pixel by pixel and channel by channel,
// frames with up to four channels use the NEON-based interpolation function (see else branch)
3808 if constexpr (tChannels > 4u)
3809 {
3810 // normally we would simply call instead of copying the code of the function to this location
3811 // however, if calling the function instead of applying the code here directly
3812 // clang ends with code approx. 20% slower
3813 // thus we make a copy of the code and keep the function for demonstration purposes
3814
3815 //interpolate4Pixels8BitPerChannelNEON<tChannels>(input, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, m128_u_tx, m128_u_ty, outputPixelData);
3816 //outputPixelData += 4;
3817
3818 const uint32x4_t m128_u_tx_ = vsubq_u32(vdupq_n_u32(128u), m128_u_tx);
3819 const uint32x4_t m128_u_ty_ = vsubq_u32(vdupq_n_u32(128u), m128_u_ty);
3820
3821 // (top_left * tx_ + top_right * tx) * ty_ + (bottom_left * tx_ + bottom_right * tx) * ty
3822 // == top_left * tx_ty_ + top_right * txty_ + bottom_left * tx_ty + bottom_right * txty
3823 const uint32x4_t m128_u_tx_ty_ = vmulq_u32(m128_u_tx_, m128_u_ty_);
3824 const uint32x4_t m128_u_txty_ = vmulq_u32(m128_u_tx, m128_u_ty_);
3825 const uint32x4_t m128_u_tx_ty = vmulq_u32(m128_u_tx_, m128_u_ty);
3826 const uint32x4_t m128_u_txty = vmulq_u32(m128_u_tx, m128_u_ty);
3827
3828 unsigned int tx_ty_s[4];
3829 unsigned int txty_s[4];
3830 unsigned int tx_tys[4];
3831 unsigned int txtys[4];
3832
3833 // we store the interpolation factors
3834 vst1q_u32(tx_ty_s, m128_u_tx_ty_);
3835 vst1q_u32(txty_s, m128_u_txty_);
3836 vst1q_u32(tx_tys, m128_u_tx_ty);
3837 vst1q_u32(txtys, m128_u_txty);
3838
3839 for (unsigned int i = 0u; i < 4u; ++i)
3840 {
3841 if (validPixels[i])
3842 {
3843 ocean_assert(topLeftOffsetsElements[i] < inputStrideElements * inputHeight);
3844 ocean_assert(topRightOffsetsElements[i] < inputStrideElements * inputHeight);
3845 ocean_assert(bottomLeftOffsetsElements[i] < inputStrideElements * inputHeight);
3846 ocean_assert(bottomRightOffsetsElements[i] < inputStrideElements * inputHeight);
3847
3848 const uint8_t* topLeft = input + topLeftOffsetsElements[i];
3849 const uint8_t* topRight = input + topRightOffsetsElements[i];
3850
3851 const uint8_t* bottomLeft = input + bottomLeftOffsetsElements[i];
3852 const uint8_t* bottomRight = input + bottomRightOffsetsElements[i];
3853
3854 const unsigned int tx_ty_ = tx_ty_s[i];
3855 const unsigned int txty_ = txty_s[i];
3856 const unsigned int tx_ty = tx_tys[i];
3857 const unsigned int txty = txtys[i];
3858
3859 ocean_assert(tx_ty_ + txty_ + tx_ty + txty == 128u * 128u);
3860
3861 for (unsigned int n = 0u; n < tChannels; ++n)
3862 {
// the four factors sum up to 128 * 128 == 16384, adding 8192 and shifting by 14 bits is a division by 16384 with rounding
3863 ((uint8_t*)outputPixelData)[n] = uint8_t((topLeft[n] * tx_ty_ + topRight[n] * txty_ + bottomLeft[n] * tx_ty + bottomRight[n] * txty + 8192u) >> 14u);
3864 }
3865 }
3866 else
3867 {
3868 *outputPixelData = *bColor;
3869 }
3870
3871 outputPixelData++;
3872 }
3873 }
3874 else
3875 {
3876 interpolate4Pixels8BitPerChannelNEON<tChannels>(input, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, m128_u_tx, m128_u_ty, outputPixelData);
3877 outputPixelData += 4;
3878 }
3879 }
3880 }
3881}
3882
3883template <>
3884OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelNEON<1u>(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const DataType<uint8_t, 1u>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 1u>::Type* targetPositionPixels)
3885{
3886 ocean_assert(source != nullptr);
3887 ocean_assert(targetPositionPixels != nullptr);
3888
3889 // as we do not initialize the following intermediate data,
3890 // we hopefully will not allocate memory on the stack each time this function is called
3891 DataType<uint8_t, 1u>::Type pixels[16];
3892
3893 // we will store the pixel information in the following pattern:
3894 // F E D C B A 9 8 7 6 5 4 3 2 1 0
3895 // BR3 BL3 TR3 TL3 BR2 BL2 TR2 TL2 BR1 BL1 TR1 TL1 BR0 BL0 TR0 TL0
3896
3897 // we gather the individual source pixel values from the source image,
3898 // based on the calculated pixel locations
3899 for (unsigned int i = 0u; i < 4u; ++i)
3900 {
3901 if (validPixels[i])
3902 {
3903 pixels[i * 4u + 0u] = *((const DataType<uint8_t, 1u>::Type*)(source + offsetsTopLeftElements[i]));
3904 pixels[i * 4u + 1u] = *((const DataType<uint8_t, 1u>::Type*)(source + offsetsTopRightElements[i]));
3905 pixels[i * 4u + 2u] = *((const DataType<uint8_t, 1u>::Type*)(source + offsetsBottomLeftElements[i]));
3906 pixels[i * 4u + 3u] = *((const DataType<uint8_t, 1u>::Type*)(source + offsetsBottomRightElements[i]));
3907 }
3908 else
3909 {
3910 pixels[i * 4u + 0u] = borderColor;
3911 pixels[i * 4u + 1u] = borderColor;
3912 pixels[i * 4u + 2u] = borderColor;
3913 pixels[i * 4u + 3u] = borderColor;
3914 }
3915 }
3916
3917 static_assert(sizeof(uint8x16_t) == sizeof(pixels), "Invalid data type!");
3918
3919 const uint8x16_t m128_pixels = vld1q_u8((const uint8_t*)pixels);
3920
3921
3922 // factorLeft = 128 - factorRight
3923 // factorTop = 128 - factorBottom
3924
3925 const uint32x4_t m128_factorsLeft = vsubq_u32(vdupq_n_u32(128u), m128_factorsRight);
3926 const uint32x4_t m128_factorsTop = vsubq_u32(vdupq_n_u32(128u), m128_factorsBottom);
3927
3928 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
3929 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
3930
3931 const uint32x4_t m128_factorsTopLeft = vmulq_u32(m128_factorsTop, m128_factorsLeft);
3932 const uint32x4_t m128_factorsTopRight = vmulq_u32(m128_factorsTop, m128_factorsRight);
3933 const uint32x4_t m128_factorsBottomLeft = vmulq_u32(m128_factorsBottom, m128_factorsLeft);
3934 const uint32x4_t m128_factorsBottomRight = vmulq_u32(m128_factorsBottom, m128_factorsRight);
3935
3936 // pixels stores the four interpolation grascale pixel values (top left, top right, bottom left, bottom right) for 4 (independent) pixels:
3937 // F E D C B A 9 8 7 6 5 4 3 2 1 0
3938 // BR BL TR TL BR BL TR TL BR BL TR TL BR BL TR TL
3939
3940 // factorsTopLeft stores the 32 bit interpolation values for 4 pixels:
3941 // FEDC BA98 7654 3210
3942 // 3 2 1 0 (32 bit interpolation values, fitting into 16 bit)
3943
3944
3945 // we will simply extract each channel from the source pixels,
3946 // each extracted channel will be multiplied by the corresponding interpolation factor
3947 // and all interpolation results will be accumulated afterwards
3948
3949 const uint32x4_t m128_maskFirstByte = vdupq_n_u32(0x000000FFu);
3950
3951 const uint32x4_t m128_muliplicationA = vmulq_u32(vandq_u32(vreinterpretq_u32_u8(m128_pixels), m128_maskFirstByte), m128_factorsTopLeft);
3952 const uint32x4_t m128_muliplicationB = vmulq_u32(vandq_u32(vshrq_n_u32(vreinterpretq_u32_u8(m128_pixels), 8), m128_maskFirstByte), m128_factorsTopRight);
3953 const uint32x4_t m128_muliplicationC = vmulq_u32(vandq_u32(vshrq_n_u32(vreinterpretq_u32_u8(m128_pixels), 16), m128_maskFirstByte), m128_factorsBottomLeft);
3954 const uint32x4_t m128_muliplicationD = vmulq_u32(vandq_u32(vshrq_n_u32(vreinterpretq_u32_u8(m128_pixels), 24), m128_maskFirstByte), m128_factorsBottomRight);
3955
3956 const uint32x4_t m128_multiplication = vaddq_u32(vaddq_u32(m128_muliplicationA, m128_muliplicationB), vaddq_u32(m128_muliplicationC, m128_muliplicationD));
3957
3958 // we add 8192 and shift by 14 bits
3959
3960 const uint8x16_t m128_interpolation = vreinterpretq_u8_u32(vshrq_n_u32(vaddq_u32(m128_multiplication, vdupq_n_u32(8192u)), 14));
3961
3962 // finally we have the following result:
3963 // ---C ---8 ---4 ---0
3964 // and we need to extract the four pixel values:
3965 //
3966 // NOTE: Because of a possible bug in Clang affecting ARMv7, vget_lane_u32()
3967 // seems to assume 32-bit memory alignment for output location, which cannot
3968 // be guaranteed. This results in bus errors and crashes the application.
3969 // ARM64 is not affected.
3970#if defined(__aarch64__)
3971
3972 constexpr uint8x8_t m64_mask0 = NEON::create_uint8x8(0, 4, 1, 1, 1, 1, 1, 1);
3973 constexpr uint8x8_t m64_mask1 = NEON::create_uint8x8(1, 1, 0, 4, 1, 1, 1, 1);
3974
3975 const uint8x8_t m64_interpolation01 = vtbl1_u8(vget_low_u8(m128_interpolation), m64_mask0);
3976 const uint8x8_t m64_interpolation23 = vtbl1_u8(vget_high_u8(m128_interpolation), m64_mask1);
3977
3978 const uint8x8_t m64_interpolation0123 = vorr_u8(m64_interpolation01, m64_interpolation23);
3979
3980 const uint32_t result = vget_lane_u32(vreinterpret_u32_u8(m64_interpolation0123), 0);
3981 memcpy(targetPositionPixels, &result, sizeof(uint32_t));
3982
3983#else
3984
3985 *((uint8_t*)targetPositionPixels + 0) = vgetq_lane_u8(m128_interpolation, 0);
3986 *((uint8_t*)targetPositionPixels + 1) = vgetq_lane_u8(m128_interpolation, 4);
3987 *((uint8_t*)targetPositionPixels + 2) = vgetq_lane_u8(m128_interpolation, 8);
3988 *((uint8_t*)targetPositionPixels + 3) = vgetq_lane_u8(m128_interpolation, 12);
3989
3990#endif
3991}
3992
3993OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate8Pixels1Channel8BitNEON(const uint8x8_t& topLeft_u_8x8, const uint8x8_t& topRight_u_8x8, const uint8x8_t& bottomLeft_u_8x8, const uint8x8_t& bottomRight_u_8x8, const uint8x16_t& factorsRight_factorsBottom_128_u_8x16, uint8_t* targetPositionPixels)
3994{
3995 const uint8x16_t factorsLeft_factorsTop_128_u_8x16 = vsubq_u8(vdupq_n_u8(128u), factorsRight_factorsBottom_128_u_8x16); // factorLeft = 128 - factorRight, factorTop = 128 - factorBottomv
3996
3997 const uint8x8_t factorsRight_u_8x8 = vget_low_u8(factorsRight_factorsBottom_128_u_8x16);
3998 const uint16x8_t factorsBottom_u_16x8 = vmovl_u8(vget_high_u8(factorsRight_factorsBottom_128_u_8x16));
3999
4000 const uint8x8_t factorsLeft_u_8x8 = vget_low_u8(factorsLeft_factorsTop_128_u_8x16);
4001 const uint16x8_t factorsTop_u_16x8 = vmovl_u8(vget_high_u8(factorsLeft_factorsTop_128_u_8x16));
4002
4003 const uint16x8_t intermediateTop_u_16x8 = vmlal_u8(vmull_u8(topLeft_u_8x8, factorsLeft_u_8x8), topRight_u_8x8, factorsRight_u_8x8); // intermediateTop = topLeft * factorLeft + topRight * factorRight
4004 const uint16x8_t intermediateBottom_u_16x8 = vmlal_u8(vmull_u8(bottomLeft_u_8x8, factorsLeft_u_8x8), bottomRight_u_8x8, factorsRight_u_8x8); // intermediateBottom = bottomLeft * factorLeft + bottomRight * factorRight
4005
4006 const uint32x4_t resultA_32x4 = vmlal_u16(vmull_u16(vget_low_u16(intermediateTop_u_16x8), vget_low_u16(factorsTop_u_16x8)), vget_low_u16(intermediateBottom_u_16x8), vget_low_u16(factorsBottom_u_16x8)); // result = intermediateTop * factorTop + intermediateBottom + factorBottom
4007 const uint32x4_t resultB_32x4 = vmlal_u16(vmull_u16(vget_high_u16(intermediateTop_u_16x8), vget_high_u16(factorsTop_u_16x8)), vget_high_u16(intermediateBottom_u_16x8), vget_high_u16(factorsBottom_u_16x8));
4008
4009 const uint16x8_t result_16x8 = vcombine_u16(vrshrn_n_u32(resultA_32x4, 14), vrshrn_n_u32(resultB_32x4, 14)); // round(result / 16384.0)
4010
4011 const uint8x8_t result_8x8 = vmovn_u16(result_16x8);
4012
4013 vst1_u8(targetPositionPixels, result_8x8);
4014}
4015
4016template <>
4017OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelNEON<2u>(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const DataType<uint8_t, 2u>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 2u>::Type* targetPositionPixels)
4018{
4019 ocean_assert(source != nullptr);
4020 ocean_assert(targetPositionPixels != nullptr);
4021
4022 using PixelType = typename DataType<uint8_t, 2u>::Type;
4023
4024 // as we do not initialize the following intermediate data,
4025 // we hopefully will not allocate memory on the stack each time this function is called
4026 PixelType topPixels[8];
4027 PixelType bottomPixels[8];
4028
4029 // we will store the pixel information in the following pattern (here for YA):
4030 // FE DC BA 98 76 54 32 10
4031 // YA YA YA YA YA YA YA YA
4032 // TR TL TR TL TR TL TR TL
4033
4034 // we gather the individual source pixel values from the source image,
4035 // based on the calculated pixel locations
4036 for (unsigned int i = 0u; i < 4u; ++i)
4037 {
4038 if (validPixels[i])
4039 {
4040 *(topPixels + i * 2u + 0u) = *((const PixelType*)(source + offsetsTopLeftElements[i]));
4041 *(topPixels + i * 2u + 1u) = *((const PixelType*)(source + offsetsTopRightElements[i]));
4042 *(bottomPixels + i * 2u + 0u) = *((const PixelType*)(source + offsetsBottomLeftElements[i]));
4043 *(bottomPixels + i * 2u + 1u) = *((const PixelType*)(source + offsetsBottomRightElements[i]));
4044 }
4045 else
4046 {
4047 *(topPixels + i * 2u + 0u) = borderColor;
4048 *(topPixels + i * 2u + 1u) = borderColor;
4049 *(bottomPixels + i * 2u + 0u) = borderColor;
4050 *(bottomPixels + i * 2u + 1u) = borderColor;
4051 }
4052 }
4053
4054 static_assert(sizeof(uint32x4_t) == sizeof(topPixels), "Invalid data type!");
4055
4056 const uint32x4_t m128_topPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)topPixels));
4057 const uint32x4_t m128_bottomPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)bottomPixels));
4058
4059
4060 // factorLeft = 128 - factorRight
4061 // factorTop = 128 - factorBottom
4062
4063 const uint32x4_t m128_factorsLeft = vsubq_u32(vdupq_n_u32(128u), m128_factorsRight);
4064 const uint32x4_t m128_factorsTop = vsubq_u32(vdupq_n_u32(128u), m128_factorsBottom);
4065
4066 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
4067 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
4068
4069 const uint32x4_t m128_factorsTopLeft = vmulq_u32(m128_factorsTop, m128_factorsLeft);
4070 const uint32x4_t m128_factorsTopRight = vmulq_u32(m128_factorsTop, m128_factorsRight);
4071 const uint32x4_t m128_factorsBottomLeft = vmulq_u32(m128_factorsBottom, m128_factorsLeft);
4072 const uint32x4_t m128_factorsBottomRight = vmulq_u32(m128_factorsBottom, m128_factorsRight);
4073
4074
4075 const uint32x4_t m128_maskFirstByte = vdupq_n_u32(0x000000FFu);
4076
4077 uint32x4_t m128_muliplicationChannel0 = vmulq_u32(vandq_u32(m128_topPixels, m128_maskFirstByte), m128_factorsTopLeft);
4078 uint32x4_t m128_muliplicationChannel1 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topPixels, 8), m128_maskFirstByte), m128_factorsTopLeft);
4079
4080 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topPixels, 16), m128_maskFirstByte), m128_factorsTopRight));
4081 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topPixels, 24), m128_maskFirstByte), m128_factorsTopRight));
4082
4083 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_bottomPixels, m128_maskFirstByte), m128_factorsBottomLeft));
4084 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomPixels, 8), m128_maskFirstByte), m128_factorsBottomLeft));
4085
4086 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomPixels, 16), m128_maskFirstByte), m128_factorsBottomRight));
4087 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomPixels, 24), m128_maskFirstByte), m128_factorsBottomRight));
4088
4089
4090 // we add 8192 and shift by 14 bits
4091
4092 const uint32x4_t m128_interpolation0 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel0, vdupq_n_u32(8192u)), 14);
4093 const uint32x4_t m128_interpolation1 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel1, vdupq_n_u32(8192u)), 14);
4094
4095 // finally we blend the interpolation results together to get the following pattern:
4096 // FE DC BA 98 76 54 32 10
4097 // 00 YA 00 YA 00 YA 00 YA
4098
4099 const uint32x4_t m128_interpolation = vorrq_u32(m128_interpolation0, vshlq_n_u32(m128_interpolation1, 8));
4100
4101 // we shuffle the 128 bit register to a 64 bit register:
4102
4103 const uint8x8_t m64_mask0 = NEON::create_uint8x8(0, 1, 4, 5, 2, 2, 2, 2);
4104 const uint8x8_t m64_mask1 = NEON::create_uint8x8(2, 2, 2, 2, 0, 1, 4, 5);
4105
4106 const uint8x8_t m64_interpolation_low = vtbl1_u8(vget_low_u8(vreinterpretq_u8_u32(m128_interpolation)), m64_mask0);
4107 const uint8x8_t m64_interpolation_high = vtbl1_u8(vget_high_u8(vreinterpretq_u8_u32(m128_interpolation)), m64_mask1);
4108
4109 const uint8x8_t m64_interpolation = vorr_u8(m64_interpolation_low, m64_interpolation_high);
4110
4111 // now we can store the following pattern as one block:
4112
4113 // 76 54 32 10
4114 // YA YA YA YA
4115
4116 vst1_u8((uint8_t*)targetPositionPixels, m64_interpolation);
4117}
4118
4119template <>
4120OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelNEON<3u>(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const DataType<uint8_t, 3u>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 3u>::Type* targetPositionPixels)
4121{
4122 ocean_assert(source != nullptr);
4123 ocean_assert(targetPositionPixels != nullptr);
4124
4125 // as we do not initialize the following intermediate data,
4126 // we hopefully will not allocate memory on the stack each time this function is called
4127 uint32_t topLeftPixels[4];
4128 uint32_t topRightPixels[4];
4129 uint32_t bottomLeftPixels[4];
4130 uint32_t bottomRightPixels[4];
4131
4132 // we will store the pixel information in the following pattern, note the padding byte after each pixel (here for RGB):
4133 // FEDCBA9876543210
4134 // BGR BGR BGR BGR
4135
4136 // we gather the individual source pixel values from the source image,
4137 // based on the calculated pixel locations
4138 for (unsigned int i = 0u; i < 4u; ++i)
4139 {
4140 if (validPixels[i])
4141 {
4142 memcpy(topLeftPixels + i, source + offsetsTopLeftElements[i], sizeof(uint8_t) * 3);
4143 memcpy(topRightPixels + i, source + offsetsTopRightElements[i], sizeof(uint8_t) * 3);
4144 memcpy(bottomLeftPixels + i, source + offsetsBottomLeftElements[i], sizeof(uint8_t) * 3);
4145 memcpy(bottomRightPixels + i, source + offsetsBottomRightElements[i], sizeof(uint8_t) * 3);
4146 }
4147 else
4148 {
4149 memcpy(topLeftPixels + i, &borderColor, sizeof(uint8_t) * 3);
4150 memcpy(topRightPixels + i, &borderColor, sizeof(uint8_t) * 3);
4151 memcpy(bottomLeftPixels + i, &borderColor, sizeof(uint8_t) * 3);
4152 memcpy(bottomRightPixels + i, &borderColor, sizeof(uint8_t) * 3);
4153 }
4154 }
4155
4156 static_assert(sizeof(uint32x4_t) == sizeof(topLeftPixels), "Invalid data type!");
4157
4158 const uint32x4_t m128_topLeftPixels = vld1q_u32(topLeftPixels);
4159 const uint32x4_t m128_topRightPixels = vld1q_u32(topRightPixels);
4160 const uint32x4_t m128_bottomLeftPixels = vld1q_u32(bottomLeftPixels);
4161 const uint32x4_t m128_bottomRightPixels = vld1q_u32(bottomRightPixels);
4162
4163
4164 // factorLeft = 128 - factorRight
4165 // factorTop = 128 - factorBottom
4166
4167 const uint32x4_t m128_factorsLeft = vsubq_u32(vdupq_n_u32(128u), m128_factorsRight);
4168 const uint32x4_t m128_factorsTop = vsubq_u32(vdupq_n_u32(128u), m128_factorsBottom);
4169
4170 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
4171 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
4172
4173 const uint32x4_t m128_factorsTopLeft = vmulq_u32(m128_factorsTop, m128_factorsLeft);
4174 const uint32x4_t m128_factorsTopRight = vmulq_u32(m128_factorsTop, m128_factorsRight);
4175 const uint32x4_t m128_factorsBottomLeft = vmulq_u32(m128_factorsBottom, m128_factorsLeft);
4176 const uint32x4_t m128_factorsBottomRight = vmulq_u32(m128_factorsBottom, m128_factorsRight);
4177
4178
4179 const uint32x4_t m128_maskFirstByte = vdupq_n_u32(0x000000FFu);
4180
4181 uint32x4_t m128_muliplicationChannel0 = vmulq_u32(vandq_u32(m128_topLeftPixels, m128_maskFirstByte), m128_factorsTopLeft);
4182 uint32x4_t m128_muliplicationChannel1 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topLeftPixels, 8), m128_maskFirstByte), m128_factorsTopLeft);
4183 uint32x4_t m128_muliplicationChannel2 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topLeftPixels, 16), m128_maskFirstByte), m128_factorsTopLeft);
4184
4185 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_topRightPixels, m128_maskFirstByte), m128_factorsTopRight));
4186 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topRightPixels, 8), m128_maskFirstByte), m128_factorsTopRight));
4187 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topRightPixels, 16), m128_maskFirstByte), m128_factorsTopRight));
4188
4189 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_bottomLeftPixels, m128_maskFirstByte), m128_factorsBottomLeft));
4190 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomLeftPixels, 8), m128_maskFirstByte), m128_factorsBottomLeft));
4191 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomLeftPixels, 16), m128_maskFirstByte), m128_factorsBottomLeft));
4192
4193 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_bottomRightPixels, m128_maskFirstByte), m128_factorsBottomRight));
4194 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomRightPixels, 8), m128_maskFirstByte), m128_factorsBottomRight));
4195 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomRightPixels, 16), m128_maskFirstByte), m128_factorsBottomRight));
4196
4197
4198 // we add 8192 and shift by 14 bits
4199
4200 const uint32x4_t m128_interpolation0 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel0, vdupq_n_u32(8192u)), 14);
4201 const uint32x4_t m128_interpolation1 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel1, vdupq_n_u32(8192u)), 14);
4202 const uint32x4_t m128_interpolation2 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel2, vdupq_n_u32(8192u)), 14);
4203
4204 // finaly we blend the interpolation results together
4205
4206 const uint32x4_t m128_interpolation = vorrq_u32(vorrq_u32(m128_interpolation0, vshlq_n_u32(m128_interpolation1, 8)), vshlq_n_u32(m128_interpolation2, 16));
4207
4208 // we have to extract the get rid of the padding byte:
4209 // FEDCBA9876543210
4210 // BGR BGR BGR BGR
4211
4212 uint32_t intermediateBuffer[4];
4213 vst1q_u32(intermediateBuffer, m128_interpolation);
4214
4215 for (unsigned int i = 0u; i < 4u; ++i)
4216 {
4217 memcpy(targetPositionPixels + i, intermediateBuffer + i, sizeof(uint8_t) * 3);
4218 }
4219}
4220
4221template <>
4222OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelNEON<4u>(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const DataType<uint8_t, 4u>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 4u>::Type* targetPositionPixels)
4223{
4224 ocean_assert(source != nullptr);
4225 ocean_assert(targetPositionPixels != nullptr);
4226
4227 using PixelType = typename DataType<uint8_t, 4u>::Type;
4228
4229 // as we do not initialize the following intermediate data,
4230 // we hopefully will not allocate memory on the stack each time this function is called
4231 PixelType topLeftPixels[4];
4232 PixelType topRightPixels[4];
4233 PixelType bottomLeftPixels[4];
4234 PixelType bottomRightPixels[4];
4235
4236 // we will store the pixel information in the following pattern (here for RGBA):
4237 // FEDC BA98 7654 3210
4238 // ABGR ABGR ABGR ABGR
4239
4240 // we gather the individual source pixel values from the source image,
4241 // based on the calculated pixel locations
4242 for (unsigned int i = 0u; i < 4u; ++i)
4243 {
4244 if (validPixels[i])
4245 {
4246 *(topLeftPixels + i) = *((const PixelType*)(source + offsetsTopLeftElements[i]));
4247 *(topRightPixels + i) = *((const PixelType*)(source + offsetsTopRightElements[i]));
4248 *(bottomLeftPixels + i) = *((const PixelType*)(source + offsetsBottomLeftElements[i]));
4249 *(bottomRightPixels + i) = *((const PixelType*)(source + offsetsBottomRightElements[i]));
4250 }
4251 else
4252 {
4253 *(topLeftPixels + i) = borderColor;
4254 *(topRightPixels + i) = borderColor;
4255 *(bottomLeftPixels + i) = borderColor;
4256 *(bottomRightPixels + i) = borderColor;
4257 }
4258 }
4259
4260 static_assert(sizeof(uint32x4_t) == sizeof(topLeftPixels), "Invalid data type!");
4261
4262 const uint32x4_t m128_topLeftPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)topLeftPixels));
4263 const uint32x4_t m128_topRightPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)topRightPixels));
4264 const uint32x4_t m128_bottomLeftPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)bottomLeftPixels));
4265 const uint32x4_t m128_bottomRightPixels = vreinterpretq_u32_u8(vld1q_u8((const uint8_t*)bottomRightPixels));
4266
4267
4268 // factorLeft = 128 - factorRight
4269 // factorTop = 128 - factorBottom
4270
4271 const uint32x4_t m128_factorsLeft = vsubq_u32(vdupq_n_u32(128u), m128_factorsRight);
4272 const uint32x4_t m128_factorsTop = vsubq_u32(vdupq_n_u32(128u), m128_factorsBottom);
4273
4274 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
4275 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
4276
4277 const uint32x4_t m128_factorsTopLeft = vmulq_u32(m128_factorsTop, m128_factorsLeft);
4278 const uint32x4_t m128_factorsTopRight = vmulq_u32(m128_factorsTop, m128_factorsRight);
4279 const uint32x4_t m128_factorsBottomLeft = vmulq_u32(m128_factorsBottom, m128_factorsLeft);
4280 const uint32x4_t m128_factorsBottomRight = vmulq_u32(m128_factorsBottom, m128_factorsRight);
4281
4282
4283 const uint32x4_t m128_maskFirstByte = vdupq_n_u32(0x000000FFu);
4284
4285 uint32x4_t m128_muliplicationChannel0 = vmulq_u32(vandq_u32(m128_topLeftPixels, m128_maskFirstByte), m128_factorsTopLeft);
4286 uint32x4_t m128_muliplicationChannel1 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topLeftPixels, 8), m128_maskFirstByte), m128_factorsTopLeft);
4287 uint32x4_t m128_muliplicationChannel2 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topLeftPixels, 16), m128_maskFirstByte), m128_factorsTopLeft);
4288 uint32x4_t m128_muliplicationChannel3 = vmulq_u32(vandq_u32(vshrq_n_u32(m128_topLeftPixels, 24), m128_maskFirstByte), m128_factorsTopLeft);
4289
4290 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_topRightPixels, m128_maskFirstByte), m128_factorsTopRight));
4291 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topRightPixels, 8), m128_maskFirstByte), m128_factorsTopRight));
4292 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topRightPixels, 16), m128_maskFirstByte), m128_factorsTopRight));
4293 m128_muliplicationChannel3 = vaddq_u32(m128_muliplicationChannel3, vmulq_u32(vandq_u32(vshrq_n_u32(m128_topRightPixels, 24), m128_maskFirstByte), m128_factorsTopRight));
4294
4295 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_bottomLeftPixels, m128_maskFirstByte), m128_factorsBottomLeft));
4296 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomLeftPixels, 8), m128_maskFirstByte), m128_factorsBottomLeft));
4297 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomLeftPixels, 16), m128_maskFirstByte), m128_factorsBottomLeft));
4298 m128_muliplicationChannel3 = vaddq_u32(m128_muliplicationChannel3, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomLeftPixels, 24), m128_maskFirstByte), m128_factorsBottomLeft));
4299
4300 m128_muliplicationChannel0 = vaddq_u32(m128_muliplicationChannel0, vmulq_u32(vandq_u32(m128_bottomRightPixels, m128_maskFirstByte), m128_factorsBottomRight));
4301 m128_muliplicationChannel1 = vaddq_u32(m128_muliplicationChannel1, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomRightPixels, 8), m128_maskFirstByte), m128_factorsBottomRight));
4302 m128_muliplicationChannel2 = vaddq_u32(m128_muliplicationChannel2, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomRightPixels, 16), m128_maskFirstByte), m128_factorsBottomRight));
4303 m128_muliplicationChannel3 = vaddq_u32(m128_muliplicationChannel3, vmulq_u32(vandq_u32(vshrq_n_u32(m128_bottomRightPixels, 24), m128_maskFirstByte), m128_factorsBottomRight));
4304
4305
4306 // we add 8192 and shift by 14 bits
4307
4308 const uint32x4_t m128_interpolation0 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel0, vdupq_n_u32(8192u)), 14);
4309 const uint32x4_t m128_interpolation1 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel1, vdupq_n_u32(8192u)), 14);
4310 const uint32x4_t m128_interpolation2 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel2, vdupq_n_u32(8192u)), 14);
4311 const uint32x4_t m128_interpolation3 = vshrq_n_u32(vaddq_u32(m128_muliplicationChannel3, vdupq_n_u32(8192u)), 14);
4312
4313 // finaly we blend the interpolation results together
4314
4315 const uint32x4_t m128_interpolation = vorrq_u32(vorrq_u32(m128_interpolation0, vshlq_n_u32(m128_interpolation1, 8)), vorrq_u32(vshlq_n_u32(m128_interpolation2, 16), vshlq_n_u32(m128_interpolation3, 24)));
4316
4317 vst1q_u8((uint8_t*)targetPositionPixels, vreinterpretq_u8_u32(m128_interpolation));
4318}
4319
4320OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels4Channel8BitPerChannelNEON(const uint8x16_t& topLeftPixels_u8x16, const uint8x16_t& topRightPixels_u8x16, const uint8x16_t& bottomLeftPixels_u8x16, const uint8x16_t& bottomRightPixels_u8x16, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, 4u>::Type* targetPositionPixels, const bool useOptimizedNEONFactorReplication)
4321{
4322 ocean_assert(targetPositionPixels != nullptr);
4323
4324 // Replicate per-pixel u32 interpolation factors [f0,f1,f2,f3] to per-channel u8 factors.
4325 // Both implementations produce: lo = [f0,f0,f0,f0, f1,f1,f1,f1], hi = [f2,f2,f2,f2, f3,f3,f3,f3]
4326
4327 uint8x8_t factorsRight_lo;
4328 uint8x8_t factorsRight_hi;
4329 uint8x8_t factorsLeft_lo;
4330 uint8x8_t factorsLeft_hi;
4331 uint8x8_t factorsBottom_lo;
4332 uint8x8_t factorsBottom_hi;
4333 uint8x8_t factorsTop_lo;
4334 uint8x8_t factorsTop_hi;
4335
4336#if defined(__aarch64__)
4337 // vqtbl1q_u8 is an AArch64-only intrinsic; ARMv7 always falls through to the original narrow+zip path.
4338 if (!useOptimizedNEONFactorReplication)
4339#else
4340 (void)useOptimizedNEONFactorReplication;
4341#endif
4342 {
4343 const uint8x8_t factorsRight_u8x8 = vmovn_u16(vcombine_u16(vmovn_u32(m128_factorsRight), vmovn_u32(m128_factorsRight)));
4344 const uint8x8x2_t factorsRight_zip1 = vzip_u8(factorsRight_u8x8, factorsRight_u8x8);
4345 const uint8x8x2_t factorsRight_zip2 = vzip_u8(factorsRight_zip1.val[0], factorsRight_zip1.val[0]);
4346
4347 factorsRight_lo = factorsRight_zip2.val[0]; // pixels 0-1
4348 factorsRight_hi = factorsRight_zip2.val[1]; // pixels 2-3
4349 factorsLeft_lo = vsub_u8(vdup_n_u8(128u), factorsRight_lo);
4350 factorsLeft_hi = vsub_u8(vdup_n_u8(128u), factorsRight_hi);
4351
4352 const uint8x8_t factorsBottom_u8x8 = vmovn_u16(vcombine_u16(vmovn_u32(m128_factorsBottom), vmovn_u32(m128_factorsBottom)));
4353 const uint8x8x2_t factorsBottom_zip1 = vzip_u8(factorsBottom_u8x8, factorsBottom_u8x8);
4354 const uint8x8x2_t factorsBottom_zip2 = vzip_u8(factorsBottom_zip1.val[0], factorsBottom_zip1.val[0]);
4355
4356 factorsBottom_lo = factorsBottom_zip2.val[0];
4357 factorsBottom_hi = factorsBottom_zip2.val[1];
4358 factorsTop_lo = vsub_u8(vdup_n_u8(128u), factorsBottom_lo);
4359 factorsTop_hi = vsub_u8(vdup_n_u8(128u), factorsBottom_hi);
4360 }
4361#if defined(__aarch64__)
4362 else
4363 {
4364 // Replicate per-pixel u32 factors [f0,f1,f2,f3] to per-channel u8 via TBL on the u32 register directly.
4365 // On little-endian ARM64, the low byte of each u32 lane holds the factor value (0-128).
4366 // Indices pick byte 0 of each u32 lane (offsets 0, 4, 8, 12) and replicate 4x per channel.
4367 static const uint8x16_t replicateU32Idx = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
4368
4369 const uint8x16_t factorsRight_rep = vqtbl1q_u8(vreinterpretq_u8_u32(m128_factorsRight), replicateU32Idx);
4370 const uint8x16_t factorsLeft_rep = vsubq_u8(vdupq_n_u8(128u), factorsRight_rep);
4371
4372 factorsRight_lo = vget_low_u8(factorsRight_rep);
4373 factorsRight_hi = vget_high_u8(factorsRight_rep);
4374 factorsLeft_lo = vget_low_u8(factorsLeft_rep);
4375 factorsLeft_hi = vget_high_u8(factorsLeft_rep);
4376
4377 const uint8x16_t factorsBottom_rep = vqtbl1q_u8(vreinterpretq_u8_u32(m128_factorsBottom), replicateU32Idx);
4378 const uint8x16_t factorsTop_rep = vsubq_u8(vdupq_n_u8(128u), factorsBottom_rep);
4379
4380 factorsBottom_lo = vget_low_u8(factorsBottom_rep);
4381 factorsBottom_hi = vget_high_u8(factorsBottom_rep);
4382 factorsTop_lo = vget_low_u8(factorsTop_rep);
4383 factorsTop_hi = vget_high_u8(factorsTop_rep);
4384 }
4385#endif // defined(__aarch64__)
4386
4387 // Split pixel data into halves: pixels 0-1 (lo) and pixels 2-3 (hi)
4388 const uint8x8_t topLeft_lo = vget_low_u8(topLeftPixels_u8x16);
4389 const uint8x8_t topLeft_hi = vget_high_u8(topLeftPixels_u8x16);
4390 const uint8x8_t topRight_lo = vget_low_u8(topRightPixels_u8x16);
4391 const uint8x8_t topRight_hi = vget_high_u8(topRightPixels_u8x16);
4392 const uint8x8_t bottomLeft_lo = vget_low_u8(bottomLeftPixels_u8x16);
4393 const uint8x8_t bottomLeft_hi = vget_high_u8(bottomLeftPixels_u8x16);
4394 const uint8x8_t bottomRight_lo = vget_low_u8(bottomRightPixels_u8x16);
4395 const uint8x8_t bottomRight_hi = vget_high_u8(bottomRightPixels_u8x16);
4396
4397 // --- Pixels 0-1 ---
4398 // Horizontal: topInterp = TL * fLeft + TR * fRight (u8 x u8 -> u16)
4399 const uint16x8_t topInterp_lo = vmlal_u8(vmull_u8(topLeft_lo, factorsLeft_lo), topRight_lo, factorsRight_lo);
4400 const uint16x8_t botInterp_lo = vmlal_u8(vmull_u8(bottomLeft_lo, factorsLeft_lo), bottomRight_lo, factorsRight_lo);
4401
4402 // Vertical: result = topInterp * fTop + botInterp * fBot (u16 x u16 -> u32)
4403 const uint16x8_t factorsTop_lo_u16 = vmovl_u8(factorsTop_lo);
4404 const uint16x8_t factorsBottom_lo_u16 = vmovl_u8(factorsBottom_lo);
4405
4406 const uint32x4_t result_lo_A = vmlal_u16(vmull_u16(vget_low_u16(topInterp_lo), vget_low_u16(factorsTop_lo_u16)), vget_low_u16(botInterp_lo), vget_low_u16(factorsBottom_lo_u16));
4407 const uint32x4_t result_lo_B = vmlal_u16(vmull_u16(vget_high_u16(topInterp_lo), vget_high_u16(factorsTop_lo_u16)), vget_high_u16(botInterp_lo), vget_high_u16(factorsBottom_lo_u16));
4408
4409 // --- Pixels 2-3 ---
4410 const uint16x8_t topInterp_hi = vmlal_u8(vmull_u8(topLeft_hi, factorsLeft_hi), topRight_hi, factorsRight_hi);
4411 const uint16x8_t botInterp_hi = vmlal_u8(vmull_u8(bottomLeft_hi, factorsLeft_hi), bottomRight_hi, factorsRight_hi);
4412
4413 const uint16x8_t factorsTop_hi_u16 = vmovl_u8(factorsTop_hi);
4414 const uint16x8_t factorsBottom_hi_u16 = vmovl_u8(factorsBottom_hi);
4415
4416 const uint32x4_t result_hi_A = vmlal_u16(vmull_u16(vget_low_u16(topInterp_hi), vget_low_u16(factorsTop_hi_u16)), vget_low_u16(botInterp_hi), vget_low_u16(factorsBottom_hi_u16));
4417 const uint32x4_t result_hi_B = vmlal_u16(vmull_u16(vget_high_u16(topInterp_hi), vget_high_u16(factorsTop_hi_u16)), vget_high_u16(botInterp_hi), vget_high_u16(factorsBottom_hi_u16));
4418
4419 // Round and narrow: (result + 8192) >> 14 via vrshrn_n_u32
4420 const uint16x4_t narrow_lo_A = vrshrn_n_u32(result_lo_A, 14);
4421 const uint16x4_t narrow_lo_B = vrshrn_n_u32(result_lo_B, 14);
4422 const uint16x4_t narrow_hi_A = vrshrn_n_u32(result_hi_A, 14);
4423 const uint16x4_t narrow_hi_B = vrshrn_n_u32(result_hi_B, 14);
4424
4425 const uint8x8_t result_lo = vmovn_u16(vcombine_u16(narrow_lo_A, narrow_lo_B));
4426 const uint8x8_t result_hi = vmovn_u16(vcombine_u16(narrow_hi_A, narrow_hi_B));
4427
4428 vst1q_u8((uint8_t*)targetPositionPixels, vcombine_u8(result_lo, result_hi));
4429}
4430
4431template <unsigned int tChannels>
4432OCEAN_FORCE_INLINE void FrameInterpolatorBilinear::interpolate4Pixels8BitPerChannelNEON(const uint8_t* source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const typename DataType<uint8_t, tChannels>::Type& borderColor, const uint32x4_t& m128_factorsRight, const uint32x4_t& m128_factorsBottom, typename DataType<uint8_t, tChannels>::Type* targetPositionPixels)
4433{
4434 ocean_assert(source != nullptr);
4435 ocean_assert(targetPositionPixels != nullptr);
4436
4437 // as we do not initialize the following intermediate data,
4438 // we hopefully will not allocate memory on the stack each time this function is called
4439 unsigned int factorsTopLeft[4];
4440 unsigned int factorsTopRight[4];
4441 unsigned int factorsBottomLeft[4];
4442 unsigned int factorsBottomRight[4];
4443
4444
4445 // factorLeft = 128 - factorRight
4446 // factorTop = 128 - factorBottom
4447
4448 const uint32x4_t m128_factorsLeft = vsubq_u32(vdupq_n_u32(128u), m128_factorsRight);
4449 const uint32x4_t m128_factorsTop = vsubq_u32(vdupq_n_u32(128u), m128_factorsBottom);
4450
4451 // (top_left * factorLeft + top_right * factorRight) * factorTop + (bottom_left * factorLeft + bottom_right * factorRight) * factorBottom
4452 // == top_left * factorTopLeft + top_right * factorTopRight + bottom_left * factorBottomLeft + bottom_right * factorBottomRight
4453
4454 const uint32x4_t m128_factorsTopLeft = vmulq_u32(m128_factorsTop, m128_factorsLeft);
4455 const uint32x4_t m128_factorsTopRight = vmulq_u32(m128_factorsTop, m128_factorsRight);
4456 const uint32x4_t m128_factorsBottomLeft = vmulq_u32(m128_factorsBottom, m128_factorsLeft);
4457 const uint32x4_t m128_factorsBottomRight = vmulq_u32(m128_factorsBottom, m128_factorsRight);
4458
4459
4460 // we store the interpolation factors
4461 vst1q_u32(factorsTopLeft, m128_factorsTopLeft);
4462 vst1q_u32(factorsTopRight, m128_factorsTopRight);
4463 vst1q_u32(factorsBottomLeft, m128_factorsBottomLeft);
4464 vst1q_u32(factorsBottomRight, m128_factorsBottomRight);
4465
4466 for (unsigned int i = 0u; i < 4u; ++i)
4467 {
4468 if (validPixels[i])
4469 {
4470 const uint8_t* topLeft = source + offsetsTopLeftElements[i];
4471 const uint8_t* topRight = source + offsetsTopRightElements[i];
4472
4473 const uint8_t* bottomLeft = source + offsetsBottomLeftElements[i];
4474 const uint8_t* bottomRight = source + offsetsBottomRightElements[i];
4475
4476 const unsigned int& factorTopLeft = factorsTopLeft[i];
4477 const unsigned int& factorTopRight = factorsTopRight[i];
4478 const unsigned int& factorBottomLeft = factorsBottomLeft[i];
4479 const unsigned int& factorBottomRight = factorsBottomRight[i];
4480
4481 for (unsigned int n = 0u; n < tChannels; ++n)
4482 {
4483 ((uint8_t*)targetPositionPixels)[n] = (topLeft[n] * factorTopLeft + topRight[n] * factorTopRight + bottomLeft[n] * factorBottomLeft + bottomRight[n] * factorBottomRight + 8192u) >> 14u;
4484 }
4485 }
4486 else
4487 {
4488 *targetPositionPixels = borderColor;
4489 }
4490
4491 targetPositionPixels++;
4492 }
4493}
4494
4495#endif // OCEAN_HARDWARE_NEON_VERSION
4496
4497template <unsigned int tChannels>
4498inline void FrameInterpolatorBilinear::homographies8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* homographies, const uint8_t* borderColor, uint8_t* output, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
4499{
4500 static_assert(tChannels >= 1u, "Invalid channel number!");
4501
4502 ocean_assert(input && output);
4503 ocean_assert(inputWidth > 0u && inputHeight > 0u);
4504 ocean_assert(outputWidth > 0u && outputHeight > 0u);
4505
4506 ocean_assert(outputQuadrantCenterX >= 0 && outputQuadrantCenterX < Scalar(outputWidth));
4507 ocean_assert(outputQuadrantCenterY >= 0 && outputQuadrantCenterY < Scalar(outputHeight));
4508 ocean_assert(homographies);
4509
4510 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
4511
4512 const Scalar scalarInputWidth_1 = Scalar(inputWidth - 1u);
4513 const Scalar scalarInputHeight_1 = Scalar(inputHeight - 1u);
4514
4515 constexpr uint8_t zeroColor[tChannels] = {uint8_t(0)};
4516 const uint8_t* const bColor = borderColor ? borderColor : zeroColor;
4517
4518 uint8_t* outputData = output + firstOutputRow * outputStrideElements;
4519
4520 const Scalar left = Scalar(outputQuadrantCenterX) * Scalar(0.5);
4521 const Scalar right = (Scalar(outputWidth) + Scalar(outputQuadrantCenterX)) * Scalar(0.5);
4522
4523 const Scalar top = Scalar(outputQuadrantCenterY) * Scalar(0.5);
4524 const Scalar bottom = (Scalar(outputHeight) + Scalar(outputQuadrantCenterY)) * Scalar(0.5);
4525
4526 ocean_assert(right - left > Numeric::eps());
4527 ocean_assert(bottom - top > Numeric::eps());
4528
4529 const Scalar invWidth = Scalar(1) / Scalar(right - left);
4530 const Scalar invHeight = Scalar(1) / Scalar(bottom - top);
4531
4532 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
4533 {
4534 for (unsigned int x = 0; x < outputWidth; ++x)
4535 {
4536 Vector2 outputPosition = Vector2(Scalar(int(x)), Scalar(int(y)));
4537
4538 const Scalar _tx = minmax<Scalar>(0, (outputPosition.x() - left) * invWidth, 1);
4539 const Scalar _ty = minmax<Scalar>(0, (outputPosition.y() - top) * invHeight, 1);
4540
4541 outputPosition += Vector2(Scalar(outputOriginX), Scalar(outputOriginY));
4542
4543 const Scalar tx = 1 - _tx;
4544 const Scalar ty = 1 - _ty;
4545
4546 const Vector2 inputPositionTopLeft(homographies[0] * outputPosition);
4547 const Vector2 inputPositionTopRight(homographies[1] * outputPosition);
4548 const Vector2 inputPositionBottomLeft(homographies[2] * outputPosition);
4549 const Vector2 inputPositionBottomRight(homographies[3] * outputPosition);
4550
4551 const Scalar tTopLeft = tx * ty;
4552 const Scalar tTopRight = _tx * ty;
4553 const Scalar tBottomLeft = tx * _ty;
4554 const Scalar tBottomRight = _tx * _ty;
4555
4556 const Vector2 inputPosition = inputPositionTopLeft * tTopLeft + inputPositionTopRight * tTopRight
4557 + inputPositionBottomLeft * tBottomLeft + inputPositionBottomRight * tBottomRight;
4558
4559 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
4560 {
4561 for (unsigned int c = 0u; c < tChannels; ++c)
4562 {
4563 outputData[c] = bColor[c];
4564 }
4565 }
4566 else
4567 {
4568 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, outputData);
4569 }
4570
4571 outputData += tChannels;
4572 }
4573
4574 outputData += outputPaddingElements;
4575 }
4576}
4577
4578template <unsigned int tChannels>
4579void FrameInterpolatorBilinear::homographyMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* input_H_output, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, unsigned int firstOutputRow, const unsigned int numberOutputRows)
4580{
4581 static_assert(tChannels >= 1u, "Invalid channel number!");
4582
4583 ocean_assert(input != nullptr && output != nullptr);
4584 ocean_assert(inputWidth > 0u && inputHeight > 0u);
4585 ocean_assert(outputWidth > 0u && outputHeight > 0u);
4586 ocean_assert(input_H_output != nullptr);
4587
4588 ocean_assert_and_suppress_unused(firstOutputRow + numberOutputRows <= outputHeight, outputHeight);
4589
4590 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
4591 const unsigned int outputMaskStrideElements = outputWidth + outputMaskPaddingElements;
4592
4593 const Scalar scalarInputWidth_1 = Scalar(inputWidth - 1u);
4594 const Scalar scalarInputHeight_1 = Scalar(inputHeight - 1u);
4595
4596 using PixelType = typename DataType<uint8_t, tChannels>::Type;
4597
4598 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
4599 {
4600 PixelType* outputData = (PixelType*)(output + y * outputStrideElements);
4601 uint8_t* outputMaskData = outputMask + y * outputMaskStrideElements;
4602
4603 /*
4604 * We can slightly optimize the 3x3 matrix multiplication:
4605 *
4606 * | X0 Y0 Z0 | | x |
4607 * | X1 Y1 Z1 | * | y |
4608 * | X2 Y2 Z2 | | 1 |
4609 *
4610 * | x' | | X0 * x | | Y0 * y + Z0 |
4611 * | y' | = | X1 * x | + | Y1 * y + Z1 |
4612 * | z' | | X2 * x | | Y2 * y + Z2 |
4613 *
4614 * As y is constant within the inner loop, we can pre-calculate the following terms:
4615 *
4616 * | x' | | (X0 * x + constValue0) / (X2 * x + constValue2) |
4617 * | y' | = | (X1 * x + constValue1) / (X2 * x + constValue2) |
4618 *
4619 * | p | = | (X * x + c) / (X2 * x + constValue2) |
4620 */
4621
4622 const Vector2 X(input_H_output->data() + 0);
4623 const Vector2 c(Vector2(input_H_output->data() + 3) * Scalar(y) + Vector2(input_H_output->data() + 6));
4624
4625 const Scalar X2 = (*input_H_output)(2, 0);
4626 const Scalar constValue2 = (*input_H_output)(2, 1) * Scalar(y) + (*input_H_output)(2, 2);
4627
4628 for (unsigned int x = 0; x < outputWidth; ++x)
4629 {
4630 const Vector2 inputPosition((X * Scalar(x) + c) / (X2 * Scalar(x) + constValue2));
4631
4632#ifdef OCEAN_DEBUG
4633 const Vector2 debugInputPosition(*input_H_output * Vector2(Scalar(x), Scalar(y)));
4634 ocean_assert(inputPosition.isEqual(debugInputPosition, Scalar(0.01)));
4635#endif
4636
4637 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
4638 {
4639 *outputMaskData = 0xFF - maskValue;
4640 }
4641 else
4642 {
4643 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (uint8_t*)(outputData));
4644 *outputMaskData = maskValue;
4645 }
4646
4647 outputData++;
4648 outputMaskData++;
4649 }
4650 }
4651}
4652
4653template <unsigned int tChannels>
4654inline void FrameInterpolatorBilinear::homographiesMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3* homographies, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
4655{
4656 static_assert(tChannels >= 1u, "Invalid channel number!");
4657
4658 ocean_assert(input && output);
4659 ocean_assert(inputWidth > 0u && inputHeight > 0u);
4660 ocean_assert(outputWidth > 0u && outputHeight > 0u);
4661
4662 ocean_assert(outputQuadrantCenterX >= 0 && outputQuadrantCenterX < Scalar(outputWidth));
4663 ocean_assert(outputQuadrantCenterY >= 0 && outputQuadrantCenterY < Scalar(outputHeight));
4664 ocean_assert(homographies);
4665
4666 const unsigned int outputStrideElements = tChannels * outputWidth + outputPaddingElements;
4667 const unsigned int outputMaskStrideElements = outputWidth + outputMaskPaddingElements;
4668
4669 const Scalar scalarInputWidth_1 = Scalar(inputWidth - 1u);
4670 const Scalar scalarInputHeight_1 = Scalar(inputHeight - 1u);
4671
4672 uint8_t* outputData = output + firstOutputRow * outputStrideElements;
4673 outputMask += firstOutputRow * outputMaskStrideElements;
4674
4675 const Scalar left = Scalar(outputQuadrantCenterX) * Scalar(0.5);
4676 const Scalar right = (Scalar(outputWidth) + Scalar(outputQuadrantCenterX)) * Scalar(0.5);
4677
4678 const Scalar top = Scalar(outputQuadrantCenterY) * Scalar(0.5);
4679 const Scalar bottom = (Scalar(outputHeight) + Scalar(outputQuadrantCenterY)) * Scalar(0.5);
4680
4681 ocean_assert(right - left > Numeric::eps());
4682 ocean_assert(bottom - top > Numeric::eps());
4683
4684 const Scalar invWidth = Scalar(1) / Scalar(right - left);
4685 const Scalar invHeight = Scalar(1) / Scalar(bottom - top);
4686
4687 for (unsigned int y = firstOutputRow; y < firstOutputRow + numberOutputRows; ++y)
4688 {
4689 for (unsigned int x = 0u; x < outputWidth; ++x)
4690 {
4691 Vector2 outputPosition = Vector2(Scalar(int(x)), Scalar(int(y)));
4692
4693 const Scalar _tx = minmax<Scalar>(0, (outputPosition.x() - left) * invWidth, 1);
4694 const Scalar _ty = minmax<Scalar>(0, (outputPosition.y() - top) * invHeight, 1);
4695
4696 outputPosition += Vector2(Scalar(outputOriginX), Scalar(outputOriginY));
4697
4698 const Scalar tx = 1 - _tx;
4699 const Scalar ty = 1 - _ty;
4700
4701 const Vector2 inputPositionTopLeft(homographies[0] * outputPosition);
4702 const Vector2 inputPositionTopRight(homographies[1] * outputPosition);
4703 const Vector2 inputPositionBottomLeft(homographies[2] * outputPosition);
4704 const Vector2 inputPositionBottomRight(homographies[3] * outputPosition);
4705
4706 const Scalar tTopLeft = tx * ty;
4707 const Scalar tTopRight = _tx * ty;
4708 const Scalar tBottomLeft = tx * _ty;
4709 const Scalar tBottomRight = _tx * _ty;
4710
4711 const Vector2 inputPosition = inputPositionTopLeft * tTopLeft + inputPositionTopRight * tTopRight
4712 + inputPositionBottomLeft * tBottomLeft + inputPositionBottomRight * tBottomRight;
4713
4714 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
4715 {
4716 *outputMask = 0xFFu - maskValue;
4717 }
4718 else
4719 {
4720 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, outputData);
4721 *outputMask = maskValue;
4722 }
4723
4724 outputData += tChannels;
4725 outputMask++;
4726 }
4727
4728 outputData += outputPaddingElements;
4729 outputMask += outputMaskPaddingElements;
4730 }
4731}
4732
4733template <unsigned int tChannels>
4734void FrameInterpolatorBilinear::homographyWithCamera8BitPerChannelSubset(const PinholeCamera* inputCamera, const PinholeCamera* outputCamera, const PinholeCamera::DistortionLookup* outputCameraDistortionLookup, const uint8_t* input, const SquareMatrix3* normalizedHomography, const bool useDistortionParameters, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
4735{
4736 static_assert(tChannels >= 1u, "Invalid channel number!");
4737
4738 ocean_assert(inputCamera && outputCamera && normalizedHomography);
4739 ocean_assert(input && output);
4740
4741 ocean_assert(firstRow + numberRows <= outputCamera->height());
4742
4743 const unsigned int outputStrideElements = tChannels * outputCamera->width() + outputPaddingElements;
4744
4745 const Scalar scalarInputWidth_1 = Scalar(inputCamera->width() - 1u);
4746 const Scalar scalarInputHeight_1 = Scalar(inputCamera->height() - 1u);
4747
4748 const SquareMatrix3 combinedMatrix(*normalizedHomography * outputCamera->invertedIntrinsic());
4749
4750 using PixelType = typename DataType<uint8_t, tChannels>::Type;
4751
4752 const uint8_t zeroColor[tChannels] = {uint8_t(0)};
4753 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
4754
4755 uint8_t* outputData = output + firstRow * outputStrideElements;
4756
4757 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
4758 {
4759 for (unsigned int x = 0; x < outputCamera->width(); ++x)
4760 {
4761 const Vector2 inputPosition(inputCamera->normalizedImagePoint2imagePoint<true>(combinedMatrix * outputCameraDistortionLookup->undistortedImagePoint(Vector2(Scalar(x), Scalar(y))), useDistortionParameters));
4762
4763 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
4764 {
4765 *((PixelType*)outputData) = *bColor;
4766 }
4767 else
4768 {
4769 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputCamera->width(), inputCamera->height(), inputPaddingElements, inputPosition, outputData);
4770 }
4771
4772 outputData += tChannels;
4773 }
4774
4775 outputData += outputPaddingElements;
4776 }
4777}
4778
4779template <unsigned int tChannels>
4780void FrameInterpolatorBilinear::homographyWithCameraMask8BitPerChannelSubset(const PinholeCamera* inputCamera, const PinholeCamera* outputCamera, const PinholeCamera::DistortionLookup* outputCameraDistortionLookup, const uint8_t* input, const unsigned int inputPaddingElements, const SquareMatrix3* normalizedHomography, uint8_t* output, uint8_t* outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const uint8_t maskValue, const unsigned int firstRow, const unsigned int numberRows)
4781{
4782 static_assert(tChannels >= 1u, "Invalid channel number!");
4783
4784 ocean_assert(inputCamera != nullptr && outputCamera != nullptr && normalizedHomography != nullptr);
4785 ocean_assert(input != nullptr && output != nullptr);
4786
4787 ocean_assert(firstRow + numberRows <= outputCamera->height());
4788
4789 const unsigned int outputStrideElements = outputCamera->width() * tChannels + outputPaddingElements;
4790 const unsigned int outputMaskStrideElements = outputCamera->width() + outputMaskPaddingElements;
4791
4792 const Scalar scalarInputWidth_1 = Scalar(inputCamera->width() - 1u);
4793 const Scalar scalarInputHeight_1 = Scalar(inputCamera->height() - 1u);
4794
4795 const SquareMatrix3 combinedMatrix(*normalizedHomography * outputCamera->invertedIntrinsic());
4796
4797 uint8_t* outputData = output + firstRow * outputStrideElements;
4798 outputMask += firstRow * outputMaskStrideElements;
4799
4800 constexpr bool useDistortionParameters = true;
4801
4802 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
4803 {
4804 for (unsigned int x = 0; x < outputCamera->width(); ++x)
4805 {
4806 const Vector2 inputPosition(inputCamera->normalizedImagePoint2imagePoint<true>(combinedMatrix * outputCameraDistortionLookup->undistortedImagePoint(Vector2(Scalar(x), Scalar(y))), useDistortionParameters));
4807
4808 if (inputPosition.x() < Scalar(0) || inputPosition.x() > scalarInputWidth_1 || inputPosition.y() < Scalar(0) || inputPosition.y() > scalarInputHeight_1)
4809 {
4810 *outputMask = 0xFF - maskValue;
4811 }
4812 else
4813 {
4814 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputCamera->width(), inputCamera->height(), inputPaddingElements, inputPosition, outputData);
4815 *outputMask = maskValue;
4816 }
4817
4818 outputData += tChannels;
4819 ++outputMask;
4820 }
4821
4822 outputData += outputPaddingElements;
4823 outputMask += outputMaskPaddingElements;
4824 }
4825}
4826
4827template <unsigned int tChannels>
4828void FrameInterpolatorBilinear::lookup8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
4829{
4830 static_assert(tChannels >= 1u, "Invalid channel number!");
4831
4832 ocean_assert(input_LT_output != nullptr);
4833 ocean_assert(input != nullptr && output != nullptr);
4834
4835 ocean_assert(inputWidth != 0u && inputHeight != 0u);
4836 ocean_assert(firstRow + numberRows <= input_LT_output->sizeY());
4837
4838 using PixelType = typename DataType<uint8_t, tChannels>::Type;
4839
4840 const uint8_t zeroColor[tChannels] = {uint8_t(0)};
4841 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
4842
4843 const unsigned int columns = (unsigned int)(input_LT_output->sizeX());
4844
4845 const unsigned int outputStrideElements = tChannels * columns + outputPaddingElements;
4846
4847 static_assert(std::is_same<Vector2, LookupTable::Type>::value, "Invalid data type!");
4848
4849 const Scalar inputWidth1 = Scalar(inputWidth - 1u);
4850 const Scalar inputHeight1 = Scalar(inputHeight - 1u);
4851
4852 Memory rowLookupMemory = Memory::create<Vector2>(columns);
4853 Vector2* const rowLookupData = rowLookupMemory.data<Vector2>();
4854
4855 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
4856 {
4857 input_LT_output->bilinearValues(y, rowLookupData);
4858
4859 PixelType* outputData = (PixelType*)(output + y * outputStrideElements);
4860
4861 for (unsigned int x = 0u; x < columns; ++x)
4862 {
4863 const Vector2& lookupValue = rowLookupData[x];
4864
4865 const Vector2 inputPosition = offset ? Vector2(Scalar(x) + lookupValue.x(), Scalar(y) + lookupValue.y()) : lookupValue;
4866
4867 if (inputPosition.x() >= Scalar(0) && inputPosition.y() >= Scalar(0) && inputPosition.x() <= inputWidth1 && inputPosition.y() <= inputHeight1)
4868 {
4869 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (uint8_t*)(outputData));
4870 }
4871 else
4872 {
4873 *outputData = *bColor;
4874 }
4875
4876 outputData++;
4877 }
4878 }
4879}
4880
4881template <typename T, unsigned int tChannels>
4882void FrameInterpolatorBilinear::lookupSubset(const T* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const T* borderColor, T* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
4883{
4884 static_assert(tChannels >= 1u, "Invalid channel number!");
4885
4886 ocean_assert((!std::is_same<uint8_t, T>::value));
4887
4888 ocean_assert(input_LT_output != nullptr);
4889 ocean_assert(input != nullptr && output != nullptr);
4890
4891 ocean_assert(inputWidth != 0u && inputHeight != 0u);
4892 ocean_assert(firstRow + numberRows <= input_LT_output->sizeY());
4893
4894 using PixelType = typename DataType<T, tChannels>::Type;
4895
4896 const T zeroColor[tChannels] = {T(0)};
4897 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
4898
4899 const unsigned int columns = (unsigned int)(input_LT_output->sizeX());
4900
4901 const unsigned int outputStrideElements = tChannels * columns + outputPaddingElements;
4902
4903 static_assert(std::is_same<Vector2, LookupTable::Type>::value, "Invalid data type!");
4904
4905 const Scalar inputWidth1 = Scalar(inputWidth - 1u);
4906 const Scalar inputHeight1 = Scalar(inputHeight - 1u);
4907
4908 Memory rowLookupMemory = Memory::create<Vector2>(columns);
4909 Vector2* const rowLookupData = rowLookupMemory.data<Vector2>();
4910
4911 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
4912 {
4913 input_LT_output->bilinearValues(y, rowLookupData);
4914
4915 PixelType* outputData = (PixelType*)(output + y * outputStrideElements);
4916
4917 for (unsigned int x = 0u; x < columns; ++x)
4918 {
4919 const Vector2& lookupValue = rowLookupData[x];
4920
4921 const Vector2 inputPosition = offset ? Vector2(Scalar(x) + lookupValue.x(), Scalar(y) + lookupValue.y()) : lookupValue;
4922
4923 if (inputPosition.x() >= Scalar(0) && inputPosition.y() >= Scalar(0) && inputPosition.x() <= inputWidth1 && inputPosition.y() <= inputHeight1)
4924 {
4925 interpolatePixel<T, T, tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (T*)(outputData));
4926 }
4927 else
4928 {
4929 *outputData = *bColor;
4930 }
4931
4932 outputData++;
4933 }
4934 }
4935}
4936
4937#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
4938
4939template <>
4940inline void FrameInterpolatorBilinear::lookup8BitPerChannelSubsetNEON<1u>(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows, const bool useOptimizedNEON, const bool useOptimizedBilinearValuesAndFactorCalculation, const bool /*useOptimizedNEONFactorReplication*/)
4941{
4942 ocean_assert(input_LT_output != nullptr);
4943 ocean_assert(input != nullptr && output != nullptr);
4944
4945 ocean_assert(inputWidth != 0u && inputHeight != 0u);
4946 ocean_assert(firstRow + numberRows <= input_LT_output->sizeY());
4947
4948 using PixelType = uint8_t;
4949
4950 const uint8x16_t constantBorderColor_u_8x16 = vdupq_n_u8(borderColor ? *borderColor : 0u);
4951
4952 const unsigned int outputWidth = (unsigned int)(input_LT_output->sizeX());
4953 ocean_assert(outputWidth >= 8u);
4954
4955 static_assert(std::is_same<Vector2, LookupTable::Type>::value, "Invalid data type!");
4956
4957 const unsigned int inputStrideElements = inputWidth + inputPaddingElements;
4958 const unsigned int outputStrideElements = outputWidth + outputPaddingElements;
4959
4960 Memory rowLookupMemory = Memory::create<VectorF2>(outputWidth);
4961 VectorF2* const rowLookupData = rowLookupMemory.data<VectorF2>();
4962
4963 const float32x4_t constantZero_f_32x4 = vdupq_n_f32(0.0f); // [0.0f, 0.0f, 0.0f, 0.0f]
4964 const float32x4_t constantEight_f_32x4 = vdupq_n_f32(8.0f); // [4.0f, 4.0f, 4.0f, 4.0f]
4965
4966 // [0.0f, 1.0f, 2.0f, 3.0f, ...]
4967 const float f_01234567[8] = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
4968 const float32x4_t conststant0123_f_32x4 = vld1q_f32(f_01234567 + 0);
4969 const float32x4_t conststant4567_f_32x4 = vld1q_f32(f_01234567 + 4);
4970
4971 const float32x4_t constant128_f_32x4 = vdupq_n_f32(128.0f);
4972
4973 const uint32x4_t constantOne_u_32x4 = vdupq_n_u32(1u);
4974
4975 const uint32x4_t constantChannels_u_32x4 = vdupq_n_u32(1u);
4976
4977 const float32x4_t constantInputWidth1_f_32x4 = vdupq_n_f32(float(inputWidth - 1u));
4978 const float32x4_t constantInputHeight1_f_32x4 = vdupq_n_f32(float(inputHeight - 1u));
4979
4980 const uint32x4_t constantInputStrideElements_u_32x4 = vdupq_n_u32(inputStrideElements);
4981 const uint32x4_t constantInputHeight1_u_32x4 = vdupq_n_u32(inputHeight - 1u);
4982
4983 unsigned int validPixels[8];
4984
4985 unsigned int topLeftOffsetsElements[8];
4986 unsigned int bottomLeftOffsetsElements[8];
4987
4988 uint8_t pixels[32];
4989
4990 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
4991 {
4992 PixelType* outputPixelData = (PixelType*)(output + y * outputStrideElements);
4993
4994 input_LT_output->bilinearValues<VectorF2>(y, rowLookupData);
4995
4996 float32x4_t additionalInputOffsetX0123_f_32x4 = conststant0123_f_32x4;
4997 float32x4_t additionalInputOffsetX4567_f_32x4 = conststant4567_f_32x4;
4998
4999 const float32x4_t additionalInputOffsetY_f_32x4 = vdupq_n_f32(float(y));
5000
5001 for (unsigned int x = 0u; x < outputWidth; x += 8u)
5002 {
5003 if (x + 8u > outputWidth)
5004 {
5005 // the last iteration will not fit into the output frame,
5006 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
5007
5008 ocean_assert(x >= 8u && outputWidth > 8u);
5009 const unsigned int newX = outputWidth - 8u;
5010
5011 ocean_assert(x > newX);
5012 const unsigned int xOffset = x - newX;
5013
5014 outputPixelData -= xOffset;
5015
5016 if (offset)
5017 {
5018 additionalInputOffsetX0123_f_32x4 = vsubq_f32(additionalInputOffsetX0123_f_32x4, vdupq_n_f32(float(xOffset)));
5019 additionalInputOffsetX4567_f_32x4 = vsubq_f32(additionalInputOffsetX4567_f_32x4, vdupq_n_f32(float(xOffset)));
5020 }
5021
5022 x = newX;
5023
5024 // the for loop will stop after this iteration
5025 ocean_assert(!(x + 8u < outputWidth));
5026 }
5027
5028 const float32x4x2_t inputPositions0123_f_32x4x2 = vld2q_f32((const float*)(rowLookupData + x + 0u));
5029 const float32x4x2_t inputPositions4567_f_32x4x2 = vld2q_f32((const float*)(rowLookupData + x + 4u));
5030
5031 float32x4_t inputPositionsX0123_f_32x4 = inputPositions0123_f_32x4x2.val[0];
5032 float32x4_t inputPositionsY0123_f_32x4 = inputPositions0123_f_32x4x2.val[1];
5033
5034 float32x4_t inputPositionsX4567_f_32x4 = inputPositions4567_f_32x4x2.val[0];
5035 float32x4_t inputPositionsY4567_f_32x4 = inputPositions4567_f_32x4x2.val[1];
5036
5037 if (offset)
5038 {
5039 inputPositionsX0123_f_32x4 = vaddq_f32(inputPositionsX0123_f_32x4, additionalInputOffsetX0123_f_32x4);
5040 inputPositionsY0123_f_32x4 = vaddq_f32(inputPositionsY0123_f_32x4, additionalInputOffsetY_f_32x4);
5041
5042 inputPositionsX4567_f_32x4 = vaddq_f32(inputPositionsX4567_f_32x4, additionalInputOffsetX4567_f_32x4);
5043 inputPositionsY4567_f_32x4 = vaddq_f32(inputPositionsY4567_f_32x4, additionalInputOffsetY_f_32x4);
5044
5045 additionalInputOffsetX0123_f_32x4 = vaddq_f32(additionalInputOffsetX0123_f_32x4, constantEight_f_32x4);
5046 additionalInputOffsetX4567_f_32x4 = vaddq_f32(additionalInputOffsetX4567_f_32x4, constantEight_f_32x4);
5047 }
5048
5049 // now we check whether we are inside the input frame
5050 const uint32x4_t validPixelsX0123_u_32x4 = vandq_u32(vcltq_f32(inputPositionsX0123_f_32x4, constantInputWidth1_f_32x4), vcgeq_f32(inputPositionsX0123_f_32x4, constantZero_f_32x4)); // inputPosition.x() >= 0 && inputPosition.x() < (inputWidth - 1) ? 0xFFFFFF : 0x000000
5051 const uint32x4_t validPixelsX4567_u_32x4 = vandq_u32(vcltq_f32(inputPositionsX4567_f_32x4, constantInputWidth1_f_32x4), vcgeq_f32(inputPositionsX4567_f_32x4, constantZero_f_32x4));
5052
5053 const uint32x4_t validPixelsY0123_u_32x4 = vandq_u32(vcltq_f32(inputPositionsY0123_f_32x4, constantInputHeight1_f_32x4), vcgeq_f32(inputPositionsY0123_f_32x4, constantZero_f_32x4)); // inputPosition.y() >= 0 && inputPosition.y() < (inputHeight - 1) ? 0xFFFFFF : 0x000000
5054 const uint32x4_t validPixelsY4567_u_32x4 = vandq_u32(vcltq_f32(inputPositionsY4567_f_32x4, constantInputHeight1_f_32x4), vcgeq_f32(inputPositionsY4567_f_32x4, constantZero_f_32x4));
5055
5056 const uint32x4_t validPixels0123_u_32x4 = vandq_u32(validPixelsX0123_u_32x4, validPixelsY0123_u_32x4); // is_inside_input_frame(inputPosition) ? 0xFFFFFF : 0x000000
5057 const uint32x4_t validPixels4567_u_32x4 = vandq_u32(validPixelsX4567_u_32x4, validPixelsY4567_u_32x4);
5058
5059 vst1q_u32(validPixels + 0, validPixels0123_u_32x4);
5060 vst1q_u32(validPixels + 4, validPixels4567_u_32x4);
5061
5062
5063 const uint32x4_t inputPositionsLeft0123_u_32x4 = vcvtq_u32_f32(inputPositionsX0123_f_32x4);
5064 const uint32x4_t inputPositionsLeft4567_u_32x4 = vcvtq_u32_f32(inputPositionsX4567_f_32x4);
5065
5066 const uint32x4_t inputPositionsTop0123_u_32x4 = vcvtq_u32_f32(inputPositionsY0123_f_32x4);
5067 const uint32x4_t inputPositionsTop4567_u_32x4 = vcvtq_u32_f32(inputPositionsY4567_f_32x4);
5068
5069 const uint32x4_t inputPositionsBottom0123_u_32x4 = vminq_u32(vaddq_u32(inputPositionsTop0123_u_32x4, constantOne_u_32x4), constantInputHeight1_u_32x4);
5070 const uint32x4_t inputPositionsBottom4567_u_32x4 = vminq_u32(vaddq_u32(inputPositionsTop4567_u_32x4, constantOne_u_32x4), constantInputHeight1_u_32x4);
5071
5072
5073 const uint32x4_t topLeftOffsetsElements0123_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft0123_u_32x4, constantChannels_u_32x4), inputPositionsTop0123_u_32x4, constantInputStrideElements_u_32x4); // topLeftOffset = top * strideElements + left * channels
5074 vst1q_u32(topLeftOffsetsElements + 0, topLeftOffsetsElements0123_u_32x4);
5075 const uint32x4_t topLeftOffsetsElements4567_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft4567_u_32x4, constantChannels_u_32x4), inputPositionsTop4567_u_32x4, constantInputStrideElements_u_32x4);
5076 vst1q_u32(topLeftOffsetsElements + 4, topLeftOffsetsElements4567_u_32x4);
5077
5078 const uint32x4_t bottomLeftOffsetsElements0123_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft0123_u_32x4, constantChannels_u_32x4), inputPositionsBottom0123_u_32x4, constantInputStrideElements_u_32x4);
5079 vst1q_u32(bottomLeftOffsetsElements + 0, bottomLeftOffsetsElements0123_u_32x4);
5080 const uint32x4_t bottomLeftOffsetsElements4567_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft4567_u_32x4, constantChannels_u_32x4), inputPositionsBottom4567_u_32x4, constantInputStrideElements_u_32x4);
5081 vst1q_u32(bottomLeftOffsetsElements + 4, bottomLeftOffsetsElements4567_u_32x4);
5082
5083
5084 // we determine the fractional portions of the x' and y' and [0.0, 1.0] -> [0, 128]
5085 float32x4_t tx0123_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsX0123_f_32x4, vcvtq_f32_u32(inputPositionsLeft0123_u_32x4)), constant128_f_32x4);
5086 float32x4_t tx4567_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsX4567_f_32x4, vcvtq_f32_u32(inputPositionsLeft4567_u_32x4)), constant128_f_32x4);
5087
5088 float32x4_t ty0123_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsY0123_f_32x4, vcvtq_f32_u32(inputPositionsTop0123_u_32x4)), constant128_f_32x4);
5089 float32x4_t ty4567_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsY4567_f_32x4, vcvtq_f32_u32(inputPositionsTop4567_u_32x4)), constant128_f_32x4);
5090
5091 const uint32x4_t tx0123_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(tx0123_f_32x4, vdupq_n_f32(0.5)));
5092 const uint32x4_t tx4567_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(tx4567_f_32x4, vdupq_n_f32(0.5)));
5093
5094 const uint32x4_t ty0123_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(ty0123_f_32x4, vdupq_n_f32(0.5)));
5095 const uint32x4_t ty4567_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(ty4567_f_32x4, vdupq_n_f32(0.5)));
5096
5097 const uint16x8_t tx01234567_128_u_16x8 = vcombine_u16(vmovn_u32(tx0123_128_u_32x4), vmovn_u32(tx4567_128_u_32x4));
5098 const uint16x8_t ty01234567_128_u_16x8 = vcombine_u16(vmovn_u32(ty0123_128_u_32x4), vmovn_u32(ty4567_128_u_32x4));
5099
5100 const uint8x16_t tx_ty_128_u_8x16 = vcombine_u8(vmovn_u16(tx01234567_128_u_16x8), vmovn_u16(ty01234567_128_u_16x8));
5101
5102
5103 vst1q_u8(pixels + 0, constantBorderColor_u_8x16); // initialize with border color
5104 vst1q_u8(pixels + 16, constantBorderColor_u_8x16);
5105
5106 struct LeftRightPixel
5107 {
5108 uint8_t left;
5109 uint8_t right;
5110 };
5111
5112 static_assert(sizeof(LeftRightPixel) == 2, "Invalid data type!");
5113
5114 // we gather the individual source pixel values from the source image,
5115 // based on the calculated pixel locations
5116 for (unsigned int i = 0u; i < 8u; ++i)
5117 {
5118 if (validPixels[i])
5119 {
5120 ocean_assert((topLeftOffsetsElements[i] % inputStrideElements) < inputWidth - 1u); // we need to have one additional pixel to the right (as we copy two pixels at once)
5121 ocean_assert((bottomLeftOffsetsElements[i] % inputStrideElements) < inputWidth - 1u);
5122
5123 ((LeftRightPixel*)pixels)[0u + i] = *(LeftRightPixel*)(input + topLeftOffsetsElements[i]);
5124 ((LeftRightPixel*)pixels)[8u + i] = *(LeftRightPixel*)(input + bottomLeftOffsetsElements[i]);
5125 }
5126 }
5127
5128 const uint8x8x2_t topLeft_topRight_u_8x8x2 = vld2_u8(pixels);
5129 const uint8x8x2_t bottomLeft_bottomRight_u_8x8x2 = vld2_u8(pixels + 16);
5130
5131 interpolate8Pixels1Channel8BitNEON(topLeft_topRight_u_8x8x2.val[0], topLeft_topRight_u_8x8x2.val[1], bottomLeft_bottomRight_u_8x8x2.val[0], bottomLeft_bottomRight_u_8x8x2.val[1], tx_ty_128_u_8x16, outputPixelData);
5132
5133 outputPixelData += 8;
5134 }
5135 }
5136}
5137
5138template <unsigned int tChannels>
5139void FrameInterpolatorBilinear::lookup8BitPerChannelSubsetNEON(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, const uint8_t* borderColor, uint8_t* output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows, const bool useOptimizedNEON, const bool useOptimizedBilinearValuesAndFactorCalculation, const bool useOptimizedNEONFactorReplication)
5140{
5141 ocean_assert(input_LT_output != nullptr);
5142 ocean_assert(input != nullptr && output != nullptr);
5143
5144 ocean_assert(inputWidth != 0u && inputHeight != 0u);
5145 ocean_assert(firstRow + numberRows <= input_LT_output->sizeY());
5146
5147 using PixelType = typename DataType<uint8_t, tChannels>::Type;
5148
5149 const uint8_t zeroColor[tChannels] = {uint8_t(0)};
5150 const PixelType* const bColor = borderColor ? (PixelType*)borderColor : (PixelType*)zeroColor;
5151
5152 const unsigned int outputWidth = (unsigned int)(input_LT_output->sizeX());
5153 ocean_assert(outputWidth >= 4u);
5154
5155 static_assert(std::is_same<Vector2, LookupTable::Type>::value, "Invalid data type!");
5156
5157 const unsigned int inputStrideElements = inputWidth * tChannels + inputPaddingElements;
5158 const unsigned int outputStrideElements = outputWidth * tChannels + outputPaddingElements;
5159
5160 Memory rowLookupMemory = Memory::create<VectorF2>(outputWidth);
5161 VectorF2* const rowLookupData = rowLookupMemory.data<VectorF2>();
5162
5163 const float32x4_t constantZero_f_32x4 = vdupq_n_f32(0.0f); // [0.0f, 0.0f, 0.0f, 0.0f]
5164 const float32x4_t constantFour_f_32x4 = vdupq_n_f32(4.0f); // [4.0f, 4.0f, 4.0f, 4.0f]
5165
5166 // [0.0f, 1.0f, 2.0f, 3.0f]
5167 const float f_0123[4] = {0.0f, 1.0f, 2.0f, 3.0f};
5168 float32x4_t conststant0123_f_32x4 = vld1q_f32(f_0123);
5169
5170 const uint32x4_t constantOne_u_32x4 = vdupq_n_u32(1u);
5171
5172 const uint32x4_t constantChannels_u_32x4 = vdupq_n_u32(tChannels);
5173
5174 const float32x4_t constantInputWidth1_f_32x4 = vdupq_n_f32(float(inputWidth - 1u));
5175 const float32x4_t constantInputHeight1_f_32x4 = vdupq_n_f32(float(inputHeight - 1u));
5176
5177#if defined(__aarch64__)
5178 const float32x4_t constant128_f_32x4 = vdupq_n_f32(128.0f);
5179#endif
5180
5181 const uint32x4_t constantInputStrideElements_u_32x4 = vdupq_n_u32(inputStrideElements);
5182 const uint32x4_t constantInputWidth1_u_32x4 = vdupq_n_u32(inputWidth - 1u);
5183 const uint32x4_t constantInputHeight1_u_32x4 = vdupq_n_u32(inputHeight - 1u);
5184
5185 unsigned int validPixels[4];
5186
5187 unsigned int topLeftOffsetsElements[4];
5188 unsigned int topRightOffsetsElements[4];
5189 unsigned int bottomLeftOffsetsElements[4];
5190 unsigned int bottomRightOffsetsElements[4];
5191
5192 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
5193 {
5194 PixelType* outputPixelData = (PixelType*)(output + y * outputStrideElements);
5195
5196 input_LT_output->bilinearValues<VectorF2>(y, rowLookupData, useOptimizedBilinearValuesAndFactorCalculation);
5197
5198 float32x4_t additionalInputOffsetX_f_32x4 = conststant0123_f_32x4;
5199 const float32x4_t additionalInputOffsetY_f_32x4 = vdupq_n_f32(float(y));
5200
5201 for (unsigned int x = 0u; x < outputWidth; x += 4u)
5202 {
5203 if (x + 4u > outputWidth)
5204 {
5205 // the last iteration will not fit into the output frame,
5206 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
5207
5208 ocean_assert(x >= 4u && outputWidth > 4u);
5209 const unsigned int newX = outputWidth - 4u;
5210
5211 ocean_assert(x > newX);
5212 const unsigned int xOffset = x - newX;
5213
5214 outputPixelData -= xOffset;
5215
5216 if (offset)
5217 {
5218 additionalInputOffsetX_f_32x4 = vsubq_f32(additionalInputOffsetX_f_32x4, vdupq_n_f32(float(xOffset)));
5219 }
5220
5221 x = newX;
5222
5223 // the for loop will stop after this iteration
5224 ocean_assert(!(x + 4u < outputWidth));
5225 }
5226
5227 const float32x4x2_t inputPositions_f_32x4x2 = vld2q_f32((const float*)(rowLookupData + x));
5228
5229 float32x4_t inputPositionsX_f_32x4 = inputPositions_f_32x4x2.val[0];
5230 float32x4_t inputPositionsY_f_32x4 = inputPositions_f_32x4x2.val[1];
5231
5232 if (offset)
5233 {
5234 inputPositionsX_f_32x4 = vaddq_f32(inputPositionsX_f_32x4, additionalInputOffsetX_f_32x4);
5235 inputPositionsY_f_32x4 = vaddq_f32(inputPositionsY_f_32x4, additionalInputOffsetY_f_32x4);
5236
5237 additionalInputOffsetX_f_32x4 = vaddq_f32(additionalInputOffsetX_f_32x4, constantFour_f_32x4);
5238 }
5239
5240 // now we check whether we are inside the input frame
5241 const uint32x4_t validPixelsX_u_32x4 = vandq_u32(vcleq_f32(inputPositionsX_f_32x4, constantInputWidth1_f_32x4), vcgeq_f32(inputPositionsX_f_32x4, constantZero_f_32x4)); // inputPosition.x() >= 0 && inputPosition.x() <= (inputWidth - 1) ? 0xFFFFFF : 0x000000
5242 const uint32x4_t validPixelsY_u_32x4 = vandq_u32(vcleq_f32(inputPositionsY_f_32x4, constantInputHeight1_f_32x4), vcgeq_f32(inputPositionsY_f_32x4, constantZero_f_32x4)); // inputPosition.y() >= 0 && inputPosition.y() <= (inputHeight - 1) ? 0xFFFFFF : 0x000000
5243
5244 const uint32x4_t validPixels_u_32x4 = vandq_u32(validPixelsX_u_32x4, validPixelsY_u_32x4); // is_inside_input_frame(inputPosition) ? 0xFFFFFF : 0x000000
5245
5246 vst1q_u32(validPixels, validPixels_u_32x4);
5247
5248 const uint32x4_t inputPositionsLeft_u_32x4 = vcvtq_u32_f32(inputPositionsX_f_32x4);
5249 const uint32x4_t inputPositionsTop_u_32x4 = vcvtq_u32_f32(inputPositionsY_f_32x4);
5250
5251 const uint32x4_t inputPositionsRight_u_32x4 = vminq_u32(vaddq_u32(inputPositionsLeft_u_32x4, constantOne_u_32x4), constantInputWidth1_u_32x4);
5252 const uint32x4_t inputPositionsBottom_u_32x4 = vminq_u32(vaddq_u32(inputPositionsTop_u_32x4, constantOne_u_32x4), constantInputHeight1_u_32x4);
5253
5254 const uint32x4_t topLeftOffsetsElements_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft_u_32x4, constantChannels_u_32x4), inputPositionsTop_u_32x4, constantInputStrideElements_u_32x4);
5255 const uint32x4_t topRightOffsetsElements_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsRight_u_32x4, constantChannels_u_32x4), inputPositionsTop_u_32x4, constantInputStrideElements_u_32x4);
5256 const uint32x4_t bottomLeftOffsetsElements_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsLeft_u_32x4, constantChannels_u_32x4), inputPositionsBottom_u_32x4, constantInputStrideElements_u_32x4);
5257 const uint32x4_t bottomRightOffsetsElements_u_32x4 = vmlaq_u32(vmulq_u32(inputPositionsRight_u_32x4, constantChannels_u_32x4), inputPositionsBottom_u_32x4, constantInputStrideElements_u_32x4);
5258
5259 // we determine the fractional portions of the x' and y':
5260 uint32x4_t tx_128_u_32x4;
5261 uint32x4_t ty_128_u_32x4;
5262
5263#if defined(__aarch64__)
5264 // vrndmq_f32 / vcvtaq_u32_f32 are AArch64-only intrinsics; ARMv7 falls through to the original sequence.
5265 if (useOptimizedBilinearValuesAndFactorCalculation)
5266 {
5267 // Compute fractional part independently of left/top, allowing parallel execution with bounds
5268 const float32x4_t tx_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsX_f_32x4, vrndmq_f32(inputPositionsX_f_32x4)), constant128_f_32x4);
5269 const float32x4_t ty_f_32x4 = vmulq_f32(vsubq_f32(inputPositionsY_f_32x4, vrndmq_f32(inputPositionsY_f_32x4)), constant128_f_32x4);
5270
5271 tx_128_u_32x4 = vcvtaq_u32_f32(tx_f_32x4);
5272 ty_128_u_32x4 = vcvtaq_u32_f32(ty_f_32x4);
5273 }
5274 else
5275#endif
5276 {
5277 // Original: subtract, scale, round
5278 float32x4_t tx_f_32x4 = vsubq_f32(inputPositionsX_f_32x4, vcvtq_f32_u32(inputPositionsLeft_u_32x4));
5279 float32x4_t ty_f_32x4 = vsubq_f32(inputPositionsY_f_32x4, vcvtq_f32_u32(inputPositionsTop_u_32x4));
5280
5281 // we use integer interpolation [0.0, 1.0] -> [0, 128]
5282 tx_f_32x4 = vmulq_f32(tx_f_32x4, vdupq_n_f32(128.0f));
5283 ty_f_32x4 = vmulq_f32(ty_f_32x4, vdupq_n_f32(128.0f));
5284
5285 tx_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(tx_f_32x4, vdupq_n_f32(0.5)));
5286 ty_128_u_32x4 = vcvtq_u32_f32(vaddq_f32(ty_f_32x4, vdupq_n_f32(0.5)));
5287 }
5288
5289 if constexpr (tChannels == 4u)
5290 {
5291 if (useOptimizedNEON)
5292 {
5293 // Optimized 4-channel path: inline pixel gather + widening-multiply interpolation.
5294 // Eliminates NEON->stack->scalar->stack->NEON roundtrip by extracting offsets
5295 // directly from NEON registers via vgetq_lane_u32 (AArch64 umov).
5296
5297 PixelType topLeftPixels[4];
5298 PixelType topRightPixels[4];
5299 PixelType bottomLeftPixels[4];
5300 PixelType bottomRightPixels[4];
5301
5302 topLeftPixels[0] = validPixels[0] ? *((const PixelType*)(input + vgetq_lane_u32(topLeftOffsetsElements_u_32x4, 0))) : *bColor;
5303 topLeftPixels[1] = validPixels[1] ? *((const PixelType*)(input + vgetq_lane_u32(topLeftOffsetsElements_u_32x4, 1))) : *bColor;
5304 topLeftPixels[2] = validPixels[2] ? *((const PixelType*)(input + vgetq_lane_u32(topLeftOffsetsElements_u_32x4, 2))) : *bColor;
5305 topLeftPixels[3] = validPixels[3] ? *((const PixelType*)(input + vgetq_lane_u32(topLeftOffsetsElements_u_32x4, 3))) : *bColor;
5306
5307 topRightPixels[0] = validPixels[0] ? *((const PixelType*)(input + vgetq_lane_u32(topRightOffsetsElements_u_32x4, 0))) : *bColor;
5308 topRightPixels[1] = validPixels[1] ? *((const PixelType*)(input + vgetq_lane_u32(topRightOffsetsElements_u_32x4, 1))) : *bColor;
5309 topRightPixels[2] = validPixels[2] ? *((const PixelType*)(input + vgetq_lane_u32(topRightOffsetsElements_u_32x4, 2))) : *bColor;
5310 topRightPixels[3] = validPixels[3] ? *((const PixelType*)(input + vgetq_lane_u32(topRightOffsetsElements_u_32x4, 3))) : *bColor;
5311
5312 bottomLeftPixels[0] = validPixels[0] ? *((const PixelType*)(input + vgetq_lane_u32(bottomLeftOffsetsElements_u_32x4, 0))) : *bColor;
5313 bottomLeftPixels[1] = validPixels[1] ? *((const PixelType*)(input + vgetq_lane_u32(bottomLeftOffsetsElements_u_32x4, 1))) : *bColor;
5314 bottomLeftPixels[2] = validPixels[2] ? *((const PixelType*)(input + vgetq_lane_u32(bottomLeftOffsetsElements_u_32x4, 2))) : *bColor;
5315 bottomLeftPixels[3] = validPixels[3] ? *((const PixelType*)(input + vgetq_lane_u32(bottomLeftOffsetsElements_u_32x4, 3))) : *bColor;
5316
5317 bottomRightPixels[0] = validPixels[0] ? *((const PixelType*)(input + vgetq_lane_u32(bottomRightOffsetsElements_u_32x4, 0))) : *bColor;
5318 bottomRightPixels[1] = validPixels[1] ? *((const PixelType*)(input + vgetq_lane_u32(bottomRightOffsetsElements_u_32x4, 1))) : *bColor;
5319 bottomRightPixels[2] = validPixels[2] ? *((const PixelType*)(input + vgetq_lane_u32(bottomRightOffsetsElements_u_32x4, 2))) : *bColor;
5320 bottomRightPixels[3] = validPixels[3] ? *((const PixelType*)(input + vgetq_lane_u32(bottomRightOffsetsElements_u_32x4, 3))) : *bColor;
5321
5322 const uint8x16_t topLeftPixels_u8x16 = vld1q_u8((const uint8_t*)topLeftPixels);
5323 const uint8x16_t topRightPixels_u8x16 = vld1q_u8((const uint8_t*)topRightPixels);
5324 const uint8x16_t bottomLeftPixels_u8x16 = vld1q_u8((const uint8_t*)bottomLeftPixels);
5325 const uint8x16_t bottomRightPixels_u8x16 = vld1q_u8((const uint8_t*)bottomRightPixels);
5326
5327 interpolate4Pixels4Channel8BitPerChannelNEON(topLeftPixels_u8x16, topRightPixels_u8x16, bottomLeftPixels_u8x16, bottomRightPixels_u8x16, tx_128_u_32x4, ty_128_u_32x4, outputPixelData, useOptimizedNEONFactorReplication);
5328 }
5329 else
5330 {
5331 vst1q_u32(topLeftOffsetsElements, topLeftOffsetsElements_u_32x4);
5332 vst1q_u32(topRightOffsetsElements, topRightOffsetsElements_u_32x4);
5333 vst1q_u32(bottomLeftOffsetsElements, bottomLeftOffsetsElements_u_32x4);
5334 vst1q_u32(bottomRightOffsetsElements, bottomRightOffsetsElements_u_32x4);
5335
5336 interpolate4Pixels8BitPerChannelNEON<tChannels>(input, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, tx_128_u_32x4, ty_128_u_32x4, outputPixelData);
5337 }
5338 }
5339 else
5340 {
5341 vst1q_u32(topLeftOffsetsElements, topLeftOffsetsElements_u_32x4);
5342 vst1q_u32(topRightOffsetsElements, topRightOffsetsElements_u_32x4);
5343 vst1q_u32(bottomLeftOffsetsElements, bottomLeftOffsetsElements_u_32x4);
5344 vst1q_u32(bottomRightOffsetsElements, bottomRightOffsetsElements_u_32x4);
5345
5346 interpolate4Pixels8BitPerChannelNEON<tChannels>(input, topLeftOffsetsElements, topRightOffsetsElements, bottomLeftOffsetsElements, bottomRightOffsetsElements, validPixels, *bColor, tx_128_u_32x4, ty_128_u_32x4, outputPixelData);
5347 }
5348
5349 outputPixelData += 4;
5350 }
5351 }
5352}
5353
5354#endif // defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5355
5356template <unsigned int tChannels>
5357void FrameInterpolatorBilinear::lookupMask8BitPerChannelSubset(const uint8_t* input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable* input_LT_output, const bool offset, uint8_t* output, uint8_t* outputMask, const uint8_t maskValue, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
5358{
5359 ocean_assert(input_LT_output != nullptr);
5360 ocean_assert(input != nullptr && output != nullptr);
5361
5362 ocean_assert(inputWidth != 0u && inputHeight != 0u);
5363 ocean_assert(firstRow + numberRows <= input_LT_output->sizeY());
5364
5365 using PixelType = typename DataType<uint8_t, tChannels>::Type;
5366
5367 const unsigned int columns = (unsigned int)(input_LT_output->sizeX());
5368
5369 const unsigned int outputStrideElements = tChannels * columns + outputPaddingElements;
5370 const unsigned int outputMaskStrideElements = columns + outputMaskPaddingElements;
5371
5372 static_assert(std::is_same<Vector2, LookupTable::Type>::value, "Invalid data type!");
5373
5374 const Scalar inputWidth1 = Scalar(inputWidth - 1u);
5375 const Scalar inputHeight1 = Scalar(inputHeight - 1u);
5376
5377 Memory rowLookupMemory = Memory::create<Vector2>(columns);
5378 Vector2* const rowLookupData = rowLookupMemory.data<Vector2>();
5379
5380 for (unsigned int y = firstRow; y < firstRow + numberRows; ++y)
5381 {
5382 input_LT_output->bilinearValues(y, rowLookupData);
5383
5384 PixelType* outputData = (PixelType*)(output + y * outputStrideElements);
5385 uint8_t* outputMaskData = outputMask + y * outputMaskStrideElements;
5386
5387 for (unsigned int x = 0u; x < columns; ++x)
5388 {
5389 const Vector2& lookupValue = rowLookupData[x];
5390
5391 const Vector2 inputPosition = offset ? Vector2(Scalar(x) + lookupValue.x(), Scalar(y) + lookupValue.y()) : lookupValue;
5392
5393 if (inputPosition.x() >= 0 && inputPosition.y() >= 0 && inputPosition.x() <= inputWidth1 && inputPosition.y() <= inputHeight1)
5394 {
5395 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(input, inputWidth, inputHeight, inputPaddingElements, inputPosition, (uint8_t*)(outputData));
5396 *outputMaskData = maskValue;
5397 }
5398 else
5399 {
5400 *outputMaskData = 0xFFu - maskValue;
5401 }
5402
5403 outputData++;
5404 outputMaskData++;
5405 }
5406 }
5407}
5408
5409template <unsigned int tChannels>
5410void FrameInterpolatorBilinear::scale8BitPerChannel(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker* worker)
5411{
5412 ocean_assert(source != nullptr && target != nullptr);
5413 ocean_assert(sourceWidth >= 1u && sourceHeight >= 1u);
5414 ocean_assert(targetWidth >= 1u && targetHeight >= 1u);
5415 ocean_assert(sourceX_s_targetX > 0.0);
5416 ocean_assert(sourceY_s_targetY > 0.0);
5417
5418 if (sourceWidth == targetWidth && sourceHeight == targetHeight)
5419 {
5420 FrameConverter::subFrame<uint8_t>(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, tChannels, 0u, 0u, 0u, 0u, sourceWidth, sourceHeight, sourcePaddingElements, targetPaddingElements);
5421 return;
5422 }
5423
5424 if (worker && sourceWidth * tChannels >= 16u && targetWidth >= 8u)
5425 {
5426#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5427 if (sourceWidth <= 65535u && sourceHeight <= 65535u && targetWidth <= 65535u && targetHeight <= 65535u)
5428 {
5429 worker->executeFunction(Worker::Function::createStatic(&scale8BitPerChannelSubset7BitPrecisionNEON, source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, tChannels, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, targetHeight);
5430 return;
5431 }
5432#else
5433 worker->executeFunction(Worker::Function::createStatic(&scale8BitPerChannelSubset<tChannels>, source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, 0u), 0u, targetHeight);
5434#endif
5435 }
5436 else
5437 {
5438 if (sourceWidth * tChannels >= 16u && targetWidth >= 8u)
5439 {
5440#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5441 if (sourceWidth <= 65535u && sourceHeight <= 65535u && targetWidth <= 65535u && targetHeight <= 65535u)
5442 {
5443 scale8BitPerChannelSubset7BitPrecisionNEON(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, tChannels, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, targetHeight);
5444 return;
5445 }
5446#endif
5447 }
5448
5449 scale8BitPerChannelSubset<tChannels>(source, target, sourceWidth, sourceHeight, targetWidth, targetHeight, sourceX_s_targetX, sourceY_s_targetY, sourcePaddingElements, targetPaddingElements, 0u, targetHeight);
5450 }
5451}
5452
5453template <unsigned int tChannels>
5454void FrameInterpolatorBilinear::scale8BitPerChannelSubset(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
5455{
5456 ocean_assert(source != nullptr && target != nullptr);
5457 ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
5458 ocean_assert_and_suppress_unused(targetWidth >= 1u && targetHeight >= 1u, targetHeight);
5459 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
5460
5461 const Scalar sourceX_T_targetX = Scalar(sourceX_s_targetX);
5462 const Scalar sourceY_T_targetY = Scalar(sourceY_s_targetY);
5463
5464 /*
5465 * We determine the sub-pixel accurate source location for each target pixel as follows:
5466 *
5467 * Example with a downsampling by factor 4:
5468 * sourceRow with 12 pixels: | 0 1 2 3 4 5 6 7 8 9 A B |
5469 * targetRow with 3 pixels: | 0 1 2 |
5470 *
5471 * Thus, the source row can be separated into three blocks;
5472 * and we want to extract the color information from the center of the blocks:
5473 * sourceRow with 12 pixels: | 0 1 2 3 | 4 5 6 7 | 8 9 A B |
5474 * targetRow with 3 pixels: | 0 | 1 | 2 | (sourceX_s_targetX = 4)
5475 *
5476 * Thus, we add 0.5 to each target coordinate before converting it to a source location;
5477 * and subtract 0.5 again afterwards:
5478 * sourceX = (targetX + 0.5) * sourceX_s_targetX - 0.5
5479 *
5480 * e.g., (0 + 0.5) * 4 - 0.5 = 1.5
5481 * (1 + 0.5) * 4 - 0.5 = 5.5
5482 *
5483 *
5484 * Example with a downsampling by factor 3:
5485 * sourceRow with 9 pixels: | 0 1 2 3 4 5 6 7 8 |
5486 * targetRow with 3 pixels: | 0 1 2 |
5487 *
5488 * sourceRow with 9 pixels: | 0 1 2 | 3 4 5 | 6 7 8 |
5489 * targetRow with 3 pixels: | 0 | 1 | 2 | (sourceX_s_targetX = 3)
5490 *
5491 * e.g., (0 + 0.5) * 3 - 0.5 = 1
5492 * (1 + 0.5) * 3 - 0.5 = 4
5493 *
5494 *
5495 * Example with a downsampling by factor 2:
5496 * sourceRow with 6 pixels: | 0 1 2 3 4 5 |
5497 * targetRow with 3 pixels: | 0 1 2 |
5498 *
5499 * sourceRow with 6 pixels: | 0 1 | 2 3 | 4 5 |
5500 * targetRow with 3 pixels: | 0 | 1 | 2 | (sourceX_s_targetX = 2)
5501 *
5502 * e.g., (0 + 0.5) * 2 - 0.5 = 0.5
5503 * (1 + 0.5) * 2 - 0.5 = 2.5
5504 *
5505 *
5506 * we can simplify the calculation (as we have a constant term):
5507 * sourceX = (sourceX_s_targetX * targetX) + (sourceX_s_targetX * 0.5 - 0.5)
5508 */
5509
5510 const unsigned int sourceStrideElements = sourceWidth * tChannels + sourcePaddingElements;
5511
5512 const Scalar sourceX_T_targetXOffset = sourceX_T_targetX * Scalar(0.5) - Scalar(0.5);
5513 const Scalar sourceY_T_targetYOffset = sourceY_T_targetY * Scalar(0.5) - Scalar(0.5);
5514
5515 const Scalar sourceWidth_1 = Scalar(sourceWidth - 1u);
5516 const Scalar sourceHeight_1 = Scalar(sourceHeight - 1u);
5517
5518 target += (targetWidth * tChannels + targetPaddingElements) * firstTargetRow;
5519
5520 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
5521 {
5522 const Scalar sy = minmax(Scalar(0), sourceY_T_targetYOffset + sourceY_T_targetY * Scalar(y), sourceHeight_1);
5523 ocean_assert(sy >= Scalar(0) && sy < Scalar(sourceHeight));
5524
5525 const unsigned int sTop = (unsigned int)sy;
5526 ocean_assert(sy >= Scalar(sTop));
5527
5528 const Scalar ty = sy - Scalar(sTop);
5529 ocean_assert(ty >= 0 && ty <= 1);
5530
5531 const unsigned int factorBottom = (unsigned int)(ty * Scalar(128) + Scalar(0.5));
5532 const unsigned int factorTop = 128u - factorBottom;
5533
5534 const uint8_t* const sourceTop = source + sourceStrideElements * sTop;
5535 const uint8_t* const sourceBottom = (sTop + 1u < sourceHeight) ? sourceTop + sourceStrideElements : sourceTop;
5536
5537 for (unsigned int x = 0; x < targetWidth; ++x)
5538 {
5539 const Scalar sx = minmax(Scalar(0), sourceX_T_targetXOffset + sourceX_T_targetX * Scalar(x), sourceWidth_1);
5540 ocean_assert(sx >= Scalar(0) && sx < Scalar(sourceWidth));
5541
5542 const unsigned int sLeft = (unsigned int)sx;
5543 ocean_assert(sx >= Scalar(sLeft));
5544
5545 const Scalar tx = sx - Scalar(sLeft);
5546 ocean_assert(tx >= 0 && tx <= 1);
5547
5548 const unsigned int factorRight = (unsigned int)(tx * Scalar(128) + Scalar(0.5));
5549 const unsigned int factorLeft = 128u - factorRight;
5550
5551 const unsigned int sourceRightOffset = sLeft + 1u < sourceWidth ? tChannels : 0u;
5552
5553 const uint8_t* const sourceTopLeft = sourceTop + sLeft * tChannels;
5554 const uint8_t* const sourceBottomLeft = sourceBottom + sLeft * tChannels;
5555
5556 const unsigned int factorTopLeft = factorTop * factorLeft;
5557 const unsigned int factorTopRight = factorTop * factorRight;
5558 const unsigned int factorBottomLeft = factorBottom * factorLeft;
5559 const unsigned int factorBottomRight = factorBottom * factorRight;
5560
5561 for (unsigned int n = 0u; n < tChannels; ++n)
5562 {
5563 target[n] = (uint8_t)((sourceTopLeft[n] * factorTopLeft + sourceTopLeft[sourceRightOffset + n] * factorTopRight
5564 + sourceBottomLeft[n] * factorBottomLeft + sourceBottomLeft[sourceRightOffset + n] * factorBottomRight + 8192u) >> 14u);
5565 }
5566
5567 target += tChannels;
5568 }
5569
5570 target += targetPaddingElements;
5571 }
5572}
5573
5574template <typename T>
5575void FrameInterpolatorBilinear::interpolateRowVertical(const T* sourceRowTop, const T* sourceRowBottom, T* targetRow, const unsigned int elements, const float factorBottom)
5576{
5577 ocean_assert(sourceRowTop != nullptr);
5578 ocean_assert(sourceRowBottom != nullptr);
5579 ocean_assert(targetRow != nullptr);
5580 ocean_assert(elements >= 1u);
5581 ocean_assert(factorBottom >= 0.0f && factorBottom <= 1.0f);
5582
5583 using FloatType = typename FloatTyper<T>::Type;
5584
5585 const FloatType internalFactorBottom = FloatType(factorBottom);
5586 const FloatType internalFactorTop = FloatType(1.0f - factorBottom);
5587
5588 for (unsigned int n = 0u; n < elements; ++n)
5589 {
5590 targetRow[n] = T(FloatType(sourceRowTop[n]) * internalFactorTop + FloatType(sourceRowBottom[n]) * internalFactorBottom);
5591 }
5592}
5593
5594template <typename T, unsigned int tChannels>
5595void FrameInterpolatorBilinear::interpolateRowHorizontal(const T* extendedSourceRow, T* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const float* interpolationFactorsRight)
5596{
5597 static_assert(tChannels != 0u, "Invalid channel number!");
5598
5599 ocean_assert(extendedSourceRow != nullptr);
5600 ocean_assert(targetRow != nullptr);
5601 ocean_assert(targetWidth >= 1u);
5602 ocean_assert(interpolationLocations != nullptr);
5603 ocean_assert(interpolationFactorsRight != nullptr);
5604 ocean_assert_and_suppress_unused(channels == tChannels, channels);
5605
5606 using FloatType = typename FloatTyper<T>::Type;
5607
5608 for (unsigned int x = 0u; x < targetWidth; ++x)
5609 {
5610 const FloatType internalFactorRight = FloatType(interpolationFactorsRight[x]);
5611 ocean_assert(internalFactorRight >= FloatType(0) && internalFactorRight <= FloatType(1));
5612
5613 const FloatType internalFactorLeft = FloatType(1.0f - interpolationFactorsRight[x]);
5614
5615 const unsigned int& leftLocation = interpolationLocations[x];
5616 const unsigned int rightLocation = leftLocation + tChannels; // location is defined in relation to elements, not to pixels
5617
5618 for (unsigned int n = 0u; n < tChannels; ++n)
5619 {
5620 targetRow[x * tChannels + n] = T(FloatType(extendedSourceRow[leftLocation + n]) * internalFactorLeft + FloatType(extendedSourceRow[rightLocation + n]) * internalFactorRight);
5621 }
5622 }
5623}
5624
5625#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
5626
5627#ifdef OCEAN_WE_KEEP_THIS_IMPLEMENTATION_AS_WE_NEED_THIS_TO_FOR_A_NEW_NEON_IMPLEMENTATION
5628
5629template <>
5630inline void FrameInterpolatorBilinear::scale8BitPerChannelSubset7BitPrecisionNEON<2u, 8u>(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
5631{
5632 ocean_assert(source != nullptr && target != nullptr);
5633 ocean_assert(sourceWidth >= 2u && sourceWidth <= 65535u);
5634 ocean_assert(sourceHeight >= 1u && sourceHeight <= 65535u);
5635 ocean_assert(targetWidth >= 8u && targetWidth <= 65535u);
5636 ocean_assert(targetHeight >= 1u && targetHeight <= 65535u);
5637 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
5638
5639 ocean_assert(sourcePaddingElements == 0u); // not supported
5640 ocean_assert(targetPaddingElements == 0u);
5641
5642 using PixelType = typename DataType<uint8_t, 2u>::Type;
5643
5644 PixelType* targetPixelData = (PixelType*)target + firstTargetRow * targetWidth;
5645 const PixelType* const sourcePixelData = (const PixelType*)source;
5646
5647 // our offset values for the eight left pixels in relation to the first pixel of the row
5648 unsigned int leftOffsets[8];
5649
5650 // this function uses fixed point numbers with 16 bit for the calculation of the interpolation positions and factors:
5651 // fixedPointLocation = floatLocation * 2^16
5652 //
5653 // [FEDCBA98, 76543210]
5654 // [pixel , subpixel]
5655 //
5656 // fixedPointLocation = pixel + subpixel / 2^16
5657 //
5658 // Thus, the upper 16 bit represent the location of e.g., the left pixel (for the linear interpolation)
5659 // while the lower 16 bit represent one of both interpolation factors (and 2^16 - subpixel represents the second interpolation factor)
5660
5661 const unsigned int sourceX_T_targetX_fixed16 = (unsigned int)(double(0x10000u) * sourceX_s_targetX + 0.5);
5662 const unsigned int sourceY_T_targetY_fixed16 = (unsigned int)(double(0x10000u) * sourceY_s_targetY + 0.5);
5663
5664 const int targetOffsetX_fixed16 = (int)(double(0x10000u) * ((sourceX_s_targetX * 0.5) - 0.5) + 0.5);
5665 const int targetOffsetY_fixed16 = (int)(double(0x10000u) * ((sourceY_s_targetY * 0.5) - 0.5) + 0.5);
5666
5667 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
5668 const uint32x4_t m128_u_sourceX_T_targetX_fixed16 = vdupq_n_u32(sourceX_T_targetX_fixed16);
5669
5670 // we store 4 integers: [targetOffsetX_fixed16, targetOffsetX_fixed16, targetOffsetX_fixed16, targetOffsetX_fixed16]
5671 const int32x4_t m128_s_targetOffsetX_fixed16 = vdupq_n_s32(targetOffsetX_fixed16);
5672
5673 // we store 4 integers: [sourceWidth - 2, sourceWidth - 2, sourceWidth - 2, sourceWidth - 2]
5674 const uint32x4_t m128_u_sourceWidth_2 = vdupq_n_u32(sourceWidth - 2u);
5675
5676 // we store 4 integers: [0, 0, 0, 0]
5677 const int32x4_t m128_s_zero = vdupq_n_s32(0);
5678
5679 const unsigned int u_0123[4] = {0u, 1u, 2u, 3u};
5680 const uint32x4_t m128_u_0123 = vld1q_u32(u_0123);
5681
5682 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
5683 {
5684 const unsigned int sourceY_fixed16 = minmax<int>(0, targetOffsetY_fixed16 + int(sourceY_T_targetY_fixed16 * y), (sourceHeight - 1u) << 16u);
5685
5686 const unsigned int sourceRowTop = sourceY_fixed16 >> 16u; // we must not round here
5687 const unsigned int factorBottom_fixed16 = sourceY_fixed16 & 0x0000FFFFu;
5688 const unsigned int factorBottom = factorBottom_fixed16 >> 9u;
5689
5690 const uint8x8_t m64_u_factorsBottom = vdup_n_u8(factorBottom);
5691 // factorTop = 128 - factorBottom
5692 const uint8x8_t m64_u_factorsTop = vdup_n_u8(128u - factorBottom);
5693
5694 const unsigned int sourceRowBottom = min(sourceRowTop + 1u, sourceHeight - 1u);
5695
5696 const PixelType* const sourceTopRowPixelData = sourcePixelData + sourceRowTop * sourceWidth;
5697 const PixelType* const sourceBottomRowPixelData = sourcePixelData + sourceRowBottom * sourceWidth;
5698
5699 for (unsigned int x = 0; x < targetWidth; x += 8u)
5700 {
5701 if (x + 8u > targetWidth)
5702 {
5703 // the last iteration will not fit into the output frame,
5704 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
5705
5706 ocean_assert(x >= 8u && targetWidth > 8u);
5707 const unsigned int newX = targetWidth - 8u;
5708
5709 ocean_assert(x > newX);
5710 targetPixelData -= x - newX;
5711
5712 x = newX;
5713
5714 // the for loop will stop after this iteration
5715 ocean_assert(!(x + 8u < targetWidth));
5716 }
5717
5718
5719 // we need four successive x coordinate floats:
5720 // [x + 3, x + 2, x + 1; x + 0]
5721 const uint32x4_t m128_u_x_0123 = vaddq_u32(vdupq_n_u32(x), m128_u_0123);
5722 const uint32x4_t m128_u_x_4567 = vaddq_u32(vdupq_n_u32(x + 4u), m128_u_0123);
5723
5724 // we calculate the four source locations for our four target locations
5725 const int32x4_t m128_s_sourceX_0123_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_0123))));
5726 const uint32x4_t m128_u_sourceX_0123_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_0123_fixed16);
5727
5728 const int32x4_t m128_s_sourceX_4567_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_4567))));
5729 const uint32x4_t m128_u_sourceX_4567_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_4567_fixed16);
5730
5731 // now we determine the pixel/integer accurate source locations
5732 // m128_u_left = min(floor(m128_f_sourceX), sourceWidth - 2)
5733 const uint32x4_t m128_u_left_0123 = vminq_u32(vshrq_n_u32(m128_u_sourceX_0123_fixed16, 16), m128_u_sourceWidth_2); // not vrshrq_n_u32 as we must not round here
5734 const uint32x4_t m128_u_left_4567 = vminq_u32(vshrq_n_u32(m128_u_sourceX_4567_fixed16, 16), m128_u_sourceWidth_2);
5735
5736 // we store the offsets we have calculated
5737 vst1q_u32(leftOffsets + 0, m128_u_left_0123);
5738 vst1q_u32(leftOffsets + 4, m128_u_left_4567);
5739
5740
5741
5742 // we load the individual pixels to our four (de-interleaved) 8x8 bit registers (we do this for the top-left and top-right pixels)
5743 // note: loading of each pixel individually is significantly slower than loading two neighboring pixels within one iteration
5744
5745 uint8x8x2_t topLeftPixels;
5746 uint8x8x2_t topRightPixels;
5747
5748 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 0), topLeftPixels, 0);
5749 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 1), topRightPixels, 0);
5750
5751 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 0), topLeftPixels, 1);
5752 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 1), topRightPixels, 1);
5753
5754 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 0), topLeftPixels, 2);
5755 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 1), topRightPixels, 2);
5756
5757 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 0), topLeftPixels, 3);
5758 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 1), topRightPixels, 3);
5759
5760 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 0), topLeftPixels, 4);
5761 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 1), topRightPixels, 4);
5762
5763 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 0), topLeftPixels, 5);
5764 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 1), topRightPixels, 5);
5765
5766 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 0), topLeftPixels, 6);
5767 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 1), topRightPixels, 6);
5768
5769 topLeftPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 0), topLeftPixels, 7);
5770 topRightPixels = vld2_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 1), topRightPixels, 7);
5771
5772
5773 // we load the individual pixels to our four (de-interleaved) 8x8 bit registers (we do this for the bottom-left and bottom-right pixels)
5774
5775 uint8x8x2_t bottomLeftPixels;
5776 uint8x8x2_t bottomRightPixels;
5777
5778 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 0), bottomLeftPixels, 0);
5779 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 1), bottomRightPixels, 0);
5780
5781 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 0), bottomLeftPixels, 1);
5782 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 1), bottomRightPixels, 1);
5783
5784 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 0), bottomLeftPixels, 2);
5785 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 1), bottomRightPixels, 2);
5786
5787 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 0), bottomLeftPixels, 3);
5788 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 1), bottomRightPixels, 3);
5789
5790 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 0), bottomLeftPixels, 4);
5791 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 1), bottomRightPixels, 4);
5792
5793 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 0), bottomLeftPixels, 5);
5794 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 1), bottomRightPixels, 5);
5795
5796 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 0), bottomLeftPixels, 6);
5797 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 1), bottomRightPixels, 6);
5798
5799 bottomLeftPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 0), bottomLeftPixels, 7);
5800 bottomRightPixels = vld2_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 1), bottomRightPixels, 7);
5801
5802
5803
5804 // we determine the multiplication factors for the right pixels - which are already stored in the lower 16 bits
5805 // we need an accuracy of 7 bits (values between 0 and 128):
5806 // 76 54 32 10
5807 // [F3 F2 F1 F0]
5808 const uint16x4_t m64_u_factorsRight_0123 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_0123_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
5809 const uint16x4_t m64_u_factorsRight_4567 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_4567_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
5810
5811 // as we have the pixel information de-interleaved, we can store all 8 interpolation factors together into one 8x8 bit register:
5812 const uint16x8_t m128_u_factorsRight = vcombine_u16(m64_u_factorsRight_0123, m64_u_factorsRight_4567);
5813 const uint8x8_t m64_u_factorsRight = vqmovn_u16(m128_u_factorsRight);
5814 const uint8x8_t m64_u_factorsLeft = vsub_u8(vdup_n_u8(128u), m64_u_factorsRight);
5815
5816
5817
5818 // we determine the intermediate interpolation results for the top row (and we narrow down the 16 bit results to 8 bit results)
5819 uint16x8_t m128_muliplicationChannel_0 = vmull_u8(topLeftPixels.val[0], m64_u_factorsLeft);
5820 uint16x8_t m128_muliplicationChannel_1 = vmull_u8(topLeftPixels.val[1], m64_u_factorsLeft);
5821
5822 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, topRightPixels.val[0], m64_u_factorsRight);
5823 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, topRightPixels.val[1], m64_u_factorsRight);
5824
5825 uint8x8_t m64_topRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
5826 uint8x8_t m64_topRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
5827
5828
5829
5830 // we determine the intermediate interpolation results for the bottom row (and we narrow down the 16 bit results to 8 bit results)
5831 m128_muliplicationChannel_0 = vmull_u8(bottomLeftPixels.val[0], m64_u_factorsLeft);
5832 m128_muliplicationChannel_1 = vmull_u8(bottomLeftPixels.val[1], m64_u_factorsLeft);
5833
5834 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, bottomRightPixels.val[0], m64_u_factorsRight);
5835 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, bottomRightPixels.val[1], m64_u_factorsRight);
5836
5837 uint8x8_t m64_bottomRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
5838 uint8x8_t m64_bottomRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
5839
5840
5841
5842 // finnally we determine the interpolation result between top and bottom row
5843 m128_muliplicationChannel_0 = vmull_u8(m64_topRowChannel_0, m64_u_factorsTop);
5844 m128_muliplicationChannel_1 = vmull_u8(m64_topRowChannel_1, m64_u_factorsTop);
5845
5846 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, m64_bottomRowChannel_0, m64_u_factorsBottom);
5847 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, m64_bottomRowChannel_1, m64_u_factorsBottom);
5848
5849
5850 // we narrow down the interpolation results and we store them
5851 uint8x8x2_t result;
5852 result.val[0] = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
5853 result.val[1] = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
5854
5855 // we write back the results and interleave them automatically
5856 vst2_u8((uint8_t*)targetPixelData, result);
5857
5858 targetPixelData += 8;
5859 }
5860
5861 // we need to process the last pixel again, as this pixel may have received wrong interpolation factors as we always load two successive pixels into our NEON registers
5862 // **TODO** this is just a temporary solution, check how we can avoid this additional step
5863
5864 const unsigned int firstInvalidTargetX = (((sourceWidth - 1u) << 16u) - targetOffsetX_fixed16) / sourceX_T_targetX_fixed16;
5865
5866 for (unsigned int x = firstInvalidTargetX; x < targetWidth; ++x)
5867 {
5868 const unsigned int lastSourcePixelPosition_fixed16 = minmax<int>(0, targetOffsetX_fixed16 + int(sourceX_T_targetX_fixed16 * x), (sourceWidth - 1u) << 16u);
5869
5870 const unsigned int lastSourcePixelLeft = lastSourcePixelPosition_fixed16 >> 16u;
5871 ocean_assert(lastSourcePixelLeft < sourceWidth);
5872 const unsigned int lastSourcePixelRight = min(lastSourcePixelLeft + 1u, sourceWidth - 1u);
5873
5874 const unsigned int factorRight_fixed16 = lastSourcePixelPosition_fixed16 & 0x0000FFFFu;
5875
5876 const unsigned int factorRight = factorRight_fixed16 >> 9u;
5877 const unsigned int factorLeft = 128u - factorRight;
5878
5879 for (unsigned int c = 0u; c < 2u; ++c)
5880 {
5881 ((uint8_t*)(targetPixelData - (targetWidth - x)))[c] = ((((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelRight))[c] * factorRight) * (128u - factorBottom)
5882 + (((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelRight))[c] * factorRight) * factorBottom + 8192u) >> 14u;
5883 }
5884 }
5885 }
5886}
5887
5888#endif // OCEAN_WE_KEEP_THIS_IMPLEMENTATION_AS_WE_NEED_THIS_TO_FOR_A_NEW_NEON_IMPLEMENTATION
5889
5890#ifdef OCEAN_WE_KEEP_ALSO_THIS_SLOW_IMPLEMENTATION_SHOWING_A_SLIGHTLY_DIFFERENT_APPROACH
5891
5892template <>
5893inline void FrameInterpolatorBilinear::scale8BitPerChannelSubset7BitPrecisionNEON<2u, 8u>(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
5894{
5895 ocean_assert(source != nullptr && target != nullptr);
5896 ocean_assert(sourceWidth >= 2u && sourceWidth <= 65535u);
5897 ocean_assert(sourceHeight >= 0u && sourceHeight <= 65535u);
5898 ocean_assert(targetWidth >= 8u && targetWidth <= 65535u)
5899 ocean_assert(targetHeight >= 1u && targetHeight <= 65535u);
5900 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
5901
5902 ocean_assert(sourcePaddingElements == 0u); // not supported
5903 ocean_assert(targetPaddingElements == 0u);
5904
5905 using PixelType = typename DataType<uint8_t, 2u>::Type;
5906
5907 PixelType* targetPixelData = (PixelType*)target + firstTargetRow * targetWidth;
5908 const PixelType* const sourcePixelData = (const PixelType*)source;
5909
5910 // our offset values for the four left pixels in relation to the first pixel of the row
5911 unsigned int leftOffsets[8];
5912
5913 // our color values of the eight top and bottom pixels (32 bit = 16 bit left and 16 bit right)
5914 unsigned int topPixels[8];
5915 unsigned int bottomPixels[8];
5916
5917 // this function uses fixed point numbers with 16 bit for the calculation of the interpolation positions and factors:
5918 // fixedPointLocation = floatLocation * 2^16
5919 //
5920 // [FEDCBA98, 76543210]
5921 // [pixel , subpixel]
5922 //
5923 // fixedPointLocation = pixel + subpixel / 2^16
5924 //
5925 // Thus, the upper 16 bit represent the location of e.g., the left pixel (for the linear interpolation)
5926 // while the lower 16 bit represent one of both interpolation factors (and 2^16 - subpixel represents the second interpolation factor)
5927
5928 const unsigned int sourceX_T_targetX_fixed16 = (unsigned int)(double(0x10000u) * sourceX_s_targetX + 0.5);
5929 const unsigned int sourceY_T_targetY_fixed16 = (unsigned int)(double(0x10000u) * sourceY_s_targetY + 0.5);
5930
5931 const int targetOffsetX_fixed16 = (int)(double(0x10000u) * ((sourceX_s_targetX * 0.5) - 0.5) + 0.5);
5932 const int targetOffsetY_fixed16 = (int)(double(0x10000u) * ((sourceY_s_targetY * 0.5) - 0.5) + 0.5);
5933
5934 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
5935 const uint32x4_t m128_u_sourceX_T_targetX_fixed16 = vdupq_n_u32(sourceX_T_targetX_fixed16);
5936
5937 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
5938 const int32x4_t m128_s_targetOffsetX_fixed16 = vdupq_n_s32(targetOffsetX_fixed16);
5939
5940 // we store 4 integers: [sourceWidth - 2, sourceWidth - 2, sourceWidth - 2, sourceWidth - 2]
5941 const uint32x4_t m128_u_sourceWidth_2 = vdupq_n_u32(sourceWidth - 2u);
5942
5943 // we store 4 integers: [0, 0, 0, 0]
5944 const int32x4_t m128_s_zero = vdupq_n_s32(0);
5945
5946 const unsigned int u_0123[4] = {0u, 1u, 2u, 3u};
5947 const uint32x4_t m128_u_0123 = vld1q_u32(u_0123);
5948
5949 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
5950 {
5951 const unsigned int sourceY_fixed16 = minmax<int>(0, targetOffsetY_fixed16 + int(sourceY_T_targetY_fixed16 * y), (sourceHeight - 1u) << 16u);
5952
5953 const unsigned int sourceRowTop = sourceY_fixed16 >> 16u; // we must not round here
5954 const unsigned int factorBottom_fixed16 = sourceY_fixed16 & 0x0000FFFFu;
5955 const unsigned int factorBottom = factorBottom_fixed16 >> 9u;
5956
5957 const uint8x8_t m64_u_factorsBottom = vdup_n_u8(factorBottom);
5958 // factorTop = 128 - factorBottom
5959 const uint8x8_t m64_u_factorsTop = vdup_n_u8(128u - factorBottom);
5960
5961 const unsigned int sourceRowBottom = min(sourceRowTop + 1u, sourceHeight - 1u);
5962
5963 const PixelType* const sourceTopRowPixelData = sourcePixelData + sourceRowTop * sourceWidth;
5964 const PixelType* const sourceBottomRowPixelData = sourcePixelData + sourceRowBottom * sourceWidth;
5965
5966 for (unsigned int x = 0; x < targetWidth; x += 8u)
5967 {
5968 if (x + 8u > targetWidth)
5969 {
5970 // the last iteration will not fit into the output frame,
5971 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
5972
5973 ocean_assert(x >= 8u && targetWidth > 8u);
5974 const unsigned int newX = targetWidth - 8u;
5975
5976 ocean_assert(x > newX);
5977 targetPixelData -= x - newX;
5978
5979 x = newX;
5980
5981 // the for loop will stop after this iteration
5982 ocean_assert(!(x + 8u < targetWidth));
5983 }
5984
5985
5986 // we need four successive x coordinate floats:
5987 // [x + 3, x + 2, x + 1; x + 0]
5988 const uint32x4_t m128_u_x_0123 = vaddq_u32(vdupq_n_u32(x), m128_u_0123);
5989 const uint32x4_t m128_u_x_4567 = vaddq_u32(vdupq_n_u32(x + 4u), m128_u_0123);
5990
5991 // we calculate the four source locations for our four target locations
5992 const int32x4_t m128_s_sourceX_0123_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_0123))));
5993 const uint32x4_t m128_u_sourceX_0123_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_0123_fixed16);
5994
5995 const int32x4_t m128_s_sourceX_4567_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_4567))));
5996 const uint32x4_t m128_u_sourceX_4567_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_4567_fixed16);
5997
5998 // now we determine the pixel/integer accurate source locations
5999 // m128_u_left = min(floor(m128_f_sourceX), sourceWidth - 2)
6000 const uint32x4_t m128_u_left_0123 = vminq_u32(vshrq_n_u32(m128_u_sourceX_0123_fixed16, 16), m128_u_sourceWidth_2); // not vrshrq_n_u32 as we must not round here
6001 const uint32x4_t m128_u_left_4567 = vminq_u32(vshrq_n_u32(m128_u_sourceX_4567_fixed16, 16), m128_u_sourceWidth_2);
6002
6003 // we store the offsets we have calculated
6004 vst1q_u32(leftOffsets + 0, m128_u_left_0123);
6005 vst1q_u32(leftOffsets + 4, m128_u_left_4567);
6006
6007
6008
6009 // we load the left and the right pixels into an intermediate buffer
6010 // with following pattern (with top-left TL, and top-right TR):
6011 // F E D C B A 9 8 7 6 5 4 3 2 1 0
6012 // [TR3 TR3 TL3 TL3 TR2 TR2 TL2 TL2 TR1 TR1 TL1 TL1 TR0 TR0 TL0 TL0]
6013 // [TR7 TR7 TL7 TL7 TR6 TR6 TL6 TL6 TR5 TR5 TL5 TL5 TR4 TR4 TL4 TL4]
6014
6015 for (unsigned int n = 0u; n < 8u; ++n)
6016 {
6017 topPixels[n] = *(unsigned int*)(sourceTopRowPixelData + leftOffsets[n]);
6018 }
6019
6020 const uint16x8_t m128_topPixels_0123 = vreinterpretq_u16_u32(vld1q_u32(topPixels + 0));
6021 const uint16x8_t m128_topPixels_4567 = vreinterpretq_u16_u32(vld1q_u32(topPixels + 4));
6022
6023 for (unsigned int n = 0u; n < 8u; ++n)
6024 {
6025 bottomPixels[n] = *(unsigned int*)(sourceBottomRowPixelData + leftOffsets[n]);
6026 }
6027
6028 const uint16x8_t m128_bottomPixels_0123 = vreinterpretq_u16_u32(vld1q_u32(bottomPixels + 0));
6029 const uint16x8_t m128_bottomPixels_4567 = vreinterpretq_u16_u32(vld1q_u32(bottomPixels + 4));
6030
6031
6032 // we determine the multiplication factors for the right pixels - which are already stored in the lower 16 bits
6033 // we need an accuracy of 7 bits (values between 0 and 128):
6034 // 76 54 32 10
6035 // [F3 F2 F1 F0]
6036 const uint16x4_t m64_u_factorsRight_0123 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_0123_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6037 const uint16x4_t m64_u_factorsRight_4567 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_4567_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6038
6039 // as we will have the pixel information de-interleaved, we can store all 8 interpolation factors together into one 8x8 bit register:
6040 const uint16x8_t m128_u_factorsRight = vcombine_u16(m64_u_factorsRight_0123, m64_u_factorsRight_4567);
6041 const uint8x8_t m64_u_factorsRight = vqmovn_u16(m128_u_factorsRight);
6042
6043 // nw we have the interpolation factors for 8 left and 8 right pixels:
6044 // 7 6 5 4 3 2 1 0
6045 // [F7 F6 F5 F4 F3 F2 F1 F0]
6046 const uint8x8_t m64_u_factorsLeft = vsub_u8(vdup_n_u8(128u), m64_u_factorsRight);
6047
6048
6049 // we de-interleave the top pixels to left and right pixels:
6050 // F E D C B A 9 8 7 6 5 4 3 2 1 0
6051 // [TL7 TL7 TL6 TL6 TL5 TL5 TL4 TL4 TL3 TL3 TL2 TL2 TL1 TL1 TL0 TL0]
6052 // [TR7 TR7 TR6 TR6 TR5 TR5 TR4 TR4 TR3 TR3 TR2 TR2 TR1 TR1 TR0 TR0]
6053 const uint16x8x2_t m2_128_topPixelsLeftRight = vuzpq_u16(m128_topPixels_0123, m128_topPixels_4567);
6054
6055 // we de-interleave the pixels again to separate channel 0 and channel 1:
6056 // 7 6 5 4 3 2 1 0
6057 // channel 0: [TL7 TL6 TL5 TL4 TL3 TL2 TL1 TL0]
6058 // channel 1: [TL7 TL6 TL5 TL4 TL3 TL2 TL1 TL0]
6059 const uint8x8x2_t m2_64_topPixelsLeft_channels_01 = vuzp_u8(vget_low_u8(vreinterpretq_u8_u16(m2_128_topPixelsLeftRight.val[0])), vget_high_u8(vreinterpretq_u8_u16(m2_128_topPixelsLeftRight.val[0])));
6060 const uint8x8x2_t m2_64_topPixelsRight_channels_01 = vuzp_u8(vget_low_u8(vreinterpretq_u8_u16(m2_128_topPixelsLeftRight.val[1])), vget_high_u8(vreinterpretq_u8_u16(m2_128_topPixelsLeftRight.val[1])));
6061
6062 const uint8x8_t& m64_topPixelsLeft_channel_0 = m2_64_topPixelsLeft_channels_01.val[0];
6063 const uint8x8_t& m64_topPixelsLeft_channel_1 = m2_64_topPixelsLeft_channels_01.val[1];
6064
6065 const uint8x8_t& m64_topPixelsRight_channel_0 = m2_64_topPixelsRight_channels_01.val[0];
6066 const uint8x8_t& m64_topPixelsRight_channel_1 = m2_64_topPixelsRight_channels_01.val[1];
6067
6068
6069 // we determine the intermediate interpolation results for the top row (and we narrow down the 16 bit results 8 bit results)
6070 uint16x8_t m128_muliplication_channel_0 = vmull_u8(m64_topPixelsLeft_channel_0, m64_u_factorsLeft);
6071 uint16x8_t m128_muliplication_channel_1 = vmull_u8(m64_topPixelsLeft_channel_1, m64_u_factorsLeft);
6072
6073 m128_muliplication_channel_0 = vmlal_u8(m128_muliplication_channel_0, m64_topPixelsRight_channel_0, m64_u_factorsRight);
6074 m128_muliplication_channel_1 = vmlal_u8(m128_muliplication_channel_1, m64_topPixelsRight_channel_1, m64_u_factorsRight);
6075
6076 const uint8x8_t m64_topRow_channel_0 = vrshrn_n_u16(m128_muliplication_channel_0, 7);
6077 const uint8x8_t m64_topRow_channel_1 = vrshrn_n_u16(m128_muliplication_channel_1, 7);
6078
6079
6080 // we proceed with the bottom pixels (as we did with the top pixels)
6081 const uint16x8x2_t m2_128_bottomPixelsLeftRight = vuzpq_u16(m128_bottomPixels_0123, m128_bottomPixels_4567);
6082
6083 const uint8x8x2_t m2_64_bottomPixelsLeft_channels_01 = vuzp_u8(vget_low_u8(vreinterpretq_u8_u16(m2_128_bottomPixelsLeftRight.val[0])), vget_high_u8(vreinterpretq_u8_u16(m2_128_bottomPixelsLeftRight.val[0])));
6084 const uint8x8x2_t m2_64_bottomPixelsRight_channels_01 = vuzp_u8(vget_low_u8(vreinterpretq_u8_u16(m2_128_bottomPixelsLeftRight.val[1])), vget_high_u8(vreinterpretq_u8_u16(m2_128_bottomPixelsLeftRight.val[1])));
6085
6086 const uint8x8_t& m64_bottomPixelsLeft_channel_0 = m2_64_bottomPixelsLeft_channels_01.val[0];
6087 const uint8x8_t& m64_bottomPixelsLeft_channel_1 = m2_64_bottomPixelsLeft_channels_01.val[1];
6088
6089 const uint8x8_t& m64_bottomPixelsRight_channel_0 = m2_64_bottomPixelsRight_channels_01.val[0];
6090 const uint8x8_t& m64_bottomPixelsRight_channel_1 = m2_64_bottomPixelsRight_channels_01.val[1];
6091
6092
6093 // we determine the intermediate interpolation results for the bottom row (and we narrow down the 16 bit results 8 bit results)
6094 m128_muliplication_channel_0 = vmull_u8(m64_bottomPixelsLeft_channel_0, m64_u_factorsLeft);
6095 m128_muliplication_channel_1 = vmull_u8(m64_bottomPixelsLeft_channel_1, m64_u_factorsLeft);
6096
6097 m128_muliplication_channel_0 = vmlal_u8(m128_muliplication_channel_0, m64_bottomPixelsRight_channel_0, m64_u_factorsRight);
6098 m128_muliplication_channel_1 = vmlal_u8(m128_muliplication_channel_1, m64_bottomPixelsRight_channel_1, m64_u_factorsRight);
6099
6100 const uint8x8_t m64_bottomRow_channel_0 = vrshrn_n_u16(m128_muliplication_channel_0, 7);
6101 const uint8x8_t m64_bottomRow_channel_1 = vrshrn_n_u16(m128_muliplication_channel_1, 7);
6102
6103
6104 // finnally we determine the interpolation result between top and bottom row
6105 m128_muliplication_channel_0 = vmull_u8(m64_topRow_channel_0, m64_u_factorsTop);
6106 m128_muliplication_channel_1 = vmull_u8(m64_topRow_channel_1, m64_u_factorsTop);
6107
6108 m128_muliplication_channel_0 = vmlal_u8(m128_muliplication_channel_0, m64_bottomRow_channel_0, m64_u_factorsBottom);
6109 m128_muliplication_channel_1 = vmlal_u8(m128_muliplication_channel_1, m64_bottomRow_channel_1, m64_u_factorsBottom);
6110
6111
6112 // we narrow down the interpolation results and we store them
6113 uint8x8x2_t m2_64_result;
6114 m2_64_result.val[0] = vrshrn_n_u16(m128_muliplication_channel_0, 7);
6115 m2_64_result.val[1] = vrshrn_n_u16(m128_muliplication_channel_1, 7);
6116
6117 // we write back the results and interleave them automatically
6118 vst2_u8((uint8_t*)targetPixelData, m2_64_result);
6119
6120 targetPixelData += 8;
6121 }
6122
6123 // we need to process the last pixel again, as this pixel may have received wrong interpolation factors as we always load two successive pixels into our NEON registers
6124 // **TODO** this is just a temporary solution, check how we can avoid this additional step
6125
6126 const unsigned int firstInvalidTargetX = (((sourceWidth - 1u) << 16u) - targetOffsetX_fixed16) / sourceX_T_targetX_fixed16;
6127
6128 for (unsigned int x = firstInvalidTargetX; x < targetWidth; ++x)
6129 {
6130 const unsigned int lastSourcePixelPosition_fixed16 = minmax<int>(0, targetOffsetX_fixed16 + int(sourceX_T_targetX_fixed16 * x), (sourceWidth - 1u) << 16u);
6131
6132 const unsigned int lastSourcePixelLeft = lastSourcePixelPosition_fixed16 >> 16u;
6133 ocean_assert(lastSourcePixelLeft < sourceWidth);
6134 const unsigned int lastSourcePixelRight = min(lastSourcePixelLeft + 1u, sourceWidth - 1u);
6135
6136 const unsigned int factorRight_fixed16 = lastSourcePixelPosition_fixed16 & 0x0000FFFFu;
6137
6138 const unsigned int factorRight = factorRight_fixed16 >> 9u;
6139 const unsigned int factorLeft = 128u - factorRight;
6140
6141 for (unsigned int c = 0u; c < 2u; ++c)
6142 {
6143 ((uint8_t*)(targetPixelData - (targetWidth - x)))[c] = ((((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelRight))[c] * factorRight) * (128u - factorBottom)
6144 + (((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelRight))[c] * factorRight) * factorBottom + 8192u) >> 14u;
6145 }
6146 }
6147 }
6148}
6149
6150#endif // OCEAN_WE_KEEP_ALSO_THIS_SLOW_IMPLEMENTATION_SHOWING_A_SLIGHTLY_DIFFERENT_APPROACH
6151
6152#ifdef OCEAN_WE_KEEP_THIS_IMPLEMENTATION_AS_WE_NEED_THIS_TO_FOR_A_NEW_NEON_IMPLEMENTATION
6153
6154template <>
6155inline void FrameInterpolatorBilinear::scale8BitPerChannelSubset7BitPrecisionNEON<3u, 8u>(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
6156{
6157 ocean_assert(source != nullptr && target != nullptr);
6158 ocean_assert(sourceWidth >= 2u && sourceWidth <= 65535u);
6159 ocean_assert(sourceHeight >= 1u && sourceHeight <= 65535u);
6160 ocean_assert(targetWidth >= 8u && targetWidth <= 65535u);
6161 ocean_assert(targetHeight >= 1u && targetHeight <= 65535u);
6162 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
6163
6164 ocean_assert(sourcePaddingElements == 0u); // not supported
6165 ocean_assert(targetPaddingElements == 0u);
6166
6167 using PixelType = typename DataType<uint8_t, 3u>::Type;
6168
6169 PixelType* targetPixelData = (PixelType*)target + firstTargetRow * targetWidth;
6170 const PixelType* const sourcePixelData = (const PixelType*)source;
6171
6172 // our offset values for the eight left pixels in relation to the first pixel of the row
6173 unsigned int leftOffsets[8];
6174
6175 // this function uses fixed point numbers with 16 bit for the calculation of the interpolation positions and factors:
6176 // fixedPointLocation = floatLocation * 2^16
6177 //
6178 // [FEDCBA98, 76543210]
6179 // [pixel , subpixel]
6180 //
6181 // fixedPointLocation = pixel + subpixel / 2^16
6182 //
6183 // Thus, the upper 16 bit represent the location of e.g., the left pixel (for the linear interpolation)
6184 // while the lower 16 bit represent one of both interpolation factors (and 2^16 - subpixel represents the second interpolation factor)
6185
6186 const unsigned int sourceX_T_targetX_fixed16 = (unsigned int)(double(0x10000u) * sourceX_s_targetX + 0.5);
6187 const unsigned int sourceY_T_targetY_fixed16 = (unsigned int)(double(0x10000u) * sourceY_s_targetY + 0.5);
6188
6189 const int targetOffsetX_fixed16 = (int)(double(0x10000u) * ((sourceX_s_targetX * 0.5) - 0.5) + 0.5);
6190 const int targetOffsetY_fixed16 = (int)(double(0x10000u) * ((sourceY_s_targetY * 0.5) - 0.5) + 0.5);
6191
6192 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
6193 const uint32x4_t m128_u_sourceX_T_targetX_fixed16 = vdupq_n_u32(sourceX_T_targetX_fixed16);
6194
6195 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
6196 const int32x4_t m128_s_targetOffsetX_fixed16 = vdupq_n_s32(targetOffsetX_fixed16);
6197
6198 // we store 4 integers: [sourceWidth - 2, sourceWidth - 2, sourceWidth - 2, sourceWidth - 2]
6199 const uint32x4_t m128_u_sourceWidth_2 = vdupq_n_u32(sourceWidth - 2u);
6200
6201 // we store 4 integers: [0, 0, 0, 0]
6202 const int32x4_t m128_s_zero = vdupq_n_s32(0);
6203
6204 const unsigned int u_0123[4] = {0u, 1u, 2u, 3u};
6205 const uint32x4_t m128_u_0123 = vld1q_u32(u_0123);
6206
6207 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
6208 {
6209 const unsigned int sourceY_fixed16 = minmax<int>(0, targetOffsetY_fixed16 + int(sourceY_T_targetY_fixed16 * y), (sourceHeight - 1u) << 16u);
6210
6211 const unsigned int sourceRowTop = sourceY_fixed16 >> 16u; // we must not round here
6212 const unsigned int factorBottom_fixed16 = sourceY_fixed16 & 0x0000FFFFu;
6213 const unsigned int factorBottom = factorBottom_fixed16 >> 9u;
6214
6215 const uint8x8_t m64_u_factorsBottom = vdup_n_u8(factorBottom);
6216 // factorTop = 128 - factorBottom
6217 const uint8x8_t m64_u_factorsTop = vdup_n_u8(128u - factorBottom);
6218
6219 const unsigned int sourceRowBottom = min(sourceRowTop + 1u, sourceHeight - 1u);
6220
6221 const PixelType* const sourceTopRowPixelData = sourcePixelData + sourceRowTop * sourceWidth;
6222 const PixelType* const sourceBottomRowPixelData = sourcePixelData + sourceRowBottom * sourceWidth;
6223
6224 for (unsigned int x = 0; x < targetWidth; x += 8u)
6225 {
6226 if (x + 8u > targetWidth)
6227 {
6228 // the last iteration will not fit into the output frame,
6229 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
6230
6231 ocean_assert(x >= 8u && targetWidth > 8u);
6232 const unsigned int newX = targetWidth - 8u;
6233
6234 ocean_assert(x > newX);
6235 targetPixelData -= x - newX;
6236
6237 x = newX;
6238
6239 // the for loop will stop after this iteration
6240 ocean_assert(!(x + 8u < targetWidth));
6241 }
6242
6243
6244 // we need four successive x coordinate floats:
6245 // [x + 3, x + 2, x + 1; x + 0]
6246 const uint32x4_t m128_u_x_0123 = vaddq_u32(vdupq_n_u32(x), m128_u_0123);
6247 const uint32x4_t m128_u_x_4567 = vaddq_u32(vdupq_n_u32(x + 4u), m128_u_0123);
6248
6249 // we calculate the four source locations for our four target locations
6250 const int32x4_t m128_s_sourceX_0123_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_0123))));
6251 const uint32x4_t m128_u_sourceX_0123_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_0123_fixed16);
6252
6253 const int32x4_t m128_s_sourceX_4567_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_4567))));
6254 const uint32x4_t m128_u_sourceX_4567_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_4567_fixed16);
6255
6256 // now we determine the pixel/integer accurate source locations
6257 // m128_u_left = min(floor(m128_f_sourceX), sourceWidth - 2)
6258 const uint32x4_t m128_u_left_0123 = vminq_u32(vshrq_n_u32(m128_u_sourceX_0123_fixed16, 16), m128_u_sourceWidth_2); // not vrshrq_n_u32 as we must not round here
6259 const uint32x4_t m128_u_left_4567 = vminq_u32(vshrq_n_u32(m128_u_sourceX_4567_fixed16, 16), m128_u_sourceWidth_2);
6260
6261 // we store the offsets we have calculated
6262 vst1q_u32(leftOffsets + 0, m128_u_left_0123);
6263 vst1q_u32(leftOffsets + 4, m128_u_left_4567);
6264
6265
6266
6267 // we load the individal pixels to our four (de-interleaved) 8x8 bit registers (we do this for the top-left and top-right pixels)
6268 // note: loading of each pixel individually is significantly slower than loading two neighboring pixels within one iteration
6269
6270 uint8x8x3_t topLeftPixels;
6271 uint8x8x3_t topRightPixels;
6272
6273 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 0), topLeftPixels, 0);
6274 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 1), topRightPixels, 0);
6275
6276 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 0), topLeftPixels, 1);
6277 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 1), topRightPixels, 1);
6278
6279 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 0), topLeftPixels, 2);
6280 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 1), topRightPixels, 2);
6281
6282 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 0), topLeftPixels, 3);
6283 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 1), topRightPixels, 3);
6284
6285 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 0), topLeftPixels, 4);
6286 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 1), topRightPixels, 4);
6287
6288 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 0), topLeftPixels, 5);
6289 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 1), topRightPixels, 5);
6290
6291 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 0), topLeftPixels, 6);
6292 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 1), topRightPixels, 6);
6293
6294 topLeftPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 0), topLeftPixels, 7);
6295 topRightPixels = vld3_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 1), topRightPixels, 7);
6296
6297
6298 // we load the individal pixels to our four (de-interleaved) 8x8 bit registers (we do this for the bottom-left and bottom-right pixels)
6299
6300 uint8x8x3_t bottomLeftPixels;
6301 uint8x8x3_t bottomRightPixels;
6302
6303 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 0), bottomLeftPixels, 0);
6304 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 1), bottomRightPixels, 0);
6305
6306 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 0), bottomLeftPixels, 1);
6307 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 1), bottomRightPixels, 1);
6308
6309 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 0), bottomLeftPixels, 2);
6310 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 1), bottomRightPixels, 2);
6311
6312 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 0), bottomLeftPixels, 3);
6313 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 1), bottomRightPixels, 3);
6314
6315 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 0), bottomLeftPixels, 4);
6316 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 1), bottomRightPixels, 4);
6317
6318 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 0), bottomLeftPixels, 5);
6319 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 1), bottomRightPixels, 5);
6320
6321 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 0), bottomLeftPixels, 6);
6322 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 1), bottomRightPixels, 6);
6323
6324 bottomLeftPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 0), bottomLeftPixels, 7);
6325 bottomRightPixels = vld3_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 1), bottomRightPixels, 7);
6326
6327
6328
6329 // we determine the multiplication factors for the right pixels - which are already stored in the lower 16 bits
6330 // we need an accuracy of 7 bits (values between 0 and 128):
6331 // 76 54 32 10
6332 // [F3 F2 F1 F0]
6333 const uint16x4_t m64_u_factorsRight_0123 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_0123_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6334 const uint16x4_t m64_u_factorsRight_4567 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_4567_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6335
6336 // as we have the pixel information de-interleaved, we can store all 8 interpolation factors together into one 8x8 bit register:
6337 const uint16x8_t m128_u_factorsRight = vcombine_u16(m64_u_factorsRight_0123, m64_u_factorsRight_4567);
6338 const uint8x8_t m64_u_factorsRight = vqmovn_u16(m128_u_factorsRight);
6339 const uint8x8_t m64_u_factorsLeft = vsub_u8(vdup_n_u8(128u), m64_u_factorsRight);
6340
6341
6342
6343 // we determine the intermediate interpolation results for the top row (and we narrow down the 16 bit results 8 bit results)
6344 uint16x8_t m128_muliplicationChannel_0 = vmull_u8(topLeftPixels.val[0], m64_u_factorsLeft);
6345 uint16x8_t m128_muliplicationChannel_1 = vmull_u8(topLeftPixels.val[1], m64_u_factorsLeft);
6346 uint16x8_t m128_muliplicationChannel_2 = vmull_u8(topLeftPixels.val[2], m64_u_factorsLeft);
6347
6348 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, topRightPixels.val[0], m64_u_factorsRight);
6349 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, topRightPixels.val[1], m64_u_factorsRight);
6350 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, topRightPixels.val[2], m64_u_factorsRight);
6351
6352 uint8x8_t m64_topRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6353 uint8x8_t m64_topRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6354 uint8x8_t m64_topRowChannel_2 = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6355
6356
6357
6358 // we determine the intermediate interpolation results for the bottom row (and we narrow down the 16 bit results 8 bit results)
6359 m128_muliplicationChannel_0 = vmull_u8(bottomLeftPixels.val[0], m64_u_factorsLeft);
6360 m128_muliplicationChannel_1 = vmull_u8(bottomLeftPixels.val[1], m64_u_factorsLeft);
6361 m128_muliplicationChannel_2 = vmull_u8(bottomLeftPixels.val[2], m64_u_factorsLeft);
6362
6363 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, bottomRightPixels.val[0], m64_u_factorsRight);
6364 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, bottomRightPixels.val[1], m64_u_factorsRight);
6365 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, bottomRightPixels.val[2], m64_u_factorsRight);
6366
6367 uint8x8_t m64_bottomRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6368 uint8x8_t m64_bottomRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6369 uint8x8_t m64_bottomRowChannel_2 = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6370
6371
6372
6373 // finnally we determine the interpolation result between top and bottom row
6374 m128_muliplicationChannel_0 = vmull_u8(m64_topRowChannel_0, m64_u_factorsTop);
6375 m128_muliplicationChannel_1 = vmull_u8(m64_topRowChannel_1, m64_u_factorsTop);
6376 m128_muliplicationChannel_2 = vmull_u8(m64_topRowChannel_2, m64_u_factorsTop);
6377
6378 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, m64_bottomRowChannel_0, m64_u_factorsBottom);
6379 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, m64_bottomRowChannel_1, m64_u_factorsBottom);
6380 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, m64_bottomRowChannel_2, m64_u_factorsBottom);
6381
6382
6383 // we narrow down the interpolation results and we store them
6384 uint8x8x3_t result;
6385 result.val[0] = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6386 result.val[1] = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6387 result.val[2] = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6388
6389 // we write back the results and interleave them automatically
6390 vst3_u8((uint8_t*)targetPixelData, result);
6391
6392 targetPixelData += 8;
6393 }
6394
6395 // we need to process the last pixel again, as this pixel may have received wrong interpolation factors as we always load two successive pixels into our NEON registers
6396 // **TODO** this is just a temporary solution, check how we can avoid this additional step
6397
6398 const unsigned int firstInvalidTargetX = (((sourceWidth - 1u) << 16u) - targetOffsetX_fixed16) / sourceX_T_targetX_fixed16;
6399
6400 for (unsigned int x = firstInvalidTargetX; x < targetWidth; ++x)
6401 {
6402 const unsigned int lastSourcePixelPosition_fixed16 = minmax<int>(0, targetOffsetX_fixed16 + int(sourceX_T_targetX_fixed16 * x), (sourceWidth - 1u) << 16u);
6403
6404 const unsigned int lastSourcePixelLeft = lastSourcePixelPosition_fixed16 >> 16u;
6405 ocean_assert(lastSourcePixelLeft < sourceWidth);
6406 const unsigned int lastSourcePixelRight = min(lastSourcePixelLeft + 1u, sourceWidth - 1u);
6407
6408 const unsigned int factorRight_fixed16 = lastSourcePixelPosition_fixed16 & 0x0000FFFFu;
6409
6410 const unsigned int factorRight = factorRight_fixed16 >> 9u;
6411 const unsigned int factorLeft = 128u - factorRight;
6412
6413 for (unsigned int c = 0u; c < 3u; ++c)
6414 {
6415 ((uint8_t*)(targetPixelData - (targetWidth - x)))[c] = ((((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelRight))[c] * factorRight) * (128u - factorBottom)
6416 + (((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelRight))[c] * factorRight) * factorBottom + 8192u) >> 14u;
6417 }
6418 }
6419 }
6420}
6421
6422#endif // OCEAN_WE_KEEP_THIS_IMPLEMENTATION_AS_WE_NEED_THIS_TO_FOR_A_NEW_NEON_IMPLEMENTATION
6423
6424#ifdef OCEAN_WE_KEEP_ALSO_THIS_SLOW_IMPLEMENTATION_SHOWING_A_MORE_GENERIC_APPROACH
6425
6426/// \cond DOXYGEN_DO_NOT_DOCUMENT
6427
6428template <>
6429inline void FrameInterpolatorBilinear::resize8BitPerChannelSubset7BitPrecisionNEON<4u, 8u>(const uint8_t* source, uint8_t* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
6430{
6431 ocean_assert(source != nullptr && target != nullptr);
6432 ocean_assert(sourceWidth >= 2u && sourceWidth <= 65535u);
6433 ocean_assert(sourceHeight >= 1u && sourceHeight <= 65535u);
6434 ocean_assert(targetWidth >= 8u && targetWidth <= 65535u);
6435 ocean_assert(targetHeight >= 1u && targetHeight <= 65535u);
6436 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
6437
6438 ocean_assert(sourcePaddingElements == 0u); // not supported
6439 ocean_assert(targetPaddingElements == 0u);
6440
6441 using PixelType = typename DataType<uint8_t, 4u>::Type;
6442
6443 PixelType* targetPixelData = (PixelType*)target + firstTargetRow * targetWidth;
6444 const PixelType* const sourcePixelData = (const PixelType*)source;
6445
6446 // our offset values for the eight left pixels in relation to the first pixel of the row
6447 unsigned int leftOffsets[8];
6448
6449 // this function uses fixed point numbers with 16 bit for the calculation of const unsigned int sourceX_T_targetX_fixed16 = (unsigned int)(double(0x10000u) * sourceX_s_targetX + 0.5);
6450 const unsigned int sourceY_T_targetY_fixed16 = (unsigned int)(double(0x10000u) * sourceY_s_targetY + 0.5);
6451
6452 // this function uses fixed point numbers with 16 bit for the calculation of the interpolation positions and factors:
6453 // fixedPointLocation = floatLocation * 2^16
6454 //
6455 // [FEDCBA98, 76543210]
6456 // [pixel , subpixel]
6457 //
6458 // fixedPointLocation = pixel + subpixel / 2^16
6459 //
6460 // Thus, the upper 16 bit represent the location of e.g., the left pixel (for the linear interpolation)
6461 // while the lower 16 bit represent one of both interpolation factors (and 2^16 - subpixel represents the second interpolation factor)
6462
6463 const unsigned int sourceX_T_targetX_fixed16 = (unsigned int)(double(0x10000u) * sourceX_s_targetX + 0.5);
6464 const unsigned int sourceY_T_targetY_fixed16 = (unsigned int)(double(0x10000u) * sourceY_s_targetY + 0.5);
6465
6466 const int targetOffsetX_fixed16 = (int)(double(0x10000u) * ((sourceX_s_targetX * 0.5) - 0.5) + 0.5);
6467 const int targetOffsetY_fixed16 = (int)(double(0x10000u) * ((sourceY_s_targetY * 0.5) - 0.5) + 0.5);
6468
6469 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
6470 const uint32x4_t m128_u_sourceX_T_targetX_fixed16 = vdupq_n_u32(sourceX_T_targetX_fixed16);
6471
6472 // we store 4 integers: [sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16, sourceX_T_targetX_fixed16]
6473 const int32x4_t m128_s_targetOffsetX_fixed16 = vdupq_n_s32(targetOffsetX_fixed16);
6474
6475 // we store 4 integers: [sourceWidth - 2, sourceWidth - 2, sourceWidth - 2, sourceWidth - 2]
6476 const uint32x4_t m128_u_sourceWidth_2 = vdupq_n_u32(sourceWidth - 2u);
6477
6478 // we store 4 integers: [0, 0, 0, 0]
6479 const int32x4_t m128_s_zero = vdupq_n_s32(0);
6480
6481 const unsigned int u_0123[4] = {0u, 1u, 2u, 3u};
6482 const uint32x4_t m128_u_0123 = vld1q_u32(u_0123);
6483
6484 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
6485 {
6486 const unsigned int sourceY_fixed16 = minmax<int>(0, targetOffsetY_fixed16 + int(sourceY_T_targetY_fixed16 * y), (sourceHeight - 1u) << 16u);
6487
6488 const unsigned int sourceRowTop = sourceY_fixed16 >> 16u; // we must not round here
6489 const unsigned int factorBottom_fixed16 = sourceY_fixed16 & 0x0000FFFFu;
6490 const unsigned int factorBottom = factorBottom_fixed16 >> 9u;
6491
6492 const uint8x8_t m64_u_factorsBottom = vdup_n_u8(factorBottom);
6493 // factorTop = 128 - factorBottom
6494 const uint8x8_t m64_u_factorsTop = vdup_n_u8(128u - factorBottom);
6495
6496 const unsigned int sourceRowBottom = min(sourceRowTop + 1u, sourceHeight - 1u);
6497
6498 const PixelType* const sourceTopRowPixelData = sourcePixelData + sourceRowTop * sourceWidth;
6499 const PixelType* const sourceBottomRowPixelData = sourcePixelData + sourceRowBottom * sourceWidth;
6500
6501 for (unsigned int x = 0; x < targetWidth; x += 8u)
6502 {
6503 if (x + 8u > targetWidth)
6504 {
6505 // the last iteration will not fit into the output frame,
6506 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
6507
6508 ocean_assert(x >= 8u && targetWidth > 8u);
6509 const unsigned int newX = targetWidth - 8u;
6510
6511 ocean_assert(x > newX);
6512 targetPixelData -= x - newX;
6513
6514 x = newX;
6515
6516 // the for loop will stop after this iteration
6517 ocean_assert(!(x + 8u < targetWidth));
6518 }
6519
6520
6521 // we need four successive x coordinate floats:
6522 // [x + 3, x + 2, x + 1; x + 0]
6523 const uint32x4_t m128_u_x_0123 = vaddq_u32(vdupq_n_u32(x), m128_u_0123);
6524 const uint32x4_t m128_u_x_4567 = vaddq_u32(vdupq_n_u32(x + 4u), m128_u_0123);
6525
6526 // we calculate the four source locations for our four target locations
6527 const int32x4_t m128_s_sourceX_0123_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_0123))));
6528 const uint32x4_t m128_u_sourceX_0123_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_0123_fixed16);
6529
6530 const int32x4_t m128_s_sourceX_4567_fixed16 = vmaxq_s32(m128_s_zero, vaddq_s32(m128_s_targetOffsetX_fixed16, vreinterpretq_s32_u32(vmulq_u32(m128_u_sourceX_T_targetX_fixed16, m128_u_x_4567))));
6531 const uint32x4_t m128_u_sourceX_4567_fixed16 = vreinterpretq_u32_s32(m128_s_sourceX_4567_fixed16);
6532
6533 // now we determine the pixel/integer accurate source locations
6534 // m128_u_left = min(floor(m128_f_sourceX), sourceWidth - 2)
6535 const uint32x4_t m128_u_left_0123 = vminq_u32(vshrq_n_u32(m128_u_sourceX_0123_fixed16, 16), m128_u_sourceWidth_2); // not vrshrq_n_u32 as we must not round here
6536 const uint32x4_t m128_u_left_4567 = vminq_u32(vshrq_n_u32(m128_u_sourceX_4567_fixed16, 16), m128_u_sourceWidth_2);
6537
6538 // we store the offsets we have calculated
6539 vst1q_u32(leftOffsets + 0, m128_u_left_0123);
6540 vst1q_u32(leftOffsets + 4, m128_u_left_4567);
6541
6542
6543
6544 // we load the individal pixels to our four (de-interleaved) 8x8 bit registers (we do this for the top-left and top-right pixels)
6545 // note: loading of each pixel individually is significantly slower than loading two neighboring pixels within one iteration
6546
6547 uint8x8x4_t topLeftPixels;
6548 uint8x8x4_t topRightPixels;
6549
6550 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 0), topLeftPixels, 0);
6551 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[0u] + 1), topRightPixels, 0);
6552
6553 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 0), topLeftPixels, 1);
6554 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[1u] + 1), topRightPixels, 1);
6555
6556 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 0), topLeftPixels, 2);
6557 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[2u] + 1), topRightPixels, 2);
6558
6559 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 0), topLeftPixels, 3);
6560 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[3u] + 1), topRightPixels, 3);
6561
6562 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 0), topLeftPixels, 4);
6563 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[4u] + 1), topRightPixels, 4);
6564
6565 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 0), topLeftPixels, 5);
6566 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[5u] + 1), topRightPixels, 5);
6567
6568 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 0), topLeftPixels, 6);
6569 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[6u] + 1), topRightPixels, 6);
6570
6571 topLeftPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 0), topLeftPixels, 7);
6572 topRightPixels = vld4_lane_u8((uint8_t*)(sourceTopRowPixelData + leftOffsets[7u] + 1), topRightPixels, 7);
6573
6574
6575 // we load the individal pixels to our four (de-interleaved) 8x8 bit registers (we do this for the bottom-left and bottom-right pixels)
6576
6577 uint8x8x4_t bottomLeftPixels;
6578 uint8x8x4_t bottomRightPixels;
6579
6580 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 0), bottomLeftPixels, 0);
6581 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[0u] + 1), bottomRightPixels, 0);
6582
6583 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 0), bottomLeftPixels, 1);
6584 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[1u] + 1), bottomRightPixels, 1);
6585
6586 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 0), bottomLeftPixels, 2);
6587 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[2u] + 1), bottomRightPixels, 2);
6588
6589 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 0), bottomLeftPixels, 3);
6590 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[3u] + 1), bottomRightPixels, 3);
6591
6592 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 0), bottomLeftPixels, 4);
6593 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[4u] + 1), bottomRightPixels, 4);
6594
6595 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 0), bottomLeftPixels, 5);
6596 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[5u] + 1), bottomRightPixels, 5);
6597
6598 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 0), bottomLeftPixels, 6);
6599 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[6u] + 1), bottomRightPixels, 6);
6600
6601 bottomLeftPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 0), bottomLeftPixels, 7);
6602 bottomRightPixels = vld4_lane_u8((uint8_t*)(sourceBottomRowPixelData + leftOffsets[7u] + 1), bottomRightPixels, 7);
6603
6604
6605
6606 // we determine the multiplication factors for the right pixels - which are already stored in the lower 16 bits
6607 // we need an accuracy of 7 bits (values between 0 and 128):
6608 // 76 54 32 10
6609 // [F3 F2 F1 F0]
6610 const uint16x4_t m64_u_factorsRight_0123 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_0123_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6611 const uint16x4_t m64_u_factorsRight_4567 = vrshrn_n_u32(vandq_u32(m128_u_sourceX_4567_fixed16, vdupq_n_u32(0x0000FFFFu)), 9);
6612
6613 // as we have the pixel information de-interleaved, we can store all 8 interpolation factors together into one 8x8 bit register:
6614 const uint16x8_t m128_u_factorsRight = vcombine_u16(m64_u_factorsRight_0123, m64_u_factorsRight_4567);
6615 const uint8x8_t m64_u_factorsRight = vqmovn_u16(m128_u_factorsRight);
6616 const uint8x8_t m64_u_factorsLeft = vsub_u8(vdup_n_u8(128u), m64_u_factorsRight);
6617
6618
6619
6620 // we determine the intermediate interpolation results for the top row (and we narrow down the 16 bit results 8 bit results)
6621 uint16x8_t m128_muliplicationChannel_0 = vmull_u8(topLeftPixels.val[0], m64_u_factorsLeft);
6622 uint16x8_t m128_muliplicationChannel_1 = vmull_u8(topLeftPixels.val[1], m64_u_factorsLeft);
6623 uint16x8_t m128_muliplicationChannel_2 = vmull_u8(topLeftPixels.val[2], m64_u_factorsLeft);
6624 uint16x8_t m128_muliplicationChannel_3 = vmull_u8(topLeftPixels.val[3], m64_u_factorsLeft);
6625
6626 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, topRightPixels.val[0], m64_u_factorsRight);
6627 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, topRightPixels.val[1], m64_u_factorsRight);
6628 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, topRightPixels.val[2], m64_u_factorsRight);
6629 m128_muliplicationChannel_3 = vmlal_u8(m128_muliplicationChannel_3, topRightPixels.val[3], m64_u_factorsRight);
6630
6631 uint8x8_t m64_topRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6632 uint8x8_t m64_topRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6633 uint8x8_t m64_topRowChannel_2 = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6634 uint8x8_t m64_topRowChannel_3 = vrshrn_n_u16(m128_muliplicationChannel_3, 7);
6635
6636
6637
6638 // we determine the intermediate interpolation results for the bottom row (and we narrow down the 16 bit results 8 bit results)
6639 m128_muliplicationChannel_0 = vmull_u8(bottomLeftPixels.val[0], m64_u_factorsLeft);
6640 m128_muliplicationChannel_1 = vmull_u8(bottomLeftPixels.val[1], m64_u_factorsLeft);
6641 m128_muliplicationChannel_2 = vmull_u8(bottomLeftPixels.val[2], m64_u_factorsLeft);
6642 m128_muliplicationChannel_3 = vmull_u8(bottomLeftPixels.val[3], m64_u_factorsLeft);
6643
6644 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, bottomRightPixels.val[0], m64_u_factorsRight);
6645 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, bottomRightPixels.val[1], m64_u_factorsRight);
6646 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, bottomRightPixels.val[2], m64_u_factorsRight);
6647 m128_muliplicationChannel_3 = vmlal_u8(m128_muliplicationChannel_3, bottomRightPixels.val[3], m64_u_factorsRight);
6648
6649 uint8x8_t m64_bottomRowChannel_0 = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6650 uint8x8_t m64_bottomRowChannel_1 = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6651 uint8x8_t m64_bottomRowChannel_2 = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6652 uint8x8_t m64_bottomRowChannel_3 = vrshrn_n_u16(m128_muliplicationChannel_3, 7);
6653
6654
6655
6656 // finnally we determine the interpolation result between top and bottom row
6657 m128_muliplicationChannel_0 = vmull_u8(m64_topRowChannel_0, m64_u_factorsTop);
6658 m128_muliplicationChannel_1 = vmull_u8(m64_topRowChannel_1, m64_u_factorsTop);
6659 m128_muliplicationChannel_2 = vmull_u8(m64_topRowChannel_2, m64_u_factorsTop);
6660 m128_muliplicationChannel_3 = vmull_u8(m64_topRowChannel_3, m64_u_factorsTop);
6661
6662 m128_muliplicationChannel_0 = vmlal_u8(m128_muliplicationChannel_0, m64_bottomRowChannel_0, m64_u_factorsBottom);
6663 m128_muliplicationChannel_1 = vmlal_u8(m128_muliplicationChannel_1, m64_bottomRowChannel_1, m64_u_factorsBottom);
6664 m128_muliplicationChannel_2 = vmlal_u8(m128_muliplicationChannel_2, m64_bottomRowChannel_2, m64_u_factorsBottom);
6665 m128_muliplicationChannel_3 = vmlal_u8(m128_muliplicationChannel_3, m64_bottomRowChannel_3, m64_u_factorsBottom);
6666
6667
6668 // we narrow down the interpolation results and we store them
6669 uint8x8x4_t result;
6670 result.val[0] = vrshrn_n_u16(m128_muliplicationChannel_0, 7);
6671 result.val[1] = vrshrn_n_u16(m128_muliplicationChannel_1, 7);
6672 result.val[2] = vrshrn_n_u16(m128_muliplicationChannel_2, 7);
6673 result.val[3] = vrshrn_n_u16(m128_muliplicationChannel_3, 7);
6674
6675 // we write back the results and interleave them automatically
6676 vst4_u8((uint8_t*)targetPixelData, result);
6677
6678 targetPixelData += 8;
6679 }
6680
6681 // we need to process the last pixel again, as this pixel may have received wrong interpolation factors as we always load two successive pixels into our NEON registers
6682 // **TODO** this is just a temporary solution, check how we can avoid this additional step
6683
6684 const unsigned int firstInvalidTargetX = (((sourceWidth - 1u) << 16u) - targetOffsetX_fixed16) / sourceX_T_targetX_fixed16;
6685
6686 for (unsigned int x = firstInvalidTargetX; x < targetWidth; ++x)
6687 {
6688 const unsigned int lastSourcePixelPosition_fixed16 = minmax<int>(0, targetOffsetX_fixed16 + int(sourceX_T_targetX_fixed16 * x), (sourceWidth - 1u) << 16u);
6689
6690 const unsigned int lastSourcePixelLeft = lastSourcePixelPosition_fixed16 >> 16u;
6691 ocean_assert(lastSourcePixelLeft < sourceWidth);
6692 const unsigned int lastSourcePixelRight = min(lastSourcePixelLeft + 1u, sourceWidth - 1u);
6693
6694 const unsigned int factorRight_fixed16 = lastSourcePixelPosition_fixed16 & 0x0000FFFFu;
6695
6696 const unsigned int factorRight = factorRight_fixed16 >> 9u;
6697 const unsigned int factorLeft = 128u - factorRight;
6698
6699 for (unsigned int c = 0u; c < 4u; ++c)
6700 {
6701 ((uint8_t*)(targetPixelData - (targetWidth - x)))[c] = ((((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceTopRowPixelData + lastSourcePixelRight))[c] * factorRight) * factorTop
6702 + (((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelLeft))[c] * factorLeft + ((const uint8_t*)(sourceBottomRowPixelData + lastSourcePixelRight))[c] * factorRight) * factorBottom + 8192u) >> 14u;
6703 }
6704 }
6705 }
6706}
6707
6708/// \endcond
6709
6710#endif // OCEAN_WE_KEEP_ALSO_THIS_SLOW_IMPLEMENTATION_SHOWING_A_MORE_GENERIC_APPROACH
6711
6712template <>
6713inline void FrameInterpolatorBilinear::interpolateRowVerticalNEON<float>(const float* sourceRowTop, const float* sourceRowBottom, float* targetRow, const unsigned int elements, const float factorBottom)
6714{
6715 ocean_assert(sourceRowTop != nullptr);
6716 ocean_assert(sourceRowBottom != nullptr);
6717 ocean_assert(targetRow != nullptr);
6718 ocean_assert(elements >= 16u);
6719 ocean_assert(factorBottom >= 0.0f && factorBottom <= 1.0f);
6720
6721 // [1.0f, 1.0f, 1.0f, 1.0f]
6722 const float32x4_t constant_1_f_32x4 = vdupq_n_f32(1.0f);
6723
6724 const float32x4_t factorsBottom_f_32x4 = vdupq_n_f32(factorBottom);
6725 const float32x4_t factorsTop_f_32x4 = vsubq_f32(constant_1_f_32x4, factorsBottom_f_32x4); // factorTop = 1 - factorBottom
6726
6727 for (unsigned int n = 0u; n < elements; n += 16u)
6728 {
6729 if (n + 16u > elements)
6730 {
6731 // the last iteration will not fit into the output frame,
6732 // so we simply shift x left by some elements (at most 15) and we will calculate some elements again
6733
6734 ocean_assert(n >= 16u && elements > 16u);
6735 const unsigned int offset = n - (elements - 16u);
6736 ocean_assert(offset < 16u);
6737
6738 sourceRowTop -= offset;
6739 sourceRowBottom -= offset;
6740 targetRow -= offset;
6741
6742 // the for loop will stop after this iteration
6743 ocean_assert(!(n + 16u < elements));
6744 }
6745
6746 // loading the next four 32 bit values from the top and bottom row
6747 const float32x4_t top_03_32x4 = vld1q_f32(sourceRowTop + 0);
6748 const float32x4_t top_47_32x4 = vld1q_f32(sourceRowTop + 4);
6749 const float32x4_t top_8B_32x4 = vld1q_f32(sourceRowTop + 8);
6750 const float32x4_t top_CF_32x4 = vld1q_f32(sourceRowTop + 12);
6751
6752 const float32x4_t bottom_03_32x4 = vld1q_f32(sourceRowBottom + 0);
6753 const float32x4_t bottom_47_32x4 = vld1q_f32(sourceRowBottom + 4);
6754 const float32x4_t bottom_8B_32x4 = vld1q_f32(sourceRowBottom + 8);
6755 const float32x4_t bottom_CF_32x4 = vld1q_f32(sourceRowBottom + 12);
6756
6757 // interpolatedRow_32x4 = top_32x4 * factorsTop + bottom_32x4 * factorsBottom
6758 float32x4_t interpolatedRow_03_32x4 = vmulq_f32(top_03_32x4, factorsTop_f_32x4);
6759 float32x4_t interpolatedRow_47_32x4 = vmulq_f32(top_47_32x4, factorsTop_f_32x4);
6760 float32x4_t interpolatedRow_8B_32x4 = vmulq_f32(top_8B_32x4, factorsTop_f_32x4);
6761 float32x4_t interpolatedRow_CF_32x4 = vmulq_f32(top_CF_32x4, factorsTop_f_32x4);
6762
6763 interpolatedRow_03_32x4 = vmlaq_f32(interpolatedRow_03_32x4, bottom_03_32x4, factorsBottom_f_32x4);
6764 interpolatedRow_47_32x4 = vmlaq_f32(interpolatedRow_47_32x4, bottom_47_32x4, factorsBottom_f_32x4);
6765 interpolatedRow_8B_32x4 = vmlaq_f32(interpolatedRow_8B_32x4, bottom_8B_32x4, factorsBottom_f_32x4);
6766 interpolatedRow_CF_32x4 = vmlaq_f32(interpolatedRow_CF_32x4, bottom_CF_32x4, factorsBottom_f_32x4);
6767
6768 // writing back the four interpolated 32 bit results
6769 vst1q_f32(targetRow + 0, interpolatedRow_03_32x4);
6770 vst1q_f32(targetRow + 4, interpolatedRow_47_32x4);
6771 vst1q_f32(targetRow + 8, interpolatedRow_8B_32x4);
6772 vst1q_f32(targetRow + 12, interpolatedRow_CF_32x4);
6773
6774 sourceRowTop += 16;
6775 sourceRowBottom += 16;
6776 targetRow += 16;
6777 }
6778}
6779
6780template <>
6781inline void FrameInterpolatorBilinear::interpolateRowHorizontalNEON<float, 1u>(const float* extendedSourceRow, float* targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int* interpolationLocations, const float* interpolationFactorsRight)
6782{
6783 ocean_assert(extendedSourceRow != nullptr);
6784 ocean_assert(targetRow != nullptr);
6785 ocean_assert(targetWidth >= 8u);
6786 ocean_assert(interpolationLocations != nullptr);
6787 ocean_assert(interpolationFactorsRight != nullptr);
6788
6789 ocean_assert_and_suppress_unused(channels == 1u, channels);
6790
6791 // [1.0f, 1.0f, 1.0f, 1.0f]
6792 const float32x4_t constant_1_f_32x4 = vdupq_n_f32(1.0f);
6793
6794 for (unsigned int x = 0; x < targetWidth; x += 8u)
6795 {
6796 if (x + 8u > targetWidth)
6797 {
6798 // the last iteration will not fit into the output frame,
6799 // so we simply shift x left by some pixels (at most 7) and we will calculate some pixels again
6800
6801 ocean_assert(x >= 8u && targetWidth > 8u);
6802 const unsigned int newX = targetWidth - 8u;
6803
6804 ocean_assert(x > newX);
6805 const unsigned int offset = x - newX;
6806
6807 targetRow -= offset;
6808 interpolationLocations -= offset;
6809 interpolationFactorsRight -= offset;
6810
6811 x = newX;
6812
6813 // the for loop will stop after this iteration
6814 ocean_assert(!(x + 8u < targetWidth));
6815 }
6816
6817 // we load the left and the right pixels (for four resulting target pixels)
6818
6819 const float32x2_t pixel_0_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[0]);
6820 const float32x2_t pixel_1_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[1]);
6821 const float32x4_t pixel_01_f_32x4 = vcombine_f32(pixel_0_f_32x2, pixel_1_f_32x2);
6822
6823 const float32x2_t pixel_2_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[2]);
6824 const float32x2_t pixel_3_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[3]);
6825 const float32x4_t pixel_23_f_32x4 = vcombine_f32(pixel_2_f_32x2, pixel_3_f_32x2);
6826
6827 const float32x2_t pixel_4_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[4]);
6828 const float32x2_t pixel_5_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[5]);
6829 const float32x4_t pixel_45_f_32x4 = vcombine_f32(pixel_4_f_32x2, pixel_5_f_32x2);
6830
6831 const float32x2_t pixel_6_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[6]);
6832 const float32x2_t pixel_7_f_32x2 = vld1_f32(extendedSourceRow + interpolationLocations[7]);
6833 const float32x4_t pixel_67_f_32x4 = vcombine_f32(pixel_6_f_32x2, pixel_7_f_32x2);
6834
6835 const float32x4_t factorsRight_0123_f_32x4 = vld1q_f32(interpolationFactorsRight + 0);
6836 const float32x4_t factorsLeft_0123_f_32x4 = vsubq_f32(constant_1_f_32x4, factorsRight_0123_f_32x4);
6837 const float32x4x2_t factorsLeftRight_0123_f_32x4_2 = vzipq_f32(factorsLeft_0123_f_32x4, factorsRight_0123_f_32x4);
6838
6839 const float32x4_t factorsRight_4567_f_32x4 = vld1q_f32(interpolationFactorsRight + 4);
6840 const float32x4_t factorsLeft_4567_f_32x4 = vsubq_f32(constant_1_f_32x4, factorsRight_4567_f_32x4);
6841 const float32x4x2_t factorsLeftRight_4567_f_32x4_2 = vzipq_f32(factorsLeft_4567_f_32x4, factorsRight_4567_f_32x4);
6842
6843 const float32x4_t multiplied_01_f_32x4 = vmulq_f32(pixel_01_f_32x4, factorsLeftRight_0123_f_32x4_2.val[0]);
6844 const float32x4_t multiplied_23_f_32x4 = vmulq_f32(pixel_23_f_32x4, factorsLeftRight_0123_f_32x4_2.val[1]);
6845
6846 const float32x4_t multiplied_45_f_32x4 = vmulq_f32(pixel_45_f_32x4, factorsLeftRight_4567_f_32x4_2.val[0]);
6847 const float32x4_t multiplied_67_f_32x4 = vmulq_f32(pixel_67_f_32x4, factorsLeftRight_4567_f_32x4_2.val[1]);
6848
6849 const float32x2_t result_01_f_32x2 = vpadd_f32(vget_low_f32(multiplied_01_f_32x4), vget_high_f32(multiplied_01_f_32x4));
6850 const float32x2_t result_23_f_32x2 = vpadd_f32(vget_low_f32(multiplied_23_f_32x4), vget_high_f32(multiplied_23_f_32x4));
6851
6852 const float32x2_t result_45_f_32x2 = vpadd_f32(vget_low_f32(multiplied_45_f_32x4), vget_high_f32(multiplied_45_f_32x4));
6853 const float32x2_t result_67_f_32x2 = vpadd_f32(vget_low_f32(multiplied_67_f_32x4), vget_high_f32(multiplied_67_f_32x4));
6854
6855 const float32x4_t result_0123_f_32x4 = vcombine_f32(result_01_f_32x2, result_23_f_32x2);
6856 const float32x4_t result_4567_f_32x4 = vcombine_f32(result_45_f_32x2, result_67_f_32x2);
6857
6858 vst1q_f32(targetRow + 0, result_0123_f_32x4);
6859 vst1q_f32(targetRow + 4, result_4567_f_32x4);
6860
6861 targetRow += 8;
6862 interpolationLocations += 8;
6863 interpolationFactorsRight += 8;
6864 }
6865}
6866
6867template <>
6868inline void FrameInterpolatorBilinear::scaleSubset<float, float, 1u>(const float* source, float* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
6869{
6870 ocean_assert(source != nullptr && target != nullptr);
6871 ocean_assert(sourceWidth >= 1u && sourceHeight >= 1u);
6872 ocean_assert(targetWidth >= 1u && targetHeight >= 1u);
6873 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
6874
6875 ocean_assert(sourceWidth != targetWidth || sourceHeight != targetHeight);
6876
6877 const unsigned int sourceStrideElements = sourceWidth * 1u + sourcePaddingElements;
6878 const unsigned int targetStrideElements = targetWidth * 1u + targetPaddingElements;
6879
6880 using InterpolateRowVerticalFunction = void (*)(const float*, const float*, float*, const unsigned int, const float);
6881 using InterpolateRowHorizontalFunction = void (*)(const float*, float*, const unsigned int, const unsigned int, const unsigned int*, const float*);
6882
6883 InterpolateRowVerticalFunction interpolateRowVerticalFunction = interpolateRowVertical<float>;
6884 InterpolateRowHorizontalFunction interpolateRowHorizontalFunction = interpolateRowHorizontal<float, 1u>;
6885
6886 if (sourceWidth * 1u >= 16u)
6887 {
6888 interpolateRowVerticalFunction = interpolateRowVerticalNEON<float>;
6889 }
6890
6891 if (targetWidth >= 8u)
6892 {
6893 interpolateRowHorizontalFunction = interpolateRowHorizontalNEON<float, 1u>;
6894 }
6895
6896 target += targetStrideElements * firstTargetRow;
6897
6898 const float sourceX_T_targetX = float(sourceX_s_targetX);
6899 const float sourceY_T_targetY = float(sourceY_s_targetY);
6900
6901 // See the generic template function for a detailed documentation regarding interpolation factors.
6902
6903 Memory memoryIntermediateExtendedRow;
6904 Memory memoryHorizontalInterpolationLocations;
6905 Memory memoryHorizontalInterpolationFactorsRight;
6906
6907 if (sourceWidth != targetWidth)
6908 {
6909 // in case we are scaling the width of the frame, we use an intermediate buffer and pre-calculated interpolation locations and factors
6910
6911 memoryIntermediateExtendedRow = Memory::create<float>(sourceWidth + 1u); // one additional pixel
6912
6913 memoryHorizontalInterpolationLocations = Memory::create<unsigned int>(targetWidth); // one offset for each target pixel
6914
6915 memoryHorizontalInterpolationFactorsRight = Memory::create<float>(targetWidth); // one factors (right) for each target pixel
6916 }
6917
6918 if (memoryHorizontalInterpolationLocations)
6919 {
6920 ocean_assert(memoryHorizontalInterpolationFactorsRight);
6921
6922 if (targetWidth >= 4u)
6923 {
6924 const float32x4_t sourceX_T_targetX_f_32x4 = vdupq_n_f32(sourceX_T_targetX);
6925 const float32x4_t targetOffsetX_f_32x4 = vdupq_n_f32(sourceX_T_targetX * 0.5f - 0.5f);
6926
6927 // [0.0f, 0.0f, 0.0f, 0.0f]
6928 const float32x4_t constant_0_f_32x4 = vdupq_n_f32(0);
6929
6930 // [4.0f, 4.0f, 4.0f, 4.0f]
6931 const float32x4_t constant_4_f_32x4 = vdupq_n_f32(4.0f);
6932
6933 // we store 4 integers: [sourceWidth - 1, sourceWidth - 1, sourceWidth - 1, sourceWidth - 1]
6934 const uint32x4_t sourceWidth_1_u_32x4 = vdupq_n_u32(sourceWidth - 1u);
6935
6936 // [0.0f, 1.0f, 2.0f, 3.0f]
6937 const float f_0123[4] = {0.0f, 1.0f, 2.0f, 3.0f};
6938 float32x4_t x_0123_f_32x4 = vld1q_f32(f_0123);
6939
6940 // we pre-calculate the interpolation factors and pixel locations in horizontal direction
6941
6942 for (unsigned int x = 0u; x < targetWidth; x += 4u)
6943 {
6944 if (x + 4u > targetWidth)
6945 {
6946 // the last iteration will not fit into the output frame,
6947 // so we simply shift x left by some pixels (at most 3) and we will calculate some pixels again
6948
6949 ocean_assert(x >= 4u && targetWidth > 4u);
6950 const unsigned int newX = targetWidth - 4u;
6951
6952 ocean_assert(x > newX);
6953 const unsigned int offset = x - newX;
6954
6955 x = newX;
6956
6957 x_0123_f_32x4 = vsubq_f32(x_0123_f_32x4, vdupq_n_f32(float(offset)));
6958
6959 // the for loop will stop after this iteration
6960 ocean_assert(!(x + 4u < targetWidth));
6961 }
6962
6963 // we calculate the four source locations for our four target locations
6964 const float32x4_t sourceX_0123_f_32x4 = vmaxq_f32(constant_0_f_32x4, vaddq_f32(targetOffsetX_f_32x4, vmulq_f32(sourceX_T_targetX_f_32x4, x_0123_f_32x4)));
6965
6966 // now we determine the pixel/integer accurate source locations
6967 // left = min(floor(sourceX), sourceWidth - 1)
6968 uint32x4_t left_0123_u_32x4 = vminq_u32(vcvtq_u32_f32(sourceX_0123_f_32x4), sourceWidth_1_u_32x4); // no rounding here
6969
6970 // we store the offsets we have calculated
6971 vst1q_u32(memoryHorizontalInterpolationLocations.data<unsigned int>() + x, left_0123_u_32x4);
6972
6973 // factorRight = sourceX - float(left)
6974 const float32x4_t factorsRight_f_32x4 = vsubq_f32(sourceX_0123_f_32x4, vcvtq_f32_u32(left_0123_u_32x4));
6975
6976 vst1q_f32(memoryHorizontalInterpolationFactorsRight.data<float>() + x, factorsRight_f_32x4);
6977
6978 // [x + 0, x + 1, x + 2, x + 3] + [4, 4, 4, 4]
6979 x_0123_f_32x4 = vaddq_f32(x_0123_f_32x4, constant_4_f_32x4);
6980 }
6981 }
6982 else
6983 {
6984 const float targetOffsetX = sourceX_T_targetX * 0.5f - 0.5f;
6985
6986 // we pre-calculate the interpolation factors and pixel locations in horizontal direction
6987
6988 for (unsigned int x = 0u; x < targetWidth; ++x)
6989 {
6990 const float sourceX = max(0.0f, targetOffsetX + float(x) * sourceX_T_targetX);
6991
6992 const unsigned int left = min((unsigned int)sourceX, sourceWidth - 1u); // no rounding here
6993
6994 memoryHorizontalInterpolationLocations.data<unsigned int>()[x] = left;
6995
6996 const float factorRight = sourceX - float(left);
6997 ocean_assert(factorRight >= 0.0f && factorRight <= 1.0f);
6998
6999 memoryHorizontalInterpolationFactorsRight.data<float>()[x] = factorRight;
7000 }
7001 }
7002 }
7003
7004 const float targetOffsetY = sourceY_T_targetY * 0.5f - 0.5f;
7005
7006 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
7007 {
7008 const float sourceY = minmax<float>(0.0f, targetOffsetY + sourceY_T_targetY * float(y), float(sourceHeight) - 1.0f);
7009
7010 const unsigned int sourceRowTop = (unsigned int)sourceY; // we must not round here
7011 const float factorBottom = sourceY - float(sourceRowTop);
7012 ocean_assert(factorBottom >= 0.0f && factorBottom <= 1.0f);
7013
7014 const unsigned int sourceRowBottom = min(sourceRowTop + 1u, sourceHeight - 1u);
7015
7016 const float* const sourceTopRow = source + sourceStrideElements * sourceRowTop;
7017 const float* const sourceBottomRow = source + sourceStrideElements * sourceRowBottom;
7018
7019 float* targetRow = nullptr;
7020
7021 if (sourceHeight == targetHeight)
7022 {
7023 ocean_assert(sourceWidth != targetWidth);
7024 ocean_assert(memoryIntermediateExtendedRow);
7025
7026 // we do not need to interpolate two lines, thus we simply need to copy the row (as we need an additional pixel at the end)
7027 memcpy(memoryIntermediateExtendedRow.data<float>(), sourceTopRow, sourceWidth * sizeof(float));
7028 }
7029 else
7030 {
7031 // in case we do not scale the width of the frame, we can write the result to the target frame directly
7032 targetRow = memoryIntermediateExtendedRow.isNull() ? target : memoryIntermediateExtendedRow.data<float>();
7033
7034 ocean_assert(targetRow != nullptr);
7035 ocean_assert(interpolateRowVerticalFunction != nullptr);
7036 interpolateRowVerticalFunction(sourceTopRow, sourceBottomRow, targetRow, sourceWidth * 1u, factorBottom);
7037 }
7038
7039 if (memoryIntermediateExtendedRow) // sourceWidth != targetWidth
7040 {
7041 // we use an extended row (with one additional pixel at the end - equal to the last pixel)
7042 // so we have to copy the last pixel
7043 memoryIntermediateExtendedRow.data<float>()[sourceWidth] = memoryIntermediateExtendedRow.data<float>()[sourceWidth - 1u];
7044
7045 interpolateRowHorizontalFunction(memoryIntermediateExtendedRow.data<float>(), target, targetWidth, 1u, memoryHorizontalInterpolationLocations.data<unsigned int>(), memoryHorizontalInterpolationFactorsRight.data<float>());
7046 }
7047
7048 target += targetStrideElements;
7049 }
7050}
7051
7052#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
7053
7054template <typename T, typename TScale, unsigned int tChannels>
7055void FrameInterpolatorBilinear::scaleSubset(const T* source, T* target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
7056{
7057 static_assert((std::is_same<float, TScale>::value || std::is_same<double, TScale>::value), "Invalid TScale type");
7058
7059 ocean_assert(source != nullptr && target != nullptr);
7060 ocean_assert(sourceWidth != 0u && sourceHeight != 0u);
7061 ocean_assert_and_suppress_unused(targetWidth >= 1u && targetHeight >= 1u, targetHeight);
7062 ocean_assert(sourceX_s_targetX > 0.0 && sourceY_s_targetY > 0.0);
7063
7064 const unsigned int sourceStrideElements = sourceWidth * tChannels + sourcePaddingElements;
7065 const unsigned int targetStrideElements = targetWidth * tChannels + targetPaddingElements;
7066
7067 const TScale sourceX_T_targetX = TScale(sourceX_s_targetX);
7068 const TScale sourceY_T_targetY = TScale(sourceY_s_targetY);
7069
7070 /*
7071 * We determine the sub-pixel accurate source location for each target pixel as follows:
7072 *
7073 * Example with a downsampling by factor 4:
7074 * sourceRow with 12 pixels: | 0 1 2 3 4 5 6 7 8 9 A B |
7075 * targetRow with 3 pixels: | 0 1 2 |
7076 *
7077 * Thus, the source row can be separated into three blocks;
7078 * and we want to extract the color information from the center of the blocks:
7079 * sourceRow with 12 pixels: | 0 1 2 3 | 4 5 6 7 | 8 9 A B |
7080 * targetRow with 3 pixels: | 0 | 1 | 2 | (targetTSourceX = 4)
7081 *
7082 * Thus, we add 0.5 to each target coordinate before converting it to a source location;
7083 * and subtract 0.5 again afterwards:
7084 * sourceX = (targetX + 0.5) * targetTSourceX - 0.5
7085 *
7086 * e.g., (0 + 0.5) * 4 - 0.5 = 1.5
7087 * (1 + 0.5) * 4 - 0.5 = 5.5
7088 *
7089 *
7090 * Example with a downsampling by factor 3:
7091 * sourceRow with 9 pixels: | 0 1 2 3 4 5 6 7 8 |
7092 * targetRow with 3 pixels: | 0 1 2 |
7093 *
7094 * sourceRow with 9 pixels: | 0 1 2 | 3 4 5 | 6 7 8 |
7095 * targetRow with 3 pixels: | 0 | 1 | 2 | (targetTSourceX = 3)
7096 *
7097 * e.g., (0 + 0.5) * 3 - 0.5 = 1
7098 * (1 + 0.5) * 3 - 0.5 = 4
7099 *
7100 *
7101 * Example with a downsampling by factor 2:
7102 * sourceRow with 6 pixels: | 0 1 2 3 4 5 |
7103 * targetRow with 3 pixels: | 0 1 2 |
7104 *
7105 * sourceRow with 6 pixels: | 0 1 | 2 3 | 4 5 |
7106 * targetRow with 3 pixels: | 0 | 1 | 2 | (targetTSourceX = 2)
7107 *
7108 * e.g., (0 + 0.5) * 2 - 0.5 = 0.5
7109 * (1 + 0.5) * 2 - 0.5 = 2.5
7110 *
7111 *
7112 * we can simplify the calculation (as we have a constant term):
7113 * sourceX = (targetX * targetTSourceX) + (0.5 * targetTSourceX - 0.5)
7114 */
7115
7116 const TScale sourceX_T_targetXOffset = sourceX_T_targetX * TScale(0.5) - TScale(0.5);
7117 const TScale sourceY_T_targetYOffset = sourceY_T_targetY * TScale(0.5) - TScale(0.5);
7118
7119 const TScale sourceWidth_1 = TScale(sourceWidth - 1u);
7120 const TScale sourceHeight_1 = TScale(sourceHeight - 1u);
7121
7122 target += targetStrideElements * firstTargetRow;
7123
7124 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
7125 {
7126 const TScale sy = minmax(TScale(0), sourceY_T_targetYOffset + sourceY_T_targetY * TScale(y), sourceHeight_1);
7127 ocean_assert(sy >= TScale(0) && sy < TScale(sourceHeight));
7128
7129 const unsigned int sTop = (unsigned int)sy;
7130 ocean_assert(sy >= TScale(sTop));
7131
7132 const TScale factorBottom = sy - TScale(sTop);
7133 ocean_assert(factorBottom >= TScale(0) && factorBottom <= TScale(1));
7134
7135 const TScale factorTop = TScale(1) - factorBottom;
7136 ocean_assert(factorTop >= TScale(0) && factorTop <= TScale(1));
7137
7138 const T* const sourceTop = source + sTop * sourceStrideElements;
7139 const T* const sourceBottom = (sTop + 1u < sourceHeight) ? sourceTop + sourceStrideElements : sourceTop;
7140
7141 for (unsigned int x = 0; x < targetWidth; ++x)
7142 {
7143 const TScale sx = minmax(TScale(0), sourceX_T_targetXOffset + sourceX_T_targetX * TScale(x), sourceWidth_1);
7144 ocean_assert(sx >= TScale(0) && sx < TScale(sourceWidth));
7145
7146 const unsigned int sLeft = (unsigned int)sx;
7147 ocean_assert(sx >= TScale(sLeft));
7148
7149 const TScale factorRight = sx - TScale(sLeft);
7150 ocean_assert(factorRight >= TScale(0) && factorRight <= TScale(1));
7151
7152 const TScale factorLeft = TScale(1) - factorRight;
7153 ocean_assert(factorLeft >= TScale(0) && factorLeft <= TScale(1));
7154
7155 const unsigned int sourceRightOffset = sLeft + 1u < sourceWidth ? tChannels : 0u;
7156
7157 const T* const sourceTopLeft = sourceTop + sLeft * tChannels;
7158 const T* const sourceBottomLeft = sourceBottom + sLeft * tChannels;
7159
7160 const TScale factorTopLeft = factorTop * factorLeft;
7161 const TScale factorTopRight = factorTop * factorRight;
7162 const TScale factorBottomLeft = factorBottom * factorLeft;
7163 const TScale factorBottomRight = factorBottom * factorRight;
7164
7165 for (unsigned int n = 0u; n < tChannels; ++n)
7166 {
7167 target[n] = T(TScale(sourceTopLeft[n]) * factorTopLeft + TScale(sourceTopLeft[sourceRightOffset + n]) * factorTopRight
7168 + TScale(sourceBottomLeft[n]) * factorBottomLeft + TScale(sourceBottomLeft[sourceRightOffset + n]) * factorBottomRight);
7169 }
7170
7171 target += tChannels;
7172 }
7173
7174 target += targetPaddingElements;
7175 }
7176}
7177
7178template <unsigned int tChannels>
7179void FrameInterpolatorBilinear::rotate8BitPerChannelSubset(const uint8_t* source, uint8_t* target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const uint8_t* borderColor, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
7180{
7181 static_assert(tChannels != 0u, "Invalid channel number!");
7182
7183 ocean_assert(firstTargetRow + numberTargetRows <= height);
7184
7185 using PixelType = typename DataType<uint8_t, tChannels>::Type;
7186
7187 const unsigned int targetStrideElements = width * tChannels + targetPaddingElements;
7188
7189 uint8_t zeroColor[tChannels] = {uint8_t(0)};
7190 const PixelType bColor = borderColor ? *(const PixelType*)borderColor : *(const PixelType*)zeroColor;
7191
7192 const SquareMatrix3 rotationMatrix3(Rotation(0, 0, 1, angle));
7193 const SquareMatrix2 rotationMatrix2(rotationMatrix3(0, 0), rotationMatrix3(1, 0), rotationMatrix3(0, 1), rotationMatrix3(1, 1));
7194
7195 const Scalar width_1 = Scalar(width - 1u);
7196 const Scalar height_1 = Scalar(height - 1u);
7197 const Vector2 anchorPosition(horizontalAnchorPosition, verticalAnchorPosition);
7198
7199 for (unsigned int y = firstTargetRow; y < firstTargetRow + numberTargetRows; ++y)
7200 {
7201 PixelType* targetPixel = (PixelType*)(target + y * targetStrideElements);
7202
7203 const Scalar floatY = Scalar(y);
7204
7205 for (unsigned int x = 0; x < width; ++x)
7206 {
7207 const Vector2 sourceLocation(anchorPosition + rotationMatrix2 * (Vector2(Scalar(x), floatY) - anchorPosition));
7208
7209 if (sourceLocation.x() >= 0 && sourceLocation.y() >= 0 && sourceLocation.x() <= width_1 && sourceLocation.y() <= height_1)
7210 {
7211 interpolatePixel8BitPerChannel<tChannels, PC_TOP_LEFT>(source, width, height, sourcePaddingElements, sourceLocation, (uint8_t*)(targetPixel));
7212 }
7213 else
7214 {
7215 *targetPixel = bColor;
7216 }
7217
7218 ++targetPixel;
7219 }
7220 }
7221}
7222
7223} // namespace CV
7224
7225} // namespace Ocean
7226
7227#endif // META_OCEAN_CV_FRAME_INTERPOLATOR_BILINEAR_H
This class implements the abstract base class for all AnyCamera objects.
Definition AnyCamera.h:131
virtual VectorT3< T > vector(const VectorT2< T > &distortedImagePoint, const bool makeUnitVector=true) const =0
Returns a vector starting at the camera's center and intersecting a given 2D point in the image.
virtual unsigned int width() const =0
Returns the width of the camera image.
virtual unsigned int height() const =0
Returns the height of the camera image.
virtual VectorT2< T > projectToImageIF(const VectorT3< T > &objectPoint) const =0
Projects a 3D object point into the camera frame.
virtual bool isValid() const =0
Returns whether this camera is valid.
Helper class allowing to determine the offset that is necessary to access the alpha channel.
Definition FrameBlender.h:60
static constexpr unsigned int data()
Returns the offset that is applied to access the first data channel.
Definition FrameBlender.h:1171
The following comfort class provides comfortable functions simplifying prototyping applications but a...
Definition FrameInterpolatorBilinear.h:65
static bool homographies(const Frame &input, Frame &output, const SquareMatrix3 homographies[4], const Vector2 &outputQuadrantCenter, const uint8_t *borderColor=nullptr, Worker *worker=nullptr, const PixelPositionI &outputOrigin=PixelPositionI(0, 0))
Transforms a given input frame into an output frame (with arbitrary frame dimension) by application o...
static bool zoom(const Frame &source, Frame &target, const Scalar zoomFactor, Worker *worker=nullptr)
Zooms into a given input frame (or zooms out) and stores the zoomed image content in an output frame.
static bool homographyMask(const Frame &input, Frame &output, Frame &outputMask, const SquareMatrix3 &input_H_output, Worker *worker=nullptr, const uint8_t maskValue=0xFF, const PixelPositionI &outputOrigin=PixelPositionI(0, 0))
Transforms a given input frame into an output frame (with arbitrary frame dimension) by application o...
static bool lookupMask(const Frame &input, Frame &output, Frame &outputMask, const LookupTable &input_LT_output, const bool offset, Worker *worker=nullptr, const uint8_t maskValue=0xFF)
Transforms a given input frame into an output frame by application of an interpolation lookup table a...
static bool homographyWithCameraMask(const AnyCamera &inputCamera, const AnyCamera &outputCamera, const Frame &input, Frame &output, Frame &outputMask, const SquareMatrix3 &homography, Worker *worker=nullptr, const uint8_t maskValue=0xFFu)
Transforms a given input frame into an output frame by application of a homography.
static bool rotate(const Frame &source, Frame &target, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, Worker *worker=nullptr, const uint8_t *borderColor=nullptr)
Rotates a given frame by a bilinear interpolation.
static bool resize(const Frame &source, Frame &target, Worker *worker=nullptr)
Resizes/rescales a given frame by application of a bilinear interpolation.
static bool homographiesMask(const Frame &input, Frame &output, Frame &outputMask, const SquareMatrix3 *homographies, const Vector2 &outputQuadrantCenter, Worker *worker=nullptr, const uint8_t maskValue=0xFF, const PixelPositionI &outputOrigin=PixelPositionI(0, 0))
Transforms a given input frame into an output frame (with arbitrary frame dimension) by application o...
static bool interpolatePixel(const TSource *frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2< TScalar > &position, TTarget *result, const TIntermediate &resultBias=TIntermediate(0))
Determines the interpolated pixel values for a given pixel position in a frame with arbitrary data ty...
Definition FrameInterpolatorBilinear.h:1551
static bool resampleCameraImage(const Frame &sourceFrame, const AnyCamera &sourceCamera, const SquareMatrix3 &source_R_target, const AnyCamera &targetCamera, Frame &targetFrame, LookupCorner2< Vector2 > *source_OLT_target=nullptr, Worker *worker=nullptr, const unsigned int binSizeInPixel=8u, const void *borderColor=nullptr)
Re-samples a camera image which has been captured with a camera profile as if the image would have be...
static bool homographyWithCamera(const PinholeCamera &inputCamera, const PinholeCamera &outputCamera, const Frame &input, Frame &output, const SquareMatrix3 &homography, const bool useDistortionParameters, const uint8_t *borderColor=nullptr, Worker *worker=nullptr)
Transforms a given input frame into an output frame by application of a homography.
static bool lookup(const Frame &input, Frame &output, const LookupTable &input_LT_output, const bool offset, const void *borderColor, Worker *worker=nullptr)
Transforms a given input frame into an output frame by application of an interpolation lookup table.
static bool affine(const Frame &source, Frame &target, const SquareMatrix3 &source_A_target, const uint8_t *borderColor=nullptr, Worker *worker=nullptr, const PixelPositionI &targetOrigin=PixelPositionI(0, 0))
Applies an affine transformation to an image.
static bool interpolatePixel8BitPerChannel(const uint8_t *frame, const unsigned int channels, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const PixelCenter pixelCenter, const VectorT2< TScalar > &position, uint8_t *result)
Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame.
Definition FrameInterpolatorBilinear.h:1458
static bool homography(const Frame &input, Frame &output, const SquareMatrix3 &input_H_output, const void *borderColor=nullptr, Worker *worker=nullptr, const PixelPositionI &outputOrigin=PixelPositionI(0, 0))
Transforms a given input frame into an output frame (with arbitrary frame dimension) by application o...
This class implements highly optimized interpolation functions with fixed properties.
Definition FrameInterpolatorBilinear.h:346
static void resize400x400To256x256_8BitPerChannel(const uint8_t *const source, uint8_t *const target, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
Resizes a given FORMAT_Y8 frame with resolution 400x400 to a FORMAT_Y8 frame with resolution 256x256 ...
static void resize400x400To224x224_8BitPerChannel(const uint8_t *const source, uint8_t *const target, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
Resizes a given FORMAT_Y8 frame with resolution 400x400 to a FORMAT_Y8 frame with resolution 224x224 ...
This class implements bilinear frame interpolator functions.
Definition FrameInterpolatorBilinear.h:47
static OCEAN_FORCE_INLINE void interpolate4Pixels8BitPerChannelNEON(const uint8_t *source, const unsigned int offsetsTopLeftElements[4], const unsigned int offsetsTopRightElements[4], const unsigned int offsetsBottomLeftElements[4], const unsigned int offsetsBottomRightElements[4], const unsigned int validPixels[4], const typename DataType< uint8_t, tChannels >::Type &borderColor, const uint32x4_t &m128_factorsRight, const uint32x4_t &m128_factorsBottom, typename DataType< uint8_t, tChannels >::Type *targetPositionPixels)
Interpolates 4 independent pixels concurrently based on already known locations (top-left,...
Definition FrameInterpolatorBilinear.h:4432
static void homography(const T *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 &input_H_output, const T *borderColor, T *output, const PixelPositionI &outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker *worker=nullptr)
Transforms an input frame with (almost) arbitrary pixel format into an output frame by application of ...
Definition FrameInterpolatorBilinear.h:1769
static void resampleCameraImage(const T *sourceFrame, const AnyCamera &sourceCamera, const SquareMatrix3 &source_R_target, const AnyCamera &targetCamera, T *targetFrame, const unsigned int sourceFramePaddingElements, const unsigned int targetFramePaddingElements, LookupCorner2< Vector2 > *source_OLT_target=nullptr, Worker *worker=nullptr, const unsigned int binSizeInPixel=8u, const T *borderColor=nullptr)
Re-samples a camera image which has been captured with a camera profile as if the image would have be...
Definition FrameInterpolatorBilinear.h:1929
static void interpolateRowVerticalNEON(const T *sourceRowTop, const T *sourceRowBottom, T *targetRow, const unsigned int elements, const float factorBottom)
Applies a (vertical) linear interpolation between two rows with arbitrary data types.
static void homographyWithCamera8BitPerChannel(const PinholeCamera &inputCamera, const PinholeCamera &outputCamera, const uint8_t *input, const SquareMatrix3 &homography, const bool useDistortionParameters, const uint8_t *borderColor, uint8_t *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker *worker=nullptr)
Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
Definition FrameInterpolatorBilinear.h:1835
static void interpolatePixel8BitPerChannel(const uint8_t *frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2< TScalar > &position, uint8_t *result)
Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame.
Definition FrameInterpolatorBilinear.h:1996
static void affine8BitPerChannelSSESubset(const uint8_t *source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3 *source_A_target, const uint8_t *borderColor, uint8_t *target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
Subset function to apply an affine transform to an N-channel, 8-bit unsigned image (using SSE).
Definition FrameInterpolatorBilinear.h:2500
static Scalar patchIntensitySum1Channel(const uint32_t *linedIntegralFrame, const unsigned int frameWidth, const unsigned int frameHeight, const unsigned int lineIntegralFramePaddingElements, const Vector2 &center, const CV::PixelCenter pixelCenter, const unsigned int patchWidth, const unsigned int patchHeight)
Interpolate the sum of intensity values of an image patch in a frame, while the frame is provided as ...
static void homographyWithCameraMask8BitPerChannel(const PinholeCamera &inputCamera, const PinholeCamera &outputCamera, const uint8_t *input, const unsigned int inputPaddingElements, const SquareMatrix3 &homography, uint8_t *output, uint8_t *outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker *worker=nullptr, const uint8_t maskValue=0xFF)
Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
Definition FrameInterpolatorBilinear.h:1852
static void homographiesMask8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *homographies, uint8_t *output, uint8_t *outputMask, const uint8_t maskValue, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:4654
static void homographiesMask8BitPerChannel(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], uint8_t *output, uint8_t *outputMask, const Vector2 &outputQuadrantCenter, const PixelPositionI &outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker *worker=nullptr, const uint8_t maskValue=0xFF)
Transforms a given 8 bit per channel input frame into an output frame by application of four homograp...
Definition FrameInterpolatorBilinear.h:1822
static void interpolateRowHorizontal8BitPerChannel7BitPrecision(const uint8_t *extendedSourceRow, uint8_t *targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int *interpolationLocations, const uint8_t *interpolationFactors)
Applies a (horizontal) linear interpolation for one row with 8 bit per channel.
static void homography8BitPerChannelNEONSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *input_H_output, const uint8_t *borderColor, uint8_t *output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:3613
static OCEAN_FORCE_INLINE void interpolate4Pixels4Channel8BitPerChannelNEON(const uint8x16_t &topLeftPixels_u8x16, const uint8x16_t &topRightPixels_u8x16, const uint8x16_t &bottomLeftPixels_u8x16, const uint8x16_t &bottomRightPixels_u8x16, const uint32x4_t &m128_factorsRight, const uint32x4_t &m128_factorsBottom, typename DataType< uint8_t, 4u >::Type *targetPositionPixels, const bool useOptimizedNEONFactorReplication=false)
Interpolates 4 independent 4-channel pixels using widening byte multiply.
Definition FrameInterpolatorBilinear.h:4320
static void interpolateRowVertical8BitPerChannel7BitPrecisionNEON(const uint8_t *sourceRowTop, const uint8_t *sourceRowBottom, uint8_t *targetRow, const unsigned int elements, const unsigned int factorBottom)
Applies a (vertical) linear interpolation between two rows with 8 bit per channel.
static void homographySubset(const T *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *input_H_output, const T *borderColor, T *output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms a frame with (almost) arbitrary pixel format using the given homography.
Definition FrameInterpolatorBilinear.h:2416
static void homographyMask8BitPerChannel(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 &input_H_output, uint8_t *output, uint8_t *outputMask, const PixelPositionI &outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const uint8_t maskValue, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker *worker=nullptr)
Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
Definition FrameInterpolatorBilinear.h:1806
static OCEAN_FORCE_INLINE void interpolate8Pixels1Channel8BitNEON(const uint8x8_t &topLeft_u_8x8, const uint8x8_t &topRight_u_8x8, const uint8x8_t &bottomLeft_u_8x8, const uint8x8_t &bottomRight_u_8x8, const uint8x16_t &factorsRight_factorsBottom_128_u_8x16, uint8_t *targetPositionPixels)
Interpolates 8 independent pixels concurrently of a 1 channel frame, the source pixel locations must ...
Definition FrameInterpolatorBilinear.h:3993
static void homographies8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *homographies, const uint8_t *borderColor, uint8_t *output, const Scalar outputQuadrantCenterX, const Scalar outputQuadrantCenterY, const int outputOriginX, const int outputOriginY, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homographies.
Definition FrameInterpolatorBilinear.h:4498
static void homographyWithCamera8BitPerChannelSubset(const PinholeCamera *inputCamera, const PinholeCamera *outputCamera, const PinholeCamera::DistortionLookup *outputCameraDistortionLookup, const uint8_t *input, const SquareMatrix3 *normalizedHomography, const bool useDistortionParameters, const uint8_t *borderColor, uint8_t *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:4734
static void affine8BitPerChannel(const uint8_t *source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3 &source_A_target, const uint8_t *borderColor, uint8_t *target, const PixelPositionI &targetOrigin, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Applies an affine transform to an N-channel, 8-bit frame. The target frame must have the same pixel form...
Definition FrameInterpolatorBilinear.h:1693
static void lookup(const T *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable &input_LT_output, const bool offset, const T *borderColor, T *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker *worker=nullptr, const bool useOptimizedNEON=false, const bool useOptimizedBilinearValuesAndFactorCalculation=false, const bool useOptimizedNEONFactorReplication=false)
Transforms a given input frame into an output frame by application of an interpolation lookup table.
Definition FrameInterpolatorBilinear.h:1869
static void homographyWithCameraMask8BitPerChannelSubset(const PinholeCamera *inputCamera, const PinholeCamera *outputCamera, const PinholeCamera::DistortionLookup *outputCameraDistortionLookup, const uint8_t *input, const unsigned int inputPaddingElements, const SquareMatrix3 *normalizedHomography, uint8_t *output, uint8_t *outputMask, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const uint8_t maskValue, const unsigned int firstRow, const unsigned int numberRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:4780
static void affine8BitPerChannelNEONSubset(const uint8_t *source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3 *source_A_target, const uint8_t *borderColor, uint8_t *target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
Subset function to apply an affine transform to an N-channel, 8-bit unsigned image (using NEON).
Definition FrameInterpolatorBilinear.h:3363
static void interpolateRowHorizontalNEON(const T *extendedSourceRow, T *targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int *interpolationLocations, const float *interpolationFactorsRight)
Applies a (horizontal) linear interpolation for one row with arbitrary data type.
static void scale8BitPerChannelSubset7BitPrecisionNEON(const uint8_t *source, uint8_t *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int channels, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
Rescales a subset of a given frame with 8 bit per channel by a bilinear interpolation.
Definition FrameInterpolatorBilinear.h:5630
static OCEAN_FORCE_INLINE __m128i interpolate4Pixels8BitPerChannelSSE(const __m128i &m128_sourcesTopLeft, const __m128i &m128_sourcesTopRight, const __m128i &m128_sourcesBottomLeft, const __m128i &m128_sourcesBottomRight, const __m128i &m128_factorsTopLeft, const __m128i &m128_factorsTopRight, const __m128i &m128_factorsBottomLeft, const __m128i &m128_factorsBottomRight)
Interpolates 4 independent pixels concurrently based on already known locations (top-left,...
static void interpolateRowHorizontal(const T *extendedSourceRow, T *targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int *interpolationLocations, const float *interpolationFactorsRight)
Applies a (horizontal) linear interpolation for one row with arbitrary data type.
Definition FrameInterpolatorBilinear.h:5595
static void rotate8BitPerChannelSubset(const uint8_t *source, uint8_t *target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const uint8_t *borderColor, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
Rotates a subset of a given frame by a bilinear interpolation.
Definition FrameInterpolatorBilinear.h:7179
static void lookupMask8BitPerChannel(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable &input_LT_output, const bool offset, uint8_t *output, uint8_t *outputMask, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, Worker *worker=nullptr, const uint8_t maskValue=0xFF)
Transforms a given input frame into an output frame by application of an interpolation lookup table.
Definition FrameInterpolatorBilinear.h:1916
static OCEAN_FORCE_INLINE void interpolate4Pixels8BitPerChannelSSE(const uint8_t *source, const unsigned int offsetsTopLeft[4], const unsigned int offsetsTopRight[4], const unsigned int offsetsBottomLeft[4], const unsigned int offsetsBottomRight[4], const unsigned int validPixels[4], const typename DataType< uint8_t, tChannels >::Type &borderColor, const __m128i &m128_factorsRight, const __m128i &m128_factorsBottom, typename DataType< uint8_t, tChannels >::Type *targetPositionPixels)
Interpolates 4 independent pixels concurrently based on already known locations (top-left,...
Definition FrameInterpolatorBilinear.h:3295
static void homographies8BitPerChannel(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 homographies[4], const uint8_t *borderColor, uint8_t *output, const Vector2 &outputQuadrantCenter, const PixelPositionI &outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker *worker=nullptr)
Transforms a given 8 bit per channel input frame into an output frame by application of four homograp...
Definition FrameInterpolatorBilinear.h:1793
static void lookup8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable *input_LT_output, const bool offset, const uint8_t *borderColor, uint8_t *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Transforms a subset of a given input frame with uint8_t as element type into an output frame by appli...
Definition FrameInterpolatorBilinear.h:4828
static void resize(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Resizes a given frame with (almost) arbitrary data type (e.g., float, double, int) by using a bilinea...
Definition FrameInterpolatorBilinear.h:1644
static bool coversHomographyInputFrame(const unsigned int inputWidth, const unsigned int inputHeight, const unsigned int outputWidth, const unsigned int outputHeight, const SquareMatrix3 &input_H_output, const int outputOriginX=0, const int outputOriginY=0)
Checks whether the application of a given homography for a specified input frame and output frame cov...
static void scale(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Rescales a given frame with arbitrary data type (e.g., float, double, int) by using a bilinear interp...
Definition FrameInterpolatorBilinear.h:1657
static void lookupSubset(const T *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable *input_LT_output, const bool offset, const T *borderColor, T *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Transforms a subset of a given input frame with arbitrary element type into an output frame by applic...
Definition FrameInterpolatorBilinear.h:4882
static void scale8BitPerChannelSubset(const uint8_t *source, uint8_t *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
Resizes a subset of a given frame with 8 bit per channel by a bilinear interpolation.
Definition FrameInterpolatorBilinear.h:5454
static void rotate8BitPerChannel(const uint8_t *source, uint8_t *target, const unsigned int width, const unsigned int height, const Scalar horizontalAnchorPosition, const Scalar verticalAnchorPosition, const Scalar angle, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr, const uint8_t *borderColor=nullptr)
Rotates a given frame by a bilinear interpolation.
Definition FrameInterpolatorBilinear.h:1978
static void interpolateRowVertical(const T *sourceRowTop, const T *sourceRowBottom, T *targetRow, const unsigned int elements, const float factorBottom)
Applies a (vertical) linear interpolation between two rows with arbitrary data types.
Definition FrameInterpolatorBilinear.h:5575
static void homography8BitPerChannel(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 &input_H_output, const uint8_t *borderColor, uint8_t *output, const PixelPositionI &outputOrigin, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, Worker *worker=nullptr)
Transforms a given 8 bit per channel input frame into an output frame by application of a homography.
Definition FrameInterpolatorBilinear.h:1731
static void interpolate1PixelFullAlphaBorder8BitPerChannel(const uint8_t *frame, const unsigned int width, const unsigned int height, const Vector2 &position, uint8_t *result, const unsigned int framePaddingElements)
Determines the interpolated pixel values for a given pixel position in an 8 bit per channel frame wit...
Definition FrameInterpolatorBilinear.h:2175
static void lookupMask8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable *input_LT_output, const bool offset, uint8_t *output, uint8_t *outputMask, const uint8_t maskValue, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Transforms a given input frame into an output frame by application of an interpolation lookup table.
Definition FrameInterpolatorBilinear.h:5357
static void affine8BitPerChannelSubset(const uint8_t *source, const unsigned int sourceWidth, const unsigned int sourceHeight, const SquareMatrix3 *source_A_target, const uint8_t *borderColor, uint8_t *target, const unsigned int targetWidth, const unsigned int targetHeight, const unsigned int firstTargetRow, const unsigned int numberTargetRows, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements)
Subset function to apply an affine transform to an N-channel, 8-bit unsigned image.
Definition FrameInterpolatorBilinear.h:2264
static void homography8BitPerChannelSSESubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *input_H_output, const uint8_t *borderColor, uint8_t *output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:2685
static void interpolateRowHorizontal8BitPerChannel7BitPrecisionNEON(const uint8_t *extendedSourceRow, uint8_t *targetRow, const unsigned int targetWidth, const unsigned int channels, const unsigned int *interpolationLocations, const uint8_t *interpolationFactors)
Applies a (horizontal) linear interpolation for one row with 8 bit per channel.
static void scale8BitPerChannel(const uint8_t *source, uint8_t *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, Worker *worker=nullptr)
Rescales a given frame with 8 bit per data channel by using a bilinear interpolation with user-define...
Definition FrameInterpolatorBilinear.h:5410
static void lookup8BitPerChannelSubsetNEON(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const LookupTable *input_LT_output, const bool offset, const uint8_t *borderColor, uint8_t *output, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstRow, const unsigned int numberRows, const bool useOptimizedNEON=false, const bool useOptimizedBilinearValuesAndFactorCalculation=false, const bool useOptimizedNEONFactorReplication=false)
Transforms a subset of a given input frame into an output frame by application of an interpolation lo...
Definition FrameInterpolatorBilinear.h:5139
static void homographyMask8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *input_H_output, uint8_t *output, uint8_t *outputMask, const uint8_t maskValue, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int outputMaskPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:4579
static void scaleSubset(const T *source, T *target, const unsigned int sourceWidth, const unsigned int sourceHeight, const unsigned int targetWidth, const unsigned int targetHeight, const double sourceX_s_targetX, const double sourceY_s_targetY, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstTargetRow, const unsigned int numberTargetRows)
Resizes a subset of a given frame with arbitrary data type by a bilinear interpolation.
Definition FrameInterpolatorBilinear.h:7055
static void homography8BitPerChannelSubset(const uint8_t *input, const unsigned int inputWidth, const unsigned int inputHeight, const SquareMatrix3 *input_H_output, const uint8_t *borderColor, uint8_t *output, const unsigned int outputWidth, const unsigned int outputHeight, const unsigned int inputPaddingElements, const unsigned int outputPaddingElements, const unsigned int firstOutputRow, const unsigned int numberOutputRows)
Transforms an 8 bit per channel frame using the given homography.
Definition FrameInterpolatorBilinear.h:2339
static void interpolatePixel(const TSource *frame, const unsigned int width, const unsigned int height, const unsigned int framePaddingElements, const VectorT2< TScalar > &position, TTarget *result, const TIntermediate &resultBias=TIntermediate(0))
Determines the interpolated pixel values for a given pixel position in a frame with arbitrary data ty...
Definition FrameInterpolatorBilinear.h:2089
static constexpr uint8x8_t create_uint8x8(const uint8_t v0, const uint8_t v1, const uint8_t v2, const uint8_t v3, const uint8_t v4, const uint8_t v5, const uint8_t v6, const uint8_t v7)
Creates a uint8x8_t vector from 8 individual uint8_t values.
Definition NEON.h:591
This class implements a 2D pixel position with pixel precision.
Definition PixelPosition.h:63
T y() const
Returns the vertical coordinate position of this object.
Definition PixelPosition.h:468
T x() const
Returns the horizontal coordinate position of this object.
Definition PixelPosition.h:456
static __m128i set128i(const unsigned long long high64, const unsigned long long low64)
Sets a 128i value by two 64 bit values.
Definition SSE.h:3875
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition Caller.h:2877
Template class allowing to define an array of data types.
Definition DataType.h:27
This class implements Ocean's image class.
Definition Frame.h:1879
void setRelativeTimestamp(const Timestamp &relative)
Sets the relative timestamp of this frame.
Definition Frame.h:4320
bool isValid() const
Returns whether this frame is valid.
Definition Frame.h:4615
void setTimestamp(const Timestamp &timestamp)
Sets the timestamp of this frame.
Definition Frame.h:4315
const Timestamp & timestamp() const
Returns the timestamp of this frame.
Definition Frame.h:4305
const Timestamp & relativeTimestamp() const
Returns the relative timestamp of this frame.
Definition Frame.h:4310
Definition of a frame type composed by the frame dimension, pixel format and pixel origin.
Definition Frame.h:30
size_t sizeY() const
Returns the vertical dimension of this lookup object.
Definition Lookup2.h:960
size_t sizeX() const
Returns the horizontal dimension of this lookup object.
Definition Lookup2.h:954
size_t binsY() const
Returns the number of vertical bins of this lookup object.
Definition Lookup2.h:972
size_t binsX() const
Returns the number of horizontal bins of this lookup object.
Definition Lookup2.h:966
This class implements a 2D lookup object with values at the bins' corners defining the individual loo...
Definition Lookup2.h:636
Vector2 binTopLeftCornerPosition(const size_t binX, const size_t binY) const
Returns the corner position (the top left corner) of a specific bin in relation to the dimension of t...
Definition Lookup2.h:1799
void setBinTopLeftCornerValue(const size_t binX, const size_t binY, const T &value)
Sets the value of one specific lookup bin's top left corner.
Definition Lookup2.h:2215
void bilinearValues(const size_t y, TTarget *values) const
Applies a lookup for an entire row in this lookup object.
Definition Lookup2.h:1877
This class implements an object able to allocate memory.
Definition base/Memory.h:22
bool isNull() const
Returns whether this object does not hold any memory.
Definition base/Memory.h:401
void * data()
Returns the pointer to the writable memory which is allocated by this object.
Definition base/Memory.h:303
This class provides basic numeric functionalities.
Definition Numeric.h:57
static constexpr T eps()
Returns a small epsilon.
static T floor(const T value)
Returns the largest integer value that is not greater than the given value.
Definition Numeric.h:2035
static constexpr bool isEqualEps(const T value)
Returns whether a value is smaller than or equal to a small epsilon.
Definition Numeric.h:2096
static constexpr bool isNotEqualEps(const T value)
Returns whether a value is not smaller than or equal to a small epsilon.
Definition Numeric.h:2246
unsigned int width() const
Returns the width of the camera image.
Definition PinholeCamera.h:1452
const SquareMatrixT3< T > & invertedIntrinsic() const
Returns the inverted intrinsic camera matrix.
Definition PinholeCamera.h:1333
const SquareMatrixT3< T > & intrinsic() const
Returns the intrinsic camera matrix.
Definition PinholeCamera.h:1327
unsigned int height() const
Returns the height of the camera image.
Definition PinholeCamera.h:1458
VectorT2< T > normalizedImagePoint2imagePoint(const VectorT2< T > &normalizedImagePoint, const bool distortImagePoint) const
Calculates the image point corresponding to a given normalized image point.
Definition PinholeCamera.h:1792
This class implements a 2x2 square matrix.
Definition SquareMatrix2.h:73
bool isNull() const
Returns whether this matrix is a zero matrix.
Definition SquareMatrix3.h:1334
const T * data() const
Returns a pointer to the internal values.
Definition SquareMatrix3.h:1047
bool isOrthonormal(const T epsilon=NumericT< T >::eps()) const
Returns whether this matrix is an orthonormal matrix.
Definition SquareMatrix3.h:1366
This class implements tests for the NEON-accelerated bilinear interpolation functions.
Definition TestFrameInterpolatorBilinearNEON.h:27
const T & x() const noexcept
Returns the x value.
Definition Vector2.h:710
const T & y() const noexcept
Returns the y value.
Definition Vector2.h:722
bool isEqual(const VectorT2< T > &vector, const T eps) const
Returns whether two vectors are equal up to a specified epsilon.
Definition Vector2.h:758
const T & y() const noexcept
Returns the y value.
Definition Vector3.h:824
const T & x() const noexcept
Returns the x value.
Definition Vector3.h:812
const T & z() const noexcept
Returns the z value.
Definition Vector3.h:836
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
T minmax(const T &lowerBoundary, const T &value, const T &upperBoundary)
This function fits a given parameter into a specified value range.
Definition base/Utilities.h:973
PixelCenter
Definition of individual centers of pixels.
Definition CV.h:117
@ PC_TOP_LEFT
The center of a pixel is in the upper-left corner of each pixel's square.
Definition CV.h:133
@ PC_CENTER
The center of a pixel is located in the center of each pixel's square (with an offset of 0....
Definition CV.h:150
float Scalar
Definition of a scalar type.
Definition Math.h:129
SquareMatrixT3< Scalar > SquareMatrix3
Definition of the SquareMatrix3 object, depending on the OCEAN_MATH_USE_SINGLE_PRECISION either with ...
Definition SquareMatrix3.h:43
VectorT3< Scalar > Vector3
Definition of a 3D vector.
Definition Vector3.h:29
VectorT2< Scalar > Vector2
Definition of a 2D vector.
Definition Vector2.h:28
RotationT< Scalar > Rotation
Definition of the Rotation object, depending on the OCEAN_MATH_USE_SINGLE_PRECISION flag either with ...
Definition Rotation.h:32
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
Default definition of a type with tBytes bytes.
Definition DataType.h:32
float Type
The 32 bit floating point data type for any data type T but 'double'.
Definition DataType.h:373