Ocean
FrameFilterSeparable.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
9 #define META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
10 
11 #include "ocean/cv/CV.h"
12 #include "ocean/cv/NEON.h"
13 #include "ocean/cv/SSE.h"
14 
15 #include "ocean/base/Frame.h"
16 #include "ocean/base/Memory.h"
17 #include "ocean/base/Processor.h"
18 #include "ocean/base/Worker.h"
19 
20 #include "ocean/math/Numeric.h"
21 
22 namespace Ocean
23 {
24 
25 namespace CV
26 {
27 
28 /**
29  * This class implements separable filters.
30  * @ingroup cv
31  */
32 class OCEAN_CV_EXPORT FrameFilterSeparable
33 {
34  public:
35 
36  /**
37  * This class holds re-usable memory for the filtering process.
38  */
40  {
41  friend class FrameFilterSeparable;
42 
43  public:
44 
45  /**
46  * Default constructor.
47  */
48  ReusableMemory() = default;
49 
50  protected:
51 
52  /// An intermediate frame which can be re-used during filtering.
54 
55  /// Float-based filter factors which can be re-used during filtering.
56  std::vector<float> filterFactors_;
57 
58  /// Normalized horizontal filter factors which can be re-used during filtering.
59  std::vector<float> normalizedHorizontalFilter_;
60 
61  /// Normalized vertical filter factors which can be re-used during filtering.
62  std::vector<float> normalizedVerticalFilter_;
63  };
64 
65  protected:
66 
67  /**
68  * Definition of a 128 bit SIMD data type holding four 32 bit values.
69  */
70  template <typename T>
71  struct SIMD32x4
72  {
74  };
75 
76  public:
77 
78  /**
79  * Returns whether a given 1D filter is symmetric.
80  * @param filterValues The individual values of the 1D filter, must be valid
81  * @param size The size of the filter (the number of filter elements), with range [1, infinity), must be odd
82  * @return True, if so
83  * @tparam T The data type of each filter value, e.g., 'unsigned int', or 'float'
84  */
85  template <typename T>
86  static bool isFilterSymmetric(const T* filterValues, const size_t size);
87 
88  /**
89  * Determines the sum of all elements of a given 1D filter.
90  * @param filterValues The individual values of the 1D filter, must be valid
91  * @param size The size of the filter (the number of filter elements), with range [1, infinity)
92  * @return The sum of all filter values
93  * @tparam T The data type of each filter value, e.g., 'unsigned int', or 'float'
94  */
95  template <typename T>
96  static T sumFilterValues(const T* filterValues, const size_t size);
97 
98  /**
99  * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
100  * The filter result is stored in a target frame with zipped pixel format.
101  *
102  * The provided filter values are given with integer precision, the filter responses will be normalized automatically.<br>
103  *
104  * Here is an example showing how to use this function:
105  * @code
106  * void function(const Frame& rgbFrame)
107  * {
108  * // now let's create a simple Gaussian blur filter with kernel size 3
109  * const std::vector<unsigned int> filter = {1u, 2u, 1u};
110  *
111  * // so let's filter our frame
112  * Frame targetFrame(rgbFrame.frameType());
113  * FrameFilterSeparable::filter(rgbFrame, targetFrame, filter, filter);
114  * }
115  * @endcode
116  * @param source The source frame to be filtered, with zipped pixel format and with data type DT_UNSIGNED_INTEGER_8, or DT_SIGNED_FLOAT_32, must be valid
117  * @param target The target frame receiving the filtered results, will be set to the correct frame type, if invalid or if the type does not match the source frame
118  * @param horizontalFilter The horizontal filter, the number of filter elements must be odd, at least 1 element
119  * @param verticalFilter The vertical filter, the number of filter elements must be odd, at least 1 element
120  * @param worker Optional worker object to distribute the computation
121  * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
122  * @param processorInstructions The set of available instructions, may be any combination of instructions
123  * @see filter<T, TFilter>()
124  */
125  static bool filter(const Frame& source, Frame& target, const std::vector<unsigned int>& horizontalFilter, const std::vector<unsigned int>& verticalFilter, Worker* worker = nullptr, ReusableMemory* reusableMemory = nullptr, const ProcessorInstructions processorInstructions = Processor::get().instructions());
126 
127  /**
128  * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
129  * The filter result is stored in a target frame with zipped pixel format.
130  *
131  * When providing filter values with integer precision, the filter responses will be normalized automatically.<br>
132  * In contrast, when providing filter values with floating point precision, the filter responses will not be normalized.<br>
133  * Thus, you need to provide a normalized filter already when providing floating point filters.
134  *
135  * Here is an example showing how to use this function:
136  * @code
137  * void function(const Frame& rgbFrame)
138  * {
139  * // now let's create a simple Gaussian blur filter with kernel size 3
140  * const unsigned int horizontalFilter[] = {1u, 2u, 1u};
141  * const unsigned int verticalFilter[] = {1u, 2u, 1u};
142  *
143  * // so let's filter our frame
144  * Frame targetFrame(rgbFrame.frameType());
145  * FrameFilterSeparable::filter<uint8_t, unsigned int>(rgbFrame.constdata<uint8_t>(), targetFrame.data<uint8_t>(), rgbFrame.width(), rgbFrame.height(), rgbFrame.channels(), rgbFrame.paddingElements(), targetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
146  * }
147  * @endcode
148  * @param source The source frame to be filtered, must be valid
149  * @param target The target frame receiving the filtered results, can be the same memory pointer as 'source', must be valid
150  * @param width The width of the source (and target) frame in pixel, with range [horizontalFilterSize, infinity)
151  * @param height The height of the source (and target) frame in pixel, with range [verticalFilterSize, infinity)
152  * @param channels The number of channels the source frame (and target frame) has, with range [1, infinity)
153  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
154  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
155  * @param horizontalFilter The elements of the horizontal filter, must be valid
156  * @param horizontalFilterSize The number of elements the horizontal filter has, with range [1, width], must be odd
157  * @param verticalFilter The elements of the vertical filter, must be valid
158  * @param verticalFilterSize The number of elements the vertical filter has, with range [1, height], must be odd
159  * @param worker Optional worker object to distribute the computation
160  * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
161  * @param processorInstructions The set of available instructions, may be any combination of instructions
162  * @tparam T The data type of each pixel channel of the source frame (and target frame) e.g., 'uint8_t', or 'float'
163  * @tparam TFilter The data type of each filter element e.g., 'unsigned int', or 'float'
164  * @see filterUniversal<T>()
165  */
166  template <typename T, typename TFilter>
167  static bool filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, Worker* worker = nullptr, ReusableMemory* reusableMemory = nullptr, const ProcessorInstructions processorInstructions = Processor::get().instructions());
168 
169  /**
170  * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with almost arbitrary pixel format.
171  * This function supports images with arbitrary pixel format as long as the pixel format is zipped (e.g., FrameType::FORMAT_Y8, FrameType::FORMAT_RGB24, ...).
172  *
173  * Beware: This function is not highly optimized; if performance matters, check whether Ocean provides a more optimized implementation for your purpose, e.g., filter<T, TFilter>().
174  *
175  * Here is an example showing how to use this function:
176  * @code
177  * void function(const Frame& rgbFrame)
178  * {
179  * // let's say we receive a frame with FORMAT_RGB24 pixel format
180  * if (rgbFrame.pixelFormat() != FrameType::FORMAT_RGB24)
181  * {
182  * // wrong pixel format
183  * return;
184  * }
185  *
186  * // let's convert this frame to a floating point frame
187  * Frame floatFrameWith3Channels(FrameType(rgbFrame, FrameType::genericPixelFormat<float, 3u>()));
188  *
189  * FrameConverter::cast<uint8_t, float>(rgbFrame.constdata<uint8_t>(), floatFrameWith3Channels.data<float>(), rgbFrame.width(), rgbFrame.height(), rgbFrame.channels());
190  *
191  * // now let's create a simple Gaussian blur filter with kernel size 3
192  * const float horizontalFilter[] = {0.25f, 0.5f, 0.25f};
193  * const float verticalFilter[] = {0.25f, 0.5f, 0.25f};
194  *
195  * const unsigned int channels = 3u;
196  *
197  * // so let's filter our floating point frame
198  * Frame floatTargetFrame(floatFrameWith3Channels.frameType());
199  * FrameFilterSeparable::filterUniversal<float>(floatFrameWith3Channels.constdata<float>(), floatTargetFrame.data<float>(), floatTargetFrame.width(), floatTargetFrame.height(), channels, floatFrameWith3Channels.paddingElements(), floatTargetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
200  *
201  * // btw: we could also apply the same filter to our RGB24 frame (with uint8_t values)
202  * // however, this time we lose the floating point accuracy
203  * Frame rgbTargetFrame(rgbFrame.frameType());
204  * FrameFilterSeparable::filterUniversal<uint8_t>(rgbFrame.constdata<uint8_t>(), rgbTargetFrame.data<uint8_t>(), rgbFrame.width(), rgbFrame.height(), channels, rgbFrame.paddingElements(), rgbTargetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
205  * }
206  * @endcode
207  * @param source The source frame to which the filter will be applied, must be valid
208  * @param target The target frame receiving the filtered results, can be the same memory pointer as 'source', must be valid
209  * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
210  * @param height The height of the source frame (and target frame) in pixel, with range [1, infinity)
211  * @param channels The number of channels the source and target frame have, with range [1, infinity)
212  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
213  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
214  * @param horizontalFilter The (separable) horizontal filter to be applied, must be valid
215  * @param horizontalFilterSize The number of horizontal filter elements, with range [1, width], must be odd
216  * @param verticalFilter The (separable) vertical filter to be applied, must be valid
217  * @param verticalFilterSize The number of vertical filter elements, with range [1, height], must be odd
218  * @param worker Optional worker object to distribute the computation to several CPU cores
219  * @return True, if the filter could be applied; False, if the input parameters were wrong
220  * @tparam T The data type of each pixel channel of the source and target frame, e.g., 'uint8_t', 'int', 'float', ...
221  * @see filter<T, TFilter>()
222  */
223  template <typename T>
224  static bool filterUniversal(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float* horizontalFilter, const unsigned int horizontalFilterSize, const float* verticalFilter, const unsigned int verticalFilterSize, Worker* worker = nullptr);
225 
226  protected:
227 
228  /**
229  * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
230  * The filter result is stored in a target frame with zipped pixel format.
231  * @param source The source frame to be filtered, must be valid
232  * @param target The target frame receiving the filtered results, must be valid
233  * @param width The width of the source (and target) frame in pixel, with range [horizontalFilterSize, infinity)
234  * @param height The height of the source (and target) frame in pixel, with range [verticalFilterSize, infinity)
235  * @param channels The number of channels the source frame (and target frame) has, with range [1, infinity)
236  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
237  * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
238  * @param horizontalFilter The elements of the horizontal filter, must be valid
239  * @param horizontalFilterSize The number of elements the horizontal filter has, with range [1, width], must be odd
240  * @param verticalFilter The elements of the vertical filter, must be valid
241  * @param verticalFilterSize The number of elements the vertical filter has, with range [1, height], must be odd
242  * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
243  * @param worker Optional worker object to distribute the computation
244  * @tparam T The data type of each pixel channel of the source frame (and target frame) e.g., 'uint8_t', or 'float'
245  * @tparam TFilter The data type of each filter element e.g., 'unsigned int', or 'float'
246  * @tparam tProcessorInstructions The processor instructions that can be used
247  * @see filterUniversal<T>()
248  */
249  template <typename T, typename TFilter, ProcessorInstructions tProcessorInstructions>
250  static void filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, ReusableMemory* reusableMemory = nullptr, Worker* worker = nullptr);
251 
252  /**
253  * Sets a given SIMD value to zero.
254  * @param value The SIMD value to be set
255  * @tparam T The 32 bit data type of the SIMD value
256  * @tparam tProcessorInstructions The set of available instructions, may be any combination of instructions
257  */
258  template <typename T, ProcessorInstructions tProcessorInstructions>
259  static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4<T>::Type& value);
260 
261  /**
262  * Writes a SIMD with four 32 bit values to (not aligned) memory.
263  * @param value The SIMD value to be written
264  * @param target The buffer receiving the values
265  * @tparam T The 32 bit data type of the SIMD value
266  * @tparam tProcessorInstructions The set of available instructions, may be any combination of instructions
267  */
268  template <typename T, ProcessorInstructions tProcessorInstructions>
269  static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4<T>::Type& value, T* target);
270 
271  /**
272  * Fills the left border area of an extended row with mirrored pixel information (from the left image region).
273  * @param source The source row providing the image information to be mirrored, must be valid
274  * @param channels The number of channels the source frame has, with range [1, infinity)
275  * @param pixels The number of pixels to be mirrored, should be filterSize / 2u, with range [1, width]
276  * @param extendedRowLeft The pointer to the left border area of the extended row to which the mirrored image content will be copied, must be valid
277  * @tparam T The data type of each pixel channel, e.g., 'uint8_t', or 'float'
278  * @see fillRightExtraBorder().
279  */
280  template <typename T>
281  static void fillLeftExtraBorder(const T* source, const unsigned int channels, const unsigned int pixels, T* extendedRowLeft);
282 
283  /**
284  * Fills the right border area of an extended row with mirrored pixel information (from the right image region).
285  * @param sourceEnd The end of the source row providing the image information to be mirrored (source + width * channels), must be valid
286  * @param channels The number of channels the source frame has, with range [1, infinity)
287  * @param pixels The number of pixels to be mirrored, should be filterSize / 2u, with range [1, width]
288  * @param extendedRowRight The pointer to the right border area of the extended row to which the mirrored image content will be copied, must be valid
289  * @tparam T The data type of each pixel channel, e.g., 'uint8_t', or 'float'
290  * @see fillLeftExtraBorder().
291  */
292  template <typename T>
293  static void fillRightExtraBorder(const T* sourceEnd, const unsigned int channels, const unsigned int pixels, T* extendedRowRight);
294 
295  /**
296  * Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
297  * <pre>
298  * This function calculates the following:
299  * target[0] += source[0] * filterFactor
300  * ...
301  * target[3] += source[3] * filterFactor
302  * </pre>
303  * @param source The source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
304  * @param filterFactor The filter factor to be used for multiplication
305  * @param target_32x4 The four 32 bit accumulated filter response values to which the multiplication result will be added
306  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
307  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
308  * @tparam tProcessorInstructions The set of available processor instructions needed
309  * @see symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements().
310  */
311  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
312  static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource* source, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4);
313 
314  /**
315  * Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
316  * This function applies a run-time known filter factor.
317  * <pre>
318  * This function calculates the following:
319  * target[0] += (sourceLeft[0] + sourceRight[0]) * filterFactor
320  * ...
321  * target[3] += (sourceLeft[3] + sourceRight[3]) * filterFactor
322  * </pre>
323  * @param sourceLeft The left source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
324  * @param sourceRight The right source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
325  * @param filterFactor The filter factor to be used for multiplication
326  * @param target_32x4 The four 32 bit accumulated filter response values to which the multiplication result will be added
327  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
328  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
329  * @tparam tProcessorInstructions The set of available processor instructions needed
330  * @see asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements().
331  */
332  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
333  static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource* sourceLeft, const TSource* sourceRight, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4);
334 
335  /**
336  * Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
337  * <pre>
338  * This function calculates the following:
339  * targeta[0] += source[0] * filterFactor
340  * ...
341  * targeta[3] += source[3] * filterFactor
342  * targetb[4] += source[4] * filterFactor
343  * ...
344  * targetb[7] += source[7] * filterFactor
345  * </pre>
346  * @param source The source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
347  * @param filterFactor The filter factor to be used for multiplication
348  * @param target_32x4a The first four 32 bit accumulated filter response values to which the multiplication result will be added
349  * @param target_32x4b The second four 32 bit accumulated filter response values to which the multiplication result will be added
350  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
351  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
352  * @tparam tProcessorInstructions The set of available processor instructions needed
353  * @see symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements().
354  */
355  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
356  static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource* source, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4a, typename SIMD32x4<TFilter>::Type& target_32x4b);
357 
358  /**
359  * Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
360  * This function applies a run-time known filter factor.
361  * <pre>
362  * This function calculates the following:
363  * targeta[0] += (sourceLeft[0] + sourceRight[0]) * filterFactor
364  * ...
365  * targeta[3] += (sourceLeft[3] + sourceRight[3]) * filterFactor
366  * targetb[4] += (sourceLeft[4] + sourceRight[4]) * filterFactor
367  * ...
368  * targetb[7] += (sourceLeft[7] + sourceRight[7]) * filterFactor
369  * </pre>
370  * @param sourceLeft The left source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
371  * @param sourceRight The right source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
372  * @param filterFactor The filter factor to be used for multiplication
373  * @param target_32x4a The first four 32 bit accumulated filter response values to which the multiplication result will be added
374  * @param target_32x4b The second four 32 bit accumulated filter response values to which the multiplication result will be added
375  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
376  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
377  * @tparam tProcessorInstructions The set of available processor instructions needed
378  * @see asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements().
379  */
380  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
381  static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource* sourceLeft, const TSource* sourceRight, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4a, typename SIMD32x4<TFilter>::Type& target_32x4b);
382 
383  /**
384  * Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame).
385  * @param source The first source element for which the filter will be applied, the buffer must contain at least 4 + 'filterSize' - 1 elements, must be valid
386  * @param target The first target element receiving the filter responses, the buffer must contain at least 4 elements, must be valid
387  * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
388  * @param filter The filter factors of the horizontal filter, with 'filterSize' elements, must be valid
389  * @param filterSize The size of the given filter, with range [1, width], must be odd
390  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
391  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
392  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
393  * @tparam tProcessorInstructions The set of available processor instructions needed
394  * @see isFilterSymmetric().
395  */
396  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
397  static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric);
398 
399  /**
400  * Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
401  * @param source The first source element for which the filter will be applied, the buffer must contain at least 8 + 'filterSize' - 1 elements, must be valid
402  * @param target The first target element receiving the filter responses, the buffer must contain at least 8 elements, must be valid
403  * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
404  * @param filter The filter factors of the horizontal filter, with 'filterSize' elements, must be valid
405  * @param filterSize The size of the given filter, with range [1, width], must be odd
406  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
407  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
408  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
409  * @tparam tProcessorInstructions The set of available processor instructions needed
410  * @see isFilterSymmetric().
411  */
412  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
413  static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric);
414 
415  /**
416  * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 4 elements within one iteration (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame).
417  * The inner core lies within the frame not covering the frame border of size of filterSize/2.<br>
418  * @param source The first source element that will be used for filtering, must be valid
419  * @param target The first target elements that will receive the filtered results, must be valid
420  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
421  * @param filter The filter factors also containing the normalization, must be valid
422  * @param filterSize The size of the given filter, with range [1, width], must be odd
423  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
424  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
425  * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
426  * @tparam tProcessorInstructions The set of available processor instructions needed
427  */
428  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
429  static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
430 
431  /**
432  * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 8 elements within one iteration (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
433  * The inner core lies within the frame not covering the frame border of size of filterSize/2.<br>
434  * @param source The first source element that will be used for filtering, must be valid
435  * @param target The first target elements that will receive the filtered results, must be valid
436  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
437  * @param filter The filter factors also containing the normalization, must be valid
438  * @param filterSize The size of the given filter, with range [1, width], must be odd
439  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
440  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
441  * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
442  * @tparam tProcessorInstructions The set of available processor instructions needed
443  */
444  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
445  static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
446 
447  /**
448  * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 16 elements within one iteration (16 elements are 16 successive pixels in a Y8 frame or 5 + 1/3 pixels in a RGB24 frame).
449  * The inner core lies within the frame not covering the (vertical) frame border of size of filterSize/2.<br>
450  * @param source The first source element that will be used for filtering, must be valid
451  * @param target The first target elements that will receive the filtered results, must be valid
452  * @param sourceStrideElements The stride of the frame in elements, strideElements = width * channels + paddingElements, with range [1, infinity)
453  * @param filter The filter factors also containing the normalization, must be
454  * @param filterSize The size of the given filter, with range [1, width], must be odd
455  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
456  * @tparam TSource The data type of the source elements e.g., 'unsigned int', or 'float'
457  * @tparam TTarget The data type of the target elements e.g., 'uint8_t', or 'float'
458  * @tparam tProcessorInstructions The set of available processor instructions needed
459  */
460  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
461  static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
462 
463  /**
464  * Determines the vertical filter responses for the inner core of a frame for one row.
465  * The inner core lies within the frame not covering the (vertical) frame border of size of filterSize/2.<br>
466  * @param source The first source element that will be used for filtering, must be valid
467  * @param target The first target elements that will receive the filtered results, must be valid
468  * @param width The width of the frame in pixel, with range [1, infinity)
469  * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
470  * @param filter The filter factors also containing the normalization, must be valid
471  * @param filterSize The size of the given filter, with range [1, width], must be odd
472  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
473  * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
474  * @tparam TSource The data type of the source elements e.g., 'unsigned int', or 'float'
475  * @tparam TTarget The data type of the target elements e.g., 'uint8_t', or 'float'
476  * @tparam tProcessorInstructions The set of available processor instructions needed
477  */
478  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
479  static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int channels, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements);
480 
481  /**
482  * Determines the vertical filter responses near the (vertical) border of a frame for one row while processing a block of 8 elements within one iteration (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
483  * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
484  * @param source The first source element that will be used for filtering, must be valid
485  * @param target The first target elements that will receive the filtered results, must be valid
486  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
487  * @param height The height of the frame in pixel, with range [1, infinity)
488  * @param row The row to be handled, with range [0, height - 1]
489  * @param filter The filter factors, must be 'filterSize' individual values
490  * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
491  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
492  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
493  * @tparam TTarget The data type of the filter elements e.g., 'unsigned int', or 'float'
494  * @tparam tProcessorInstructions The set of available processor instructions needed
495  */
496  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
497  static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric);
498 
499  /**
500  * Determines the vertical filter responses near the (vertical) border of a frame for one row while processing a block of 16 elements within one iteration (16 elements are 16 successive pixels in a Y8 frame or 5 + 1/3 pixels in a RGB24 frame).
501  * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
502  * @param source The first source element that will be used for filtering, must be valid
503  * @param target The first target elements that will receive the filtered results, must be valid
504  * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
505  * @param height The height of the frame in pixel, with range [1, infinity)
506  * @param row The row to be handled, with range [0, height - 1]
507  * @param filter The filter factors, must be 'filterSize' individual values
508  * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
509  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
510  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
511  * @tparam TTarget The data type of the filter elements e.g., 'unsigned int', or 'float'
512  * @tparam tProcessorInstructions The set of available processor instructions needed
513  */
514  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
515  static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric);
516 
517  /**
518  * Determines the vertical filter responses near the (vertical) border of a frame for one row.
519  * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
520  * @param source The first source element that will be used for filtering, must be valid
521  * @param target The first target elements that will receive the filtered results, must be valid
522  * @param width The width of the frame in pixel, with range [1, infinity)
523  * @param height The height of the frame in pixel, with range [1, infinity)
524  * @param channels The number of data channels both frames have, with range [1, infinity)
525  * @param row The row to be handled, with range [0, height - 1]
526  * @param filter The filter factors, must be 'filterSize' individual values
527  * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
528  * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
529  * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
530  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
531  * @tparam TTarget The data type of the filter elements e.g., 'unsigned int', or 'float'
532  * @tparam tProcessorInstructions The set of available processor instructions needed
533  */
534  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
535  static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements);
536 
537  /**
538  * Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames with zipped pixel format.
539  * The filter result is stored in a target frame with zipped pixel format and 32 bit per channel.
540  * @param source The source frame to be filtered, must be valid
541  * @param target The target frame receiving the filtered results, must be valid
542  * @param width The width of the source (and target) frame in pixel, with range [filterSize + 1, infinity)
543  * @param height The height of the source (and target) frame in pixel, with range [filterSize, infinity)
544  * @param channels The number of data channels both frames have, with range [1, infinity)
545  * @param filter The filter factors, must be 'filterSize' individual values
546  * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
547  * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
548  * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
549  * @param firstRow The first row to be handled, with range [0, height)
550  * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
551  * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
552  * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
553  * @tparam tProcessorInstructions The set of available processor instructions needed
554  */
555  template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
556  static void filterHorizontalSubset(const TSource* source, TFilter* target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
557 
558  /**
559  * Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames with zipped pixel format and 32 bit per channel.
560  * The filter result is stored in a target frame with zipped pixel format and 8 bit per channel.<br>
561  * This function uses floating point filter factors ensuring the final result is normalized.
562  * @param source The source frame to be filtered, must be valid
563  * @param target The target frame receiving the filtered results, must be valid
564  * @param width The width of the source (and target) frame in pixel, with range [max(filterSize + 1, 16 / channels), infinity)
565  * @param height The height of the source (and target) frame in pixel, with range [filterSize, infinity)
566  * @param channels The number of data channels both frames have, with range [1, infinity)
567  * @param filter The filter factors, must be 'filterSize' individual values
568  * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
569  * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
570  * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
571  * @param firstRow The first row to be handled, with range [0, height)
572  * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
573  * @tparam TSource The data type of the source elements e.g., 'unsigned int', or 'float'
574  * @tparam TTarget The data type of the target elements e.g., 'uint8_t', or 'float'
575  * @tparam tProcessorInstructions The set of available processor instructions needed
576  */
577  template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
578  static void filterVerticalSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows);
579 
580  /**
581  * Applies a horizontal filter to a subset of an image with almost arbitrary data type.
582  * @param source The source frame to which the filter will be applied, must be valid
583  * @param target The target frame receiving the filter response, must be valid
584  * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
585  * @param channels The number of channels the source and target frame have, with range [1, infinity)
586  * @param horizontalFilter The (separable) horizontal filter to be applied, must be valid
587  * @param filterSize The number of filter elements, must be odd, with range [1, width]
588  * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
589  * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
590  * @param firstRow The first row to be handled, with range [0, height)
591  * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
592  * @tparam T The data type of each pixel channel of the source and target frame, e.g., 'uint8_t', 'int', 'float', ...
593  * @tparam TIntermediate The data type of the intermediate target frame, should be either 'float' or 'double'
594  */
595  template <typename T, typename TIntermediate>
596  static void filterUniversalHorizontalSubset(const T* source, TIntermediate* target, const unsigned int width, const unsigned int channels, const float* horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
597 
598  /**
599  * Applies a vertical filter to a subset of an image with almost arbitrary data type.
600  * @param source The source frame to which the filter will be applied, must be valid
601  * @param target The target frame receiving the filter response, must be valid
602  * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
603  * @param height The height of the source frame (and target frame) in pixel, with range [1, infinity)
604  * @param channels The number of channels the source and target frame have, with range [1, infinity)
605  * @param verticalFilter The (separable) vertical filter to be applied, must be valid
606  * @param filterSize The number of filter elements, must be odd, with range [1, width]
607  * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
608  * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
609  * @param firstRow The first row to be handled, with range [0, height)
610  * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
611  * @tparam T The data type of each pixel channel of the source and target frame, e.g., 'uint8_t', 'int', 'float', ...
612  * @tparam TIntermediate The data type of the intermediate target frame, should be either 'float' or 'double'
613  */
614  template <typename T, typename TIntermediate>
615  static void filterUniversalVerticalSubset(const TIntermediate* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
616 
617  /**
618  * Mirrors a given value at the left border if necessary.
619  * The function provides a result as below:<br>
620  * <pre>
621  * Original: -3 -2 -1 | 0 1 2 3 4 5 6
622  * Result: 2 1 0 | 0 1 2 3 4 5 6
623  * </pre>
624  * @param value The value to be mirrored, with range (-infinity, infinity)
625  * @return Mirrored value
626  * @ingroup base
627  */
628  static inline unsigned int mirroredBorderLocationLeft(const int value);
629 
630  /**
631  * Mirrors a given value at the right border if necessary.
632  * The value is mirrored according to a given size parameter.<br>
633  * The function provides a result as below:<br>
634  * <pre>
635  * Original: 4 5 6 ... s-2 s-1 | s s+1 s+2
636  * Result: 4 5 6 ... s-2 s-1 | s-1 s-2 s-3
637  * </pre>
638  * @param value The value to be mirrored, with range [0, 2*size)
639  * @param size Specified size defining the upper mirror border, with range [1, 2147483647]
640  * @return Mirrored value
641  * @ingroup base
642  */
643  static inline unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size);
644 };
645 
646 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
647 
648 /**
649  * Definition of a 128 bit SIMD data type holding four 32 bit values.
650  */
651 template <>
652 struct FrameFilterSeparable::SIMD32x4<unsigned int>
653 {
654  typedef __m128i Type;
655 };
656 
657 /**
658  * Definition of a 128 bit SIMD data type holding four 32 bit values.
659  */
660 template <>
662 {
663  typedef __m128 Type;
664 };
665 
666 #elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
667 
668 /**
669  * Definition of a 128 bit SIMD data type holding four 32 bit values.
670  */
671 template <>
672 struct FrameFilterSeparable::SIMD32x4<unsigned int>
673 {
674  typedef uint32x4_t Type;
675 };
676 
677 /**
678  * Definition of a 128 bit SIMD data type holding four 32 bit values.
679  */
680 template <>
681 struct FrameFilterSeparable::SIMD32x4<float>
682 {
683  typedef float32x4_t Type;
684 };
685 
686 #endif
687 
688 template <typename T>
689 bool FrameFilterSeparable::isFilterSymmetric(const T* filterValues, const size_t size)
690 {
691  ocean_assert(filterValues != nullptr);
692  ocean_assert(size >= 1 && size % 2 == 1);
693 
694  for (size_t n = 0; n < size / 2; ++n)
695  {
696  if (NumericT<T>::isNotEqual(filterValues[n], filterValues[size - n - 1]))
697  {
698  return false;
699  }
700  }
701 
702  return true;
703 }
704 
705 template <typename T>
706 T FrameFilterSeparable::sumFilterValues(const T* filterValues, const size_t size)
707 {
708  ocean_assert(filterValues != nullptr);
709  ocean_assert(size >= 1);
710 
711  T sum = filterValues[0];
712 
713  for (size_t n = 1; n < size; ++n)
714  {
715  sum += filterValues[n];
716  }
717 
718  return sum;
719 }
720 
721 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
722 
723 template <>
724 OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<unsigned int, PI_SSE_2>(typename SIMD32x4<unsigned int>::Type& value)
725 {
726  // SSE2: _mm_setzero_si128
727 
728  value = _mm_setzero_si128();
729 }
730 
731 template <>
732 OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<float, PI_SSE_2>(typename SIMD32x4<float>::Type& value)
733 {
734  // SSE: _mm_set_ps1
735 
736  value = _mm_set_ps1(0.0f);
737 }
738 
739 template <>
740 OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<unsigned int, PI_SSE_2>(const SIMD32x4<unsigned int>::Type& value, unsigned int* target)
741 {
742  _mm_storeu_si128((__m128i*)target, value);
743 }
744 
745 template <>
746 OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<float, PI_SSE_2>(const SIMD32x4<float>::Type& value, float* target)
747 {
748  _mm_storeu_si128((__m128i*)target, _mm_castps_si128(value));
749 }
750 
751 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
752 
753 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
754 
755 template <>
756 OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<unsigned int, PI_NEON>(typename SIMD32x4<unsigned int>::Type& value)
757 {
758  value = vdupq_n_u32(0u);
759 }
760 
761 template <>
762 OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<float, PI_NEON>(typename SIMD32x4<float>::Type& value)
763 {
764  value = vdupq_n_f32(0.0f);
765 }
766 
767 template <>
768 OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<unsigned int, PI_NEON>(const SIMD32x4<unsigned int>::Type& value, unsigned int* target)
769 {
770  vst1q_u32(target, value);
771 }
772 
773 template <>
774 OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<float, PI_NEON>(const SIMD32x4<float>::Type& value, float* target)
775 {
776  vst1q_f32(target, value);
777 }
778 
779 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
780 
781 template <typename T>
782 void FrameFilterSeparable::fillLeftExtraBorder(const T* source, const unsigned int channels, const unsigned int pixels, T* extendedRow)
783 {
784  ocean_assert(source != nullptr && extendedRow != nullptr);
785 
786  for (unsigned int n = 0u; n < pixels; ++n)
787  {
788  memcpy(extendedRow + n * channels, source + (pixels - n - 1u) * channels, sizeof(T) * channels);
789  }
790 }
791 
792 template <typename T>
793 void FrameFilterSeparable::fillRightExtraBorder(const T* sourceEnd, const unsigned int channels, const unsigned int pixels, T* extendedRow)
794 {
795  ocean_assert(sourceEnd != nullptr && extendedRow != nullptr);
796 
797  for (unsigned int n = 0u; n < pixels; ++n)
798  {
799  memcpy(extendedRow + n * channels, sourceEnd - (n + 1u) * int(channels), sizeof(T) * channels);
800  }
801 }
802 
803 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
804 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int channels, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
805 {
806  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
807  ocean_assert(channels >= 1u);
808  ocean_assert(filterSize % 2u == 1u);
809 
810  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
811 
812  unsigned int remainingElements = width * channels;
813 
814  while (remainingElements >= 16u)
815  {
816  filterVerticalCoreRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
817 
818  source += 16;
819  target += 16;
820 
821  remainingElements -= 16u;
822  }
823 
824  while (remainingElements >= 8u)
825  {
826  filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
827 
828  source += 8;
829  target += 8;
830 
831  remainingElements -= 8u;
832  }
833 
834 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
835 
836  while (remainingElements >= 4u)
837  {
838  filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
839 
840  source += 4;
841  target += 4;
842 
843  remainingElements -= 4u;
844  }
845 
846  ocean_assert(width * channels >= 4u);
847  ocean_assert(remainingElements < 4u);
848 
849  if (remainingElements != 0u)
850  {
851  const unsigned int shift = 4u - remainingElements;
852 
853  filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, filter, filterSize, isSymmetric);
854  }
855 
856 #else
857 
858  ocean_assert(width * channels >= 8u);
859  ocean_assert(remainingElements < 8u);
860 
861  if (remainingElements != 0u)
862  {
863  const unsigned int shift = 8u - remainingElements;
864 
865  filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, filter, filterSize, isSymmetric);
866  }
867 
868 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
869 }
870 
871 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
872 
873 template <>
874 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
875 {
876  ocean_assert(source != nullptr && target != nullptr);
877  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
878 
879  /**
880  * This function uses the following SSE instructions, and needs SSE2 or higher
881  *
882  * SSE1:
883  * _mm_set_ps1
884  * _mm_mul_ps
885  * _mm_add_ps
886  * _mm_loadu_ps
887  *
888  * SSE2:
889  * _mm_loadu_si128
890  * _mm_cvtepi32_ps
891  * _mm_add_epi32
892  * _mm_cvtps_epi32
893  * _mm_packs_epi32
894  * _mm_packus_epi16
895  */
896 
897  const unsigned int filterSize_2 = filterSize / 2u;
898 
899  const __m128i* sourceBlock = (const __m128i*)source;
900 
901  // we store one filter value in each of the four 32 bit integer values
902  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
903 
904  // now we load four input values, and multiply each of them with the center kernel value
905  __m128 source128 = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock));
906  __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
907 
908  // now we proceed with the remaining filter values
909  for (unsigned int i = 1u; i <= filterSize_2; ++i)
910  {
911  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
912  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
913 
914  if (isSymmetric)
915  {
916  // we have a symmetric filter, so let's do some optimizations
917  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
918 
919  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
920 
921  __m128i source128i = _mm_add_epi32(_mm_loadu_si128(sourceMinus), _mm_loadu_si128(sourcePlus));
922 
923  result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128i), filterFactor_32x4));
924  }
925  else
926  {
927  // we don't have a symmetric filter, so we need to handle two individual filters
928  __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
929  __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
930 
931  __m128i source128iMinus = _mm_loadu_si128(sourceMinus);
932  __m128i source128iPlus = _mm_loadu_si128(sourcePlus);
933 
934  result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iMinus), filterFactor128Minus));
935  result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iPlus), filterFactor128Plus));
936  }
937  }
938 
939  // now we have 8 bit values in each 32 bit register
940 
941  __m128i source128i = _mm_cvtps_epi32(result128);
942  source128i = _mm_packs_epi32(source128i, source128i);
943  source128i = _mm_packus_epi16(source128i, source128i);
944 
945  *((unsigned int*)target) = SSE::value_u32<0u>(source128i);
946 }
947 
948 template <>
949 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
950 {
951  ocean_assert(source != nullptr && target != nullptr);
952  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
953 
954  /**
955  * This function uses the following SSE instructions, and needs SSE2 or higher
956  *
957  * SSE:
958  * _mm_set_ps1
959  * _mm_mul_ps
960  * _mm_add_ps
961  *
962  * SSE2:
963  * _mm_loadu_si128
964  * _mm_castsi128_ps
965  */
966 
967  const unsigned int filterSize_2 = filterSize / 2u;
968 
969  const __m128i* sourceBlock = (const __m128i*)source;
970 
971  // we store one filter value in each of the four 32 bit values
972  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
973 
974  // now we load four input values, and multiply each of them with the center kernel value
975  __m128 source128 = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
976  __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
977 
978  // now we proceed with the remaining filter values
979  for (unsigned int i = 1u; i <= filterSize_2; ++i)
980  {
981  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
982  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
983 
984  if (isSymmetric)
985  {
986  // we have a symmetric filter, so let's do some optimizations
987  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
988 
989  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
990 
991  source128 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus)));
992 
993  result128 = _mm_add_ps(result128, _mm_mul_ps(source128, filterFactor_32x4));
994  }
995  else
996  {
997  // we don't have a symmetric filter, so we need to handle two individual filters
998  __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
999  __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1000 
1001  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1002  __m128 source128Minus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus));
1003  __m128 source128Plus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus));
1004 
1005  result128 = _mm_add_ps(result128, _mm_mul_ps(source128Minus, filterFactor_32x4Minus));
1006  result128 = _mm_add_ps(result128, _mm_mul_ps(source128Plus, filterFactor_32x4Plus));
1007  }
1008  }
1009 
1010  writeSIMD<float, PI_SSE_2>(result128, target);
1011 }
1012 
1013 template <>
1014 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1015 {
1016  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1017  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1018 
1019  /**
1020  * This function uses the following SSE instructions, and needs SSE2 or higher
1021  *
1022  * SSE1:
1023  * _mm_set_ps1
1024  * _mm_mul_ps
1025  * _mm_add_ps
1026  * _mm_loadu_ps
1027  *
1028  * SSE2:
1029  * _mm_loadu_si128
1030  * _mm_cvtepi32_ps
1031  * _mm_add_epi32
1032  * _mm_cvtps_epi32
1033  * _mm_packs_epi32
1034  * _mm_packus_epi16
1035  * _mm_storel_epi64
1036  */
1037 
1038  const unsigned int filterSize_2 = filterSize / 2u;
1039 
1040  const __m128i* sourceBlock = (const __m128i*)source;
1041 
1042  // we store one filter value in each of the four 32 bit integer values
1043  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1044 
1045  // now we load four input values, and multiply each of them with the center kernel value
1046  __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1047  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1048 
1049  // now we load the next four input values, ...
1050  __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1051  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1052 
1053  // now we proceed with the remaining filter values
1054  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1055  {
1056  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1057  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1058 
1059  if (isSymmetric)
1060  {
1061  // we have a symmetric filter, so let's do some optimizations
1062  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1063 
1064  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1065  __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1066  __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1067 
1068  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1069  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1070  }
1071  else
1072  {
1073  // we don't have a symmetric filter, so we need to handle two individual filters
1074  __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1075  __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1076 
1077  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1078  __m128i source128aiMinus =_mm_loadu_si128(sourceMinus + 0);
1079  __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1080  __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1081  __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1082 
1083  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1084  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1085 
1086  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1087  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
1088  }
1089  }
1090 
1091  // now we have 8 bit values in each 32 bit register
1092  __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1093  result128 = _mm_packus_epi16(result128, result128);
1094 
1095  _mm_storel_epi64((__m128i*)target, result128);
1096 }
1097 
1098 template <>
1099 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1100 {
1101  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1102  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1103 
1104  /**
1105  * This function uses the following SSE instructions, and needs SSE2 or higher
1106  *
1107  * SSE:
1108  * _mm_set_ps1
1109  * _mm_mul_ps
1110  * _mm_add_ps
1111  *
1112  * SSE2:
1113  * _mm_loadu_si128
1114  * _mm_castsi128_ps
1115  */
1116 
1117  const unsigned int filterSize_2 = filterSize / 2u;
1118 
1119  const __m128i* sourceBlock = (const __m128i*)source;
1120 
1121  // we store one filter value in each of the four 32 bit values
1122  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1123 
1124  // now we load four input values, and multiply each of them with the center kernel value
1125  __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1126  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1127 
1128  // now we load the next four input values, ...
1129  __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1130  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1131 
1132  // now we proceed with the remaining filter values
1133  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1134  {
1135  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1136  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1137 
1138  if (isSymmetric)
1139  {
1140  // we have a symmetric filter, so let's do some optimizations
1141  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1142 
1143  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1144 
1145  source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1146  source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1147 
1148  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1149  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1150  }
1151  else
1152  {
1153  // we don't have a symmetric filter, so we need to handle two individual filters
1154  __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1155  __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1156 
1157  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1158  __m128 source128aMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1159  __m128 source128aPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1160  __m128 source128bMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1161  __m128 source128bPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1162 
1163  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aMinus, filterFactor_32x4Minus));
1164  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aPlus, filterFactor_32x4Plus));
1165 
1166  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bMinus, filterFactor_32x4Minus));
1167  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bPlus, filterFactor_32x4Plus));
1168  }
1169  }
1170 
1171  writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1172  writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1173 }
1174 
1175 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1176 
1177 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1178 
1179 template <>
1180 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1181 {
1182  const unsigned int filterSize_2 = filterSize / 2u;
1183 
1184  // we store one filter value in each of the four 32 bit integer values
1185  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1186 
1187  // now we load four input values, and multiply each of them with the center kernel value
1188  float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1189  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1190 
1191  float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1192  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1193 
1194  // now we proceed with the remaining filter values
1195  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1196  {
1197  const unsigned int* sourceMinus = source - sourceStrideElements * i;
1198  const unsigned int* sourcePlus = source + sourceStrideElements * i;
1199 
1200  if (isSymmetric)
1201  {
1202  // we have a symmetric filter, so let's do some optimizations
1203  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1204 
1205  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1206  uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1207  uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1208 
1209  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1210  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1211  }
1212  else
1213  {
1214  // we don't have a symmetric filter, so we need to handle two individual filters
1215 
1216  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1217  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1218 
1219  uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1220  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1221 
1222  uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1223  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1224 
1225  uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1226  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1227 
1228  uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1229  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
1230  }
1231  }
1232 
1233  // now we have 8 bit values in each 32 bit register
1234  uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1235 
1236  uint8x8_t result64 = vqmovn_u16(result128ab);
1237 
1238  vst1_u8(target, result64);
1239 }
1240 
1241 template <>
1242 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1243 {
1244  const unsigned int filterSize_2 = filterSize / 2u;
1245 
1246  // we store one filter value in each of the four 32 bit integer values
1247  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1248 
1249  // now we load four input values, and multiply each of them with the center kernel value
1250  float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
1251  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1252 
1253  float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
1254  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1255 
1256  // now we proceed with the remaining filter values
1257  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1258  {
1259  const float* sourceMinus = source - sourceStrideElements * i;
1260  const float* sourcePlus = source + sourceStrideElements * i;
1261 
1262  if (isSymmetric)
1263  {
1264  // we have a symmetric filter, so let's do some optimizations
1265  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1266 
1267  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1268  source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
1269  source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
1270 
1271  result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
1272  result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
1273  }
1274  else
1275  {
1276  // we don't have a symmetric filter, so we need to handle two individual filters
1277 
1278  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1279  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1280 
1281  float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
1282  float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
1283 
1284  float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
1285  float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
1286 
1287  result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
1288  result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
1289 
1290  result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
1291  result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
1292  }
1293  }
1294 
1295  vst1q_f32(target + 0, result_32x4a);
1296  vst1q_f32(target + 4, result_32x4b);
1297 }
1298 
1299 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1300 
1301 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
1302 
1303 template <>
1304 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1305 {
1306  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1307  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1308 
1309  /**
1310  * This function uses the following SSE instructions, and needs SSE2 or higher
1311  *
1312  * SSE1:
1313  * _mm_set_ps1
1314  * _mm_mul_ps
1315  * _mm_add_ps
1316  * _mm_loadu_ps
1317  *
1318  * SSE2:
1319  * _mm_loadu_si128
1320  * _mm_cvtepi32_ps
1321  * _mm_add_epi32
1322  * _mm_cvtps_epi32
1323  * _mm_packs_epi32
1324  * _mm_packus_epi16
1325  * _mm_storeu_si128
1326  */
1327 
1328  /**
1329  * We determine 16 filter responses within one loop iteration.
1330  * For a filter with size 5 for 1 channel frames we apply the following strategy:
1331  *
1332  * Source Data:
1333  * Y
1334  * Y
1335  * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1336  * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1337  * Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <------------
1338  * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1339  * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1340  * Y
1341  * Y
1342  *
1343  * Further, we use the fact that the filter kernel is symmetric so that we start at the center row (the target row) and then going to the filter's borders
1344  *
1345  * For frames with n channels the strategy stays the same.
1346  */
1347 
1348  const unsigned int filterSize_2 = filterSize / 2u;
1349 
1350  const __m128i* sourceBlock = (const __m128i*)source;
1351 
1352  // we store one filter value in each of the four 32 bit integer values
1353  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1354 
1355  // now we load four input values, and multiply each of them with the center kernel value
1356  __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1357  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1358 
1359  // now we load the next four input values, ...
1360  __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1361  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1362 
1363  __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
1364  __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1365 
1366  __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
1367  __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
1368 
1369  // now we proceed with the remaining filter values
1370  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1371  {
1372  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1373  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1374 
1375  if (isSymmetric)
1376  {
1377  // we have a symmetric filter, so let's do some optimizations
1378  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1379 
1380  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1381 
1382  __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1383  __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1384 
1385  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1386  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1387 
1388  source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 2), _mm_loadu_si128(sourcePlus + 2));
1389  source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 3), _mm_loadu_si128(sourcePlus + 3));
1390 
1391  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1392  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1393  }
1394  else
1395  {
1396  // we don't have a symmetric filter, so we need to handle two individual filters
1397  __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1398  __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1399 
1400  __m128i source128aiMinus = _mm_loadu_si128(sourceMinus + 0);
1401  __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1402 
1403  __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1404  __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1405 
1406  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1407  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1408 
1409  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1410  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
1411 
1412  __m128i source128ciMinus = _mm_loadu_si128(sourceMinus + 2);
1413  __m128i source128ciPlus = _mm_loadu_si128(sourcePlus + 2);
1414 
1415  __m128i source128diMinus = _mm_loadu_si128(sourceMinus + 3);
1416  __m128i source128diPlus = _mm_loadu_si128(sourcePlus + 3);
1417 
1418  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
1419  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
1420 
1421  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
1422  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
1423  }
1424  }
1425 
1426  // now we have 8 bit values in each 32 bit register
1427  __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1428  __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
1429  __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
1430 
1431  _mm_storeu_si128((__m128i*)target, result128);
1432 }
1433 
1434 template <>
1435 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1436 {
1437  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1438  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1439 
1440  /**
1441  * This function uses the following SSE instructions, and needs SSE2 or higher
1442  *
1443  * SSE:
1444  * _mm_set_ps1
1445  * _mm_mul_ps
1446  * _mm_add_ps
1447  *
1448  * SSE2:
1449  * _mm_loadu_si128
1450  * _mm_castsi128_ps
1451  */
1452 
1453  /**
1454  * We determine 16 filter responses within one loop iteration.
1455  * For a filter with size 5 for 1 channel frames we apply the following strategy:
1456  *
1457  * Source Data:
1458  * Y
1459  * Y
1460  * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1461  * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1462  * Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <------------
1463  * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1464  * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1465  * Y
1466  * Y
1467  *
1468  * Further, we use the fact that the filter kernel is symmetric so that we start at the center row (the target row) and then going to the filter's borders
1469  *
1470  * For frames with n channels the strategy stays the same.
1471  */
1472 
1473  const unsigned int filterSize_2 = filterSize / 2u;
1474 
1475  const __m128i* sourceBlock = (const __m128i*)source;
1476 
1477  // we store one filter value in each of the four 32 bit values
1478  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1479 
1480  // now we load four input values, and multiply each of them with the center kernel value
1481  __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1482  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1483 
1484  // now we load the next four input values, ...
1485  __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1486  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1487 
1488  __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
1489  __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1490 
1491  __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
1492  __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
1493 
1494  // now we proceed with the remaining filter values
1495  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1496  {
1497  const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1498  const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1499 
1500  if (isSymmetric)
1501  {
1502  // we have a symmetric filter, so let's do some optimizations
1503  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1504 
1505  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1506 
1507  source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1508  source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1509 
1510  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1511  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1512 
1513  source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2)));
1514  source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3)));
1515 
1516  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
1517  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
1518  }
1519  else
1520  {
1521  // we don't have a symmetric filter, so we need to handle two individual filters
1522  __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1523  __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1524 
1525  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1526  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1527 
1528  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1529  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1530 
1531  source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2));
1532  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
1533 
1534  source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3));
1535  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
1536 
1537  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1538  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1539 
1540  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1541  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
1542 
1543  source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2));
1544  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
1545 
1546  source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3));
1547  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
1548  }
1549  }
1550 
1551  writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1552  writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1553  writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
1554  writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
1555 }
1556 
1557 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1558 
1559 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1560 
1561 template <>
1562 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1563 {
1564  const unsigned int filterSize_2 = filterSize / 2u;
1565 
1566  // we store one filter value in each of the four 32 bit integer values
1567  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1568 
1569  // now we load four input values, and multiply each of them with the center kernel value
1570  float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1571  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1572 
1573  float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1574  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1575 
1576  float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
1577  float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
1578 
1579  float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
1580  float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
1581 
1582  // now we proceed with the remaining filter values
1583  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1584  {
1585  const unsigned int* sourceMinus = source - sourceStrideElements * i;
1586  const unsigned int* sourcePlus = source + sourceStrideElements * i;
1587 
1588  if (isSymmetric)
1589  {
1590  // we have a symmetric filter, so let's do some optimizations
1591  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1592 
1593  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1594  uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1595  uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1596 
1597  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1598  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1599 
1600  source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
1601  source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
1602 
1603  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1604  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1605  }
1606  else
1607  {
1608  // we don't have a symmetric filter, so we need to handle two individual filters
1609 
1610  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1611  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1612 
1613  uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1614  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1615 
1616  uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1617  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1618 
1619  uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1620  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1621 
1622  uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1623  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
1624 
1625  uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
1626  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
1627 
1628  uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
1629  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
1630 
1631  uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
1632  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
1633 
1634  uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
1635  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
1636  }
1637  }
1638 
1639  // now we have 8 bit values in each 32 bit register
1640  uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1641  uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
1642 
1643  uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
1644 
1645  vst1q_u8(target, result128);
1646 }
1647 
1648 template <>
1649  OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1650 {
1651  const unsigned int filterSize_2 = filterSize / 2u;
1652 
1653  // we store one filter value in each of the four 32 bit integer values
1654  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1655 
1656  // now we load four input values, and multiply each of them with the center kernel value
1657  float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
1658  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1659 
1660  float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
1661  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1662 
1663  float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
1664  float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
1665 
1666  float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
1667  float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
1668 
1669  // now we proceed with the remaining filter values
1670  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1671  {
1672  const float* sourceMinus = source - sourceStrideElements * i;
1673  const float* sourcePlus = source + sourceStrideElements * i;
1674 
1675  if (isSymmetric)
1676  {
1677  // we have a symmetric filter, so let's do some optimizations
1678  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1679 
1680  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1681  source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
1682  source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
1683 
1684  result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
1685  result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
1686 
1687  source_32x4c = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
1688  source_32x4d = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
1689 
1690  result_32x4c = vmlaq_f32(result_32x4c, source_32x4c, filterFactor_32x4);
1691  result_32x4d = vmlaq_f32(result_32x4d, source_32x4d, filterFactor_32x4);
1692  }
1693  else
1694  {
1695  // we don't have a symmetric filter, so we need to handle two individual filters
1696 
1697  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1698  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1699 
1700  float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
1701  float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
1702 
1703  float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
1704  float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
1705 
1706  result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
1707  result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
1708 
1709  result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
1710  result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
1711 
1712  source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
1713  source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
1714 
1715  source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
1716  source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
1717 
1718  result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
1719  result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
1720 
1721  result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
1722  result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
1723  }
1724  }
1725 
1726  vst1q_f32(target + 0, result_32x4a);
1727  vst1q_f32(target + 4, result_32x4b);
1728  vst1q_f32(target + 8, result_32x4c);
1729  vst1q_f32(target + 12, result_32x4d);
1730 }
1731 
1732 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1733 
1734 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
1735 
1736 template <>
1737 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1738 {
1739  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1740  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1741 
1742  /**
1743  * This function uses the following SSE instructions, and needs SSE2 or higher
1744  *
1745  * SSE1:
1746  * _mm_set_ps1
1747  * _mm_mul_ps
1748  * _mm_add_ps
1749  * _mm_loadu_ps
1750  *
1751  * SSE2:
1752  * _mm_loadu_si128
1753  * _mm_cvtepi32_ps
1754  * _mm_add_epi32
1755  * _mm_cvtps_epi32
1756  * _mm_packs_epi32
1757  * _mm_packus_epi16
1758  * _mm_storel_epi64
1759  */
1760 
1761  const unsigned int filterSize_2 = filterSize / 2u;
1762 
1763  // the border covers row ids within the range [0, filterSize_2)
1764  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1765 
1766  const __m128i* sourceBlock = (const __m128i*)source;
1767 
1768  // we store one filter value in each of the four 32 bit integer values
1769  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1770 
1771  // now we load four input values, and multiply each of them with the center kernel value
1772  __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1773  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1774 
1775  // now we load the next four input values, ...
1776  __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1777  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1778 
1779  __m128i source128ai, source128bi;
1780 
1781  // now we proceed with the remaining filter values
1782  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1783  {
1784  // we determine the mirrored locations (and the row offset in relation to the current row)
1785  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1786  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1787 
1788  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1789  const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1790  const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1791 
1792  if (isSymmetric)
1793  {
1794  // we have a symmetric filter, so let's do some optimizations
1795  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1796 
1797  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1798 
1799  source128ai = _mm_add_epi32(_mm_loadu_si128((const __m128i*)sourceMinus + 0), _mm_loadu_si128((const __m128i*)sourcePlus + 0));
1800  source128bi = _mm_add_epi32(_mm_loadu_si128((const __m128i*)sourceMinus + 1), _mm_loadu_si128((const __m128i*)sourcePlus + 1));
1801 
1802  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1803  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1804  }
1805  else
1806  {
1807  // we don't have a symmetric filter, so we need to handle two individual filters
1808 
1809  __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1810  __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1811 
1812  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1813 
1814  source128ai = _mm_loadu_si128((const __m128i*)sourceMinus + 0);
1815  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Minus));
1816 
1817  source128bi = _mm_loadu_si128((const __m128i*)sourceMinus + 1);
1818  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Minus));
1819 
1820  source128ai = _mm_loadu_si128((const __m128i*)sourcePlus + 0);
1821  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Plus));
1822 
1823  source128bi = _mm_loadu_si128((const __m128i*)sourcePlus + 1);
1824  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Plus));
1825  }
1826  }
1827 
1828  // now we have 8 bit values in each 32 bit register
1829  __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1830  result128 = _mm_packus_epi16(result128, result128);
1831 
1832  _mm_storel_epi64((__m128i*)target, result128);
1833 }
1834 
1835 template <>
1836 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1837 {
1838  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1839  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1840 
1841  /**
1842  * This function uses the following SSE instructions, and needs SSE2 or higher
1843  *
1844  * SSE:
1845  * _mm_set_ps1
1846  * _mm_mul_ps
1847  * _mm_add_ps
1848  *
1849  * SSE2:
1850  * _mm_loadu_si128
1851  * _mm_castsi128_ps
1852  */
1853 
1854  const unsigned int filterSize_2 = filterSize / 2u;
1855 
1856  // the border covers row ids within the range [0, filterSize_2)
1857  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1858 
1859  const __m128i* sourceBlock = (const __m128i*)source;
1860 
1861  // we store one filter value in each of the four 32 bit values
1862  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1863 
1864  // now we load four input values, and multiply each of them with the center kernel value
1865  __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1866  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1867 
1868  // now we load the next four input values, ...
1869  __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1870  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1871 
1872  // now we proceed with the remaining filter values
1873  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1874  {
1875  // we determine the mirrored locations (and the row offset in relation to the current row)
1876  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1877  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1878 
1879  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1880  const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1881  const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1882 
1883  if (isSymmetric)
1884  {
1885  // we have a symmetric filter, so let's do some optimizations
1886  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1887 
1888  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1889 
1890  source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0)));
1891  source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1)));
1892 
1893  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1894  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1895  }
1896  else
1897  {
1898  // we don't have a symmetric filter, so we need to handle two individual filters
1899  __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1900  __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1901 
1902  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0));
1903  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1904 
1905  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1));
1906  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1907 
1908  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0));
1909  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1910 
1911  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1));
1912  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
1913  }
1914  }
1915 
1916  writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1917  writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1918 }
1919 
1920 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1921 
1922 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1923 
1924 template <>
1925 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1926 {
1927  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1928  ocean_assert(filterSize % 2u == 1u);
1929 
1930  const unsigned int filterSize_2 = filterSize / 2u;
1931 
1932  // the border covers row ids within the range [0, filterSize_2)
1933  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1934 
1935  // we store one filter value in each of the four 32 bit integer values
1936  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1937 
1938  // now we load four input values, and multiply each of them with the center kernel value
1939  float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1940  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1941 
1942  float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1943  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1944 
1945  // now we proceed with the remaining filter values
1946  for (unsigned int i = 1u; i <= filterSize_2; ++i)
1947  {
1948  // we determine the mirrored locations (and the row offset in relation to the current row)
1949  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1950  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1951 
1952  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1953  const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1954  const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1955 
1956  if (isSymmetric)
1957  {
1958  // we have a symmetric filter, so let's do some optimizations
1959  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1960 
1961  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1962 
1963  uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1964  uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1965 
1966  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1967  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1968  }
1969  else
1970  {
1971  // we don't have a symmetric filter, so we need to handle two individual filters
1972 
1973  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1974  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1975 
1976  uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1977  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1978 
1979  uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1980  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1981 
1982  uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1983  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1984 
1985  uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1986  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
1987  }
1988  }
1989 
1990  // now we have 8 bit values in each 32 bit register
1991  uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1992 
1993  uint8x8_t result64 = vqmovn_u16(result128ab);
1994 
1995  vst1_u8(target, result64);
1996 }
1997 
1998 template <>
1999 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2000 {
2001  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2002  ocean_assert(filterSize % 2u == 1u);
2003 
2004  const unsigned int filterSize_2 = filterSize / 2u;
2005 
2006  // the border covers row ids within the range [0, filterSize_2)
2007  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2008 
2009  // we store one filter value in each of the four 32 bit integer values
2010  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
2011 
2012  // now we load four input values, and multiply each of them with the center kernel value
2013  float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
2014  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
2015 
2016  float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
2017  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
2018 
2019  // now we proceed with the remaining filter values
2020  for (unsigned int i = 1u; i <= filterSize_2; ++i)
2021  {
2022  // we determine the mirrored locations (and the row offset in relation to the current row)
2023  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2024  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2025 
2026  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2027  const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2028  const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2029 
2030  if (isSymmetric)
2031  {
2032  // we have a symmetric filter, so let's do some optimizations
2033  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
2034 
2035  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2036 
2037  source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
2038  source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
2039 
2040  result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
2041  result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
2042  }
2043  else
2044  {
2045  // we don't have a symmetric filter, so we need to handle two individual filters
2046 
2047  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
2048  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
2049 
2050  float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
2051  float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
2052 
2053  float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
2054  float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
2055 
2056  result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
2057  result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
2058 
2059  result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
2060  result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
2061  }
2062  }
2063 
2064  vst1q_f32(target + 0, result_32x4a);
2065  vst1q_f32(target + 4, result_32x4b);
2066 }
2067 
2068 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2069 
2070 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2071 
2072 template <>
2073 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2074 {
2075  /**
2076  * This function uses the following SSE instructions, and needs SSE2 or higher
2077  *
2078  * SSE2:
2079  * _mm_set1_epi32
2080  * _mm_unpacklo_epi8
2081  * _mm_unpackhi_epi16
2082  * _mm_setzero_si128
2083  * _mm_madd_epi16
2084  * _mm_add_epi32
2085  */
2086 
2087  // we store one filter value in each of the four 32 bit integer values
2088  __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2089 
2090  // we load four source values into the lower 32 bit of our 128 bit register
2091  __m128i source128 = _mm_set1_epi32(*((const int*)source));
2092 
2093  // we separate the source values to receive 16 bit integers
2094  source128 = _mm_unpacklo_epi8(source128, _mm_setzero_si128());
2095 
2096  // we separate the 16 bit values further so that we receive 32 bit integers
2097  source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2098 
2099  // we multiply each value with the same filter factor, and sum the result
2100  source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2101 
2102  // we add the local result to the sum parameters
2103  target = _mm_add_epi32(target, source128);
2104 }
2105 
2106 template <>
2107 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2108 {
2109  /**
2110  * This function uses the following SSE instructions, and needs SSE2 or higher
2111  *
2112  * SSE:
2113  * _mm_set_ps1
2114  * _mm_mul_ps
2115  * _mm_add_ps
2116  *
2117  * SSE2:
2118  * _mm_loadu_si128
2119  * _mm_castsi128_ps
2120  */
2121 
2122  // we store one filter value in each of the four 32 bit values
2123  __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2124 
2125  // we load 8 source values into two 128 bit registers
2126  __m128 source_32x4 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source));
2127 
2128  // we multiply each value with the same filter factor
2129  source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2130 
2131  // we add the local result to the sum parameters
2132  target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2133 }
2134 
2135 template <>
2136 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2137 {
2138  /**
2139  * This function uses the following SSE instructions, and needs SSE2 or higher
2140  *
2141  * SSE2:
2142  * _mm_set1_epi32
2143  * _mm_unpacklo_epi8
2144  * _mm_unpackhi_epi16
2145  * _mm_setzero_si128
2146  * _mm_madd_epi16
2147  * _mm_add_epi32
2148  */
2149 
2150  // we store one filter value in each of the four 32 bit integer values
2151  __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2152 
2153  // we load 4 source values from the left side and 4 source values from the right side, we separate the values to receive 16 bit integers and add them together
2154  __m128i source128 = _mm_add_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(*((const int*)sourceLeft)), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_set1_epi32(*((const int*)sourceRight)), _mm_setzero_si128()));
2155 
2156  // we separate the 16 bit values further so that we receive 32 bit integers
2157  source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2158 
2159  // we multiply each value with the same filter factor, and sum the result
2160  source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2161 
2162  // we add the local result to the sum parameters
2163  target = _mm_add_epi32(target, source128);
2164 }
2165 
2166 template <>
2167 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2168 {
2169  /**
2170  * This function uses the following SSE instructions, and needs SSE2 or higher
2171  *
2172  * SSE:
2173  * _mm_set_ps1
2174  * _mm_mul_ps
2175  * _mm_add_ps
2176  *
2177  * SSE2:
2178  * _mm_loadu_si128
2179  * _mm_castsi128_ps
2180  */
2181 
2182  // we store one filter value in each of the four 32 bit values
2183  __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2184 
2185  // we load 4 * 2 source values and add them together
2186  __m128 source_32x4 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight)));
2187 
2188  // we multiply each value with the same filter factor
2189  source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2190 
2191  // we add the local result to the sum parameters
2192  target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2193 }
2194 
2195 template <>
2196 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2197 {
2198  /**
2199  * This function uses the following SSE instructions, and needs SSE2 or higher
2200  *
2201  * SSE2:
2202  * _mm_set1_epi32
2203  * _mm_loadl_epi64
2204  * _mm_unpacklo_epi8
2205  * _mm_unpackhi_epi16
2206  * _mm_unpacklo_epi16
2207  * _mm_setzero_si128
2208  * _mm_madd_epi16
2209  * _mm_add_epi32
2210  */
2211 
2212  // we store one filter value in each of the four 32 bit integer values
2213  __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2214 
2215  // we load eight source values into the lower 64 bit of our 128 bit register
2216  __m128i source_32x4a = _mm_loadl_epi64((const __m128i*)source);
2217 
2218  // we separate the source values to receive 16 bit integers
2219  source_32x4a = _mm_unpacklo_epi8(source_32x4a, _mm_setzero_si128());
2220 
2221  // we separate the 16 bit values further so that we receive 32 bit integers
2222  __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2223  source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2224 
2225  // we multiply each value with the same filter factor, and sum the result
2226  source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2227  source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2228 
2229  // we add the local result to the sum parameters
2230  target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2231  target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2232 }
2233 
2234 template <>
2235 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2236 {
2237  /**
2238  * This function uses the following SSE instructions, and needs SSE2 or higher
2239  *
2240  * SSE:
2241  * _mm_set_ps1
2242  * _mm_mul_ps
2243  * _mm_add_ps
2244  *
2245  * SSE2:
2246  * _mm_loadu_si128
2247  * _mm_castsi128_ps
2248  */
2249 
2250  // we store one filter value in each of the four 32 bit values
2251  __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2252 
2253  // we load 8 source values into two 128 bit registers
2254  __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source + 0));
2255  __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source + 1));
2256 
2257  // we multiply each value with the same filter factor
2258  source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2259  source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2260 
2261  // we add the local result to the sum parameters
2262  target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2263  target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2264 }
2265 
2266 template <>
2267 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2268 {
2269  /**
2270  * This function uses the following SSE instructions, and needs SSE2 or higher
2271  *
2272  * SSE2:
2273  * _mm_set1_epi32
2274  * _mm_loadl_epi64
2275  * _mm_unpacklo_epi8
2276  * _mm_unpackhi_epi16
2277  * _mm_unpacklo_epi16
2278  * _mm_setzero_si128
2279  * _mm_madd_epi16
2280  * _mm_add_epi32
2281  */
2282 
2283  // we store one filter value in each of the four 32 bit integer values
2284  __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2285 
2286  // we load 8 source values from the left side and 8 source values from the right side, we separate the values to receive 16 bit integers and add them together
2287  __m128i source_32x4a = _mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)sourceLeft), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)sourceRight), _mm_setzero_si128()));
2288 
2289  // we separate the 16 bit values further so that we receive 32 bit integers
2290  __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2291  source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2292 
2293  // we multiply each value with the same filter factor, and sum the result
2294  source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2295  source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2296 
2297  // we add the local result to the sum parameters
2298  target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2299  target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2300 }
2301 
2302 template <>
2303 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2304 {
2305  /**
2306  * This function uses the following SSE instructions, and needs SSE2 or higher
2307  *
2308  * SSE:
2309  * _mm_set_ps1
2310  * _mm_mul_ps
2311  * _mm_add_ps
2312  *
2313  * SSE2:
2314  * _mm_loadu_si128
2315  * _mm_castsi128_ps
2316  */
2317 
2318  // we store one filter value in each of the four 32 bit values
2319  __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2320 
2321  // we load 4 * 2 source values and add them together
2322  __m128 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight + 0)));
2323  __m128 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight + 1)));
2324 
2325  // we multiply each value with the same filter factor
2326  source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2327  source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2328 
2329  // we add the local result to the sum parameters
2330  target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2331  target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2332 }
2333 
2334 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2335 
2336 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2337 
2338 template <>
2339 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2340 {
2341  ocean_assert(filterFactor <= 0xFFFFu);
2342 
2343  // we store the same filter value in each of the four 16 bit values
2344  const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2345 
2346 #if defined(__aarch64__)
2347 
2348  // we load four 8bit source values and we convert them to 16 bit values afterwards
2349  const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)source))));
2350 
2351 #else
2352 
2353  uint32_t sourceValue;
2354  ((uint8_t*)&sourceValue)[0] = source[0];
2355  ((uint8_t*)&sourceValue)[1] = source[1];
2356  ((uint8_t*)&sourceValue)[2] = source[2];
2357  ((uint8_t*)&sourceValue)[3] = source[3];
2358 
2359  const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValue)));
2360 
2361 #endif // __aarch64__
2362 
2363  // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2364  target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2365 }
2366 
2367 template <>
2368 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2369 {
2370  // we store the same filter value in each of the four 32 bit values
2371  const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2372 
2373  // we load four 32 bit source values
2374  const float32x4_t source128 = vld1q_f32(source);
2375 
2376  // we multiply each value with the same filter factor, and sum the result
2377  target_32x4 = vmlaq_f32(target_32x4, source128, filterFactor_32x4);
2378 }
2379 
2380 template <>
2381 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2382 {
2383  ocean_assert(filterFactor <= 0xFFFFu);
2384 
2385  // we store the same filter value in each of the four 16 bit values
2386  const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2387 
2388 #if defined(__aarch64__)
2389 
2390  // we load eight 8bit source values and we convert them to 16 bit values afterwards
2391  const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)sourceLeft))), vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)sourceRight))));
2392 
2393 #else
2394 
2395  uint32_t sourceValueLeft;
2396  ((uint8_t*)&sourceValueLeft)[0] = sourceLeft[0];
2397  ((uint8_t*)&sourceValueLeft)[1] = sourceLeft[1];
2398  ((uint8_t*)&sourceValueLeft)[2] = sourceLeft[2];
2399  ((uint8_t*)&sourceValueLeft)[3] = sourceLeft[3];
2400 
2401  uint32_t sourceValueRight;
2402  ((uint8_t*)&sourceValueRight)[0] = sourceRight[0];
2403  ((uint8_t*)&sourceValueRight)[1] = sourceRight[1];
2404  ((uint8_t*)&sourceValueRight)[2] = sourceRight[2];
2405  ((uint8_t*)&sourceValueRight)[3] = sourceRight[3];
2406 
2407  // we load eight 8bit source values and we convert them to 16 bit values afterwards
2408  const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValueLeft)), vreinterpret_u8_u32(vdup_n_u32(sourceValueRight)));
2409 
2410 #endif // __aarch64__
2411 
2412  // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2413  target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2414 }
2415 
2416 template <>
2417 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2418 {
2419  // we store the same filter value in each of the four 32 bit values
2420  const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2421 
2422  // we load eight 8bit source values and we convert them to 16 bit values afterwards
2423  const float32x4_t source_32x4 = vaddq_f32(vld1q_f32(sourceLeft), vld1q_f32(sourceRight));
2424 
2425  // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2426  target_32x4 = vmlaq_f32(target_32x4, source_32x4, filterFactor_32x4);
2427 }
2428 
2429 template <>
2430 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2431 {
2432  ocean_assert(filterFactor <= 0xFFFFu);
2433 
2434  // we store the same filter value in each of the four 16 bit values
2435  const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2436 
2437  // we load eight 8bit source values and we convert them to 16 bit values afterwards
2438  const uint16x8_t source16_8 = vmovl_u8(vld1_u8(source));
2439 
2440  // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2441  target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2442  target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2443 }
2444 
2445 template <>
2446 OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2447 {
2448  // we store the same filter value in each of the four 32 bit values
2449  const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2450 
2451  // we load eight 32 bit source values
2452  const float32x4_t source_32x4a = vld1q_f32(source + 0);
2453  const float32x4_t source_32x4b = vld1q_f32(source + 4);
2454 
2455  // we multiply each value with the same filter factor, and sum the result
2456  target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2457  target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
2458 }
2459 
2460 template <>
2461 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2462 {
2463  ocean_assert(filterFactor <= 0xFFFFu);
2464 
2465  // we store the same filter value in each of the four 16 bit values
2466  const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2467 
2468  // we load eight 8bit source values and we convert them to 16 bit values afterwards
2469  const uint16x8_t source16_8 = vaddl_u8(vld1_u8(sourceLeft), vld1_u8(sourceRight));
2470 
2471  // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2472  target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2473  target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2474 }
2475 
2476 template <>
2477 OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2478 {
2479  // we store the same filter value in each of the four 16 bit values
2480  const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2481 
2482  // we load eight 32 bit source values
2483  const float32x4_t source_32x4a = vaddq_f32(vld1q_f32(sourceLeft + 0), vld1q_f32(sourceRight + 0));
2484  const float32x4_t source_32x4b = vaddq_f32(vld1q_f32(sourceLeft + 4), vld1q_f32(sourceRight + 4));
2485 
2486  // we multiply each value with the same filter factor, and sum the result
2487  target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2488  target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
2489 }
2490 
2491 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2492 
2493 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
2494 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterHorizontalRowOneBlockWith4Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric)
2495 {
2496  /*
2497  * We determine 4 filter responses within one loop iteration.
2498  * For a filter with size 5 for 1 channel frames we apply the following strategy:
2499  *
2500  * Source Data: Y Y Y Y Y Y Y Y Y (if the source data has a Y8 pixel format)
2501  * 1 4 6 4 1 .
2502  * 1 4 6 4 1
2503  * 1 4 6 4 1
2504  * . 1 4 6 4 1
2505  * . .
2506  * Target Data: - - Y Y Y Y - -
2507  *
2508  *
2509  * For a filter with size 5 for 3 channel frames we apply the following strategy:
2510  *
2511  * Source Data: R G B R G B R G B R G B R G B R G B R G B R G B (if the source data has a RGB24 pixel format)
2512  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2513  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2514  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2515  * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2516  * . . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2517  * . .
2518  * Target Data: - - - - - - R G B R - - - - - - - - - - - - - - - -
2519  *
2520  */
2521 
2522  ocean_assert(source != nullptr && filter != nullptr);
2523  ocean_assert(channels >= 1u);
2524  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2525 
2526  typename SIMD32x4<TFilter>::Type target_32x4;
2527 
2528  setSIMDZero<TFilter, tProcessorInstructions>(target_32x4);
2529 
2530  if (isSymmetric)
2531  {
2532  const unsigned int filterSize_2 = filterSize / 2u;
2533 
2534  // we iterate over the first half of filter factors [0, filterSize_2)
2535  for (unsigned int n = 0u; n < filterSize_2; ++n)
2536  {
2537  symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, filter[n], target_32x4);
2538  }
2539 
2540  // we handle the center filter factor at filterSize_2
2541  asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, filter[filterSize_2], target_32x4);
2542  }
2543  else
2544  {
2545  // we iterate over the first half of filter factors [0, filterSize_2)
2546  for (unsigned int n = 0u; n < filterSize; ++n)
2547  {
2548  asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, filter[n], target_32x4);
2549  }
2550  }
2551 
2552  writeSIMD<TFilter, tProcessorInstructions>(target_32x4, target);
2553 }
2554 
2555 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
2556 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterHorizontalRowOneBlockWith8Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric)
2557 {
2558  /*
2559  * We determine 8 filter responses within one loop iteration.
2560  * For a filter with size 5 for 1 channel frames we apply the following strategy:
2561  *
2562  * Source Data: Y Y Y Y Y Y Y Y Y Y Y Y (if the source data has a Y8 pixel format)
2563  * 1 4 6 4 1 .
2564  * 1 4 6 4 1 .
2565  * 1 4 6 4 1 .
2566  * . 1 4 6 4 1 .
2567  * . 1 4 6 4 1 .
2568  * . 1 4 6 4 1
2569  * . 1 4 6 4 1
2570  * . 1 4 6 4 1
2571  * . .
2572  * Target Data: - - Y Y Y Y Y Y Y Y - -
2573  *
2574  *
2575  * For a filter with size 5 for 3 channel frames we apply the following strategy:
2576  *
2577  * Source Data: R G B R G B R G B R G B R G B R G B R G B R G B (if the source data has a RGB24 pixel format)
2578  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2579  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2580  * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2581  * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2582  * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2583  * . . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2584  * . .
2585  * Target Data: - - - - - - R G B R G B R G - - - - - - - - - - - - - - - -
2586  *
2587  */
2588 
2589  ocean_assert(source != nullptr && filter != nullptr);
2590  ocean_assert(channels >= 1u);
2591  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2592 
2593  typename SIMD32x4<TFilter>::Type target_32x4a, target_32x4b;
2594 
2595  setSIMDZero<TFilter, tProcessorInstructions>(target_32x4a);
2596  setSIMDZero<TFilter, tProcessorInstructions>(target_32x4b);
2597 
2598  if (isSymmetric)
2599  {
2600  const unsigned int filterSize_2 = filterSize / 2u;
2601 
2602  // we iterate over the first half of filter factors [0, filterSize_2)
2603  for (unsigned int n = 0u; n < filterSize_2; ++n)
2604  {
2605  symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, filter[n], target_32x4a, target_32x4b);
2606  }
2607 
2608  // we handle the center filter factor at filterSize_2
2609  asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, filter[filterSize_2], target_32x4a, target_32x4b);
2610  }
2611  else
2612  {
2613  // we iterate over the first half of filter factors [0, filterSize_2)
2614  for (unsigned int n = 0u; n < filterSize; ++n)
2615  {
2616  asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, filter[n], target_32x4a, target_32x4b);
2617  }
2618  }
2619 
2620  writeSIMD<TFilter, tProcessorInstructions>(target_32x4a, target + 0);
2621  writeSIMD<TFilter, tProcessorInstructions>(target_32x4b, target + 4);
2622 }
2623 
2624 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2625 
2626 template <>
2627 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2628 {
2629  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2630  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2631 
2632  /*
2633  * This function uses the following SSE instructions, and needs SSE2 or higher
2634  *
2635  * SSE1:
2636  * _mm_set_ps1
2637  * _mm_mul_ps
2638  * _mm_add_ps
2639  * _mm_loadu_ps
2640  *
2641  * SSE2:
2642  * _mm_loadu_si128
2643  * _mm_cvtepi32_ps
2644  * _mm_add_epi32
2645  * _mm_cvtps_epi32
2646  * _mm_packs_epi32
2647  * _mm_packus_epi16
2648  * _mm_storeu_si128
2649  */
2650 
2651  /*
2652  * We determine 16 filter responses within one loop iteration.
2653  * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2654  *
2655  * Source Data:
2656  * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2657  * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2658  * ---------------------------------
2659  * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2660  * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2661  * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2662  * 3 Y
2663  * 4 Y
2664  *
2665  * For frames with n channels the strategy stays the same.
2666  */
2667 
2668  const unsigned int filterSize_2 = filterSize / 2u;
2669 
2670  // the border covers row ids within the range [0, filterSize_2)
2671  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2672 
2673  const __m128i* sourceBlock = (const __m128i*)source;
2674 
2675  // we store one filter value in each of the four 32 bit integer values
2676  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
2677 
2678  // now we load four input values, and multiply each of them with the center kernel value
2679  __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
2680  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2681 
2682  // now we load the next four input values, ...
2683  __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
2684  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2685 
2686  __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
2687  __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2688 
2689  __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
2690  __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
2691 
2692  __m128i source128ai, source128bi;
2693 
2694  // now we proceed with the remaining filter values
2695  for (unsigned int i = 1u; i <= filterSize_2; ++i)
2696  {
2697  // we determine the mirrored locations (and the row offset in relation to the current row)
2698  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2699  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2700 
2701  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2702  const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2703  const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2704 
2705  if (isSymmetric)
2706  {
2707  // we have a symmetric filter, so let's do some optimizations
2708  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
2709 
2710  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2711 
2712  source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 0), _mm_loadu_si128((__m128i*)sourcePlus + 0));
2713  source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 1), _mm_loadu_si128((__m128i*)sourcePlus + 1));
2714 
2715  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2716  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
2717 
2718  source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 2), _mm_loadu_si128((__m128i*)sourcePlus + 2));
2719  source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 3), _mm_loadu_si128((__m128i*)sourcePlus + 3));
2720 
2721  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2722  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
2723  }
2724  else
2725  {
2726  // we don't have a symmetric filter, so we need to handle two individual filters
2727  __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
2728  __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
2729 
2730  __m128i source128aiMinus = _mm_loadu_si128((__m128i*)sourceMinus + 0);
2731  __m128i source128aiPlus = _mm_loadu_si128((__m128i*)sourcePlus + 0);
2732 
2733  __m128i source128biMinus = _mm_loadu_si128((__m128i*)sourceMinus + 1);
2734  __m128i source128biPlus = _mm_loadu_si128((__m128i*)sourcePlus + 1);
2735 
2736  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
2737  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
2738 
2739  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
2740  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
2741 
2742  __m128i source128ciMinus = _mm_loadu_si128((__m128i*)sourceMinus + 2);
2743  __m128i source128ciPlus = _mm_loadu_si128((__m128i*)sourcePlus + 2);
2744 
2745  __m128i source128diMinus = _mm_loadu_si128((__m128i*)sourceMinus + 3);
2746  __m128i source128diPlus = _mm_loadu_si128((__m128i*)sourcePlus + 3);
2747 
2748  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
2749  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
2750 
2751  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
2752  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
2753  }
2754  }
2755 
2756  // now we have 8 bit values in each 32 bit register
2757  __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
2758  __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
2759  __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
2760 
2761  _mm_storeu_si128((__m128i*)target, result128);
2762 }
2763 
2764 template <>
2765 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2766 {
2767  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2768  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2769 
2770  /**
2771  * This function uses the following SSE instructions, and needs SSE2 or higher
2772  *
2773  * SSE:
2774  * _mm_set_ps1
2775  * _mm_mul_ps
2776  * _mm_add_ps
2777  *
2778  * SSE2:
2779  * _mm_loadu_si128
2780  * _mm_castsi128_ps
2781  */
2782 
2783  /*
2784  * We determine 16 filter responses within one loop iteration.
2785  * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2786  *
2787  * Source Data:
2788  * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2789  * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2790  * ---------------------------------
2791  * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2792  * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2793  * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2794  * 3 Y
2795  * 4 Y
2796  *
2797  * For frames with n channels the strategy stays the same.
2798  */
2799 
2800  const unsigned int filterSize_2 = filterSize / 2u;
2801 
2802  // the border covers row ids within the range [0, filterSize_2)
2803  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2804 
2805  const __m128i* sourceBlock = (const __m128i*)source;
2806 
2807  // we store one filter value in each of the four 32 bit values
2808  __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
2809 
2810  // now we load four input values, and multiply each of them with the center kernel value
2811  __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
2812  __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2813 
2814  // now we load the next four input values, ...
2815  __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
2816  __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2817 
2818  __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
2819  __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2820 
2821  __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
2822  __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
2823 
2824  // now we proceed with the remaining filter values
2825  for (unsigned int i = 1u; i <= filterSize_2; ++i)
2826  {
2827  // we determine the mirrored locations (and the row offset in relation to the current row)
2828  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2829  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2830 
2831  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2832  const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2833  const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2834 
2835  if (isSymmetric)
2836  {
2837  // we have a symmetric filter, so let's do some optimizations
2838  filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
2839 
2840  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2841 
2842  source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0)));
2843  source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1)));
2844 
2845  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
2846  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
2847 
2848  source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 2)));
2849  source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 3)));
2850 
2851  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
2852  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
2853  }
2854  else
2855  {
2856  // we don't have a symmetric filter, so we need to handle two individual filters
2857  __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
2858  __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
2859 
2860  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0));
2861  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
2862 
2863  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1));
2864  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
2865 
2866  source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 2));
2867  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
2868 
2869  source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 3));
2870  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
2871 
2872  source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0));
2873  result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
2874 
2875  source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1));
2876  result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
2877 
2878  source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 2));
2879  result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
2880 
2881  source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 3));
2882  result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
2883  }
2884  }
2885 
2886  writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
2887  writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
2888  writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
2889  writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
2890 }
2891 
2892 #endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2893 
2894 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2895 
2896 template <>
2897 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2898 {
2899  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2900  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2901 
2902  /*
2903  * We determine 16 filter responses within one loop iteration.
2904  * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2905  *
2906  * Source Data:
2907  * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2908  * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2909  * ---------------------------------
2910  * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2911  * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2912  * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2913  * 3 Y
2914  * 4 Y
2915  *
2916  * For frames with n channels the strategy stays the same.
2917  */
2918 
2919  const unsigned int filterSize_2 = filterSize / 2u;
2920 
2921  // the border covers row ids within the range [0, filterSize_2)
2922  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2923 
2924  // we store one filter value in each of the four 32 bit integer values
2925  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
2926 
2927  // now we load four input values, and multiply each of them with the center kernel value
2928  float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
2929  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
2930 
2931  float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
2932  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
2933 
2934  float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
2935  float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
2936 
2937  float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
2938  float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
2939 
2940  // now we proceed with the remaining filter values
2941  for (unsigned int i = 1u; i <= filterSize_2; ++i)
2942  {
2943  // we determine the mirrored locations (and the row offset in relation to the current row)
2944  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2945  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2946 
2947  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2948  const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2949  const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2950 
2951  if (isSymmetric)
2952  {
2953  // we have a symmetric filter, so let's do some optimizations
2954  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
2955 
2956  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2957 
2958  uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
2959  uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
2960 
2961  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2962  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
2963 
2964  source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
2965  source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
2966 
2967  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2968  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
2969  }
2970  else
2971  {
2972  // we don't have a symmetric filter, so we need to handle two individual filters
2973 
2974  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
2975  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
2976 
2977  uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
2978  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
2979 
2980  uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
2981  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
2982 
2983  uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
2984  result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
2985 
2986  uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
2987  result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
2988 
2989  uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
2990  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
2991 
2992  uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
2993  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
2994 
2995  uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
2996  result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
2997 
2998  uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
2999  result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
3000  }
3001  }
3002 
3003  // now we have 8 bit values in each 32 bit register
3004  uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
3005  uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
3006 
3007  uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
3008 
3009  vst1q_u8(target, result128);
3010 }
3011 
3012 template <>
3013 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
3014 {
3015  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3016  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3017 
3018  /*
3019  * We determine 16 filter responses within one loop iteration.
3020  * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
3021  *
3022  * Source Data:
3023  * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3024  * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
3025  * ---------------------------------
3026  * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
3027  * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
3028  * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3029  * 3 Y
3030  * 4 Y
3031  *
3032  * For frames with n channels the strategy stays the same.
3033  */
3034 
3035  const unsigned int filterSize_2 = filterSize / 2u;
3036 
3037  // the border covers row ids within the range [0, filterSize_2)
3038  ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
3039 
3040  // we store one filter value in each of the four 32 bit integer values
3041  float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
3042 
3043  // now we load four input values, and multiply each of them with the center kernel value
3044  float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
3045  float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
3046 
3047  float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
3048  float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
3049 
3050  float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
3051  float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
3052 
3053  float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
3054  float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
3055 
3056  // now we proceed with the remaining filter values
3057  for (unsigned int i = 1u; i <= filterSize_2; ++i)
3058  {
3059  // we determine the mirrored locations (and the row offset in relation to the current row)
3060  const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
3061  const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
3062 
3063  // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
3064  const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
3065  const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
3066 
3067  if (isSymmetric)
3068  {
3069  // we have a symmetric filter, so let's do some optimizations
3070  filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
3071 
3072  // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
3073 
3074  source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
3075  source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
3076 
3077  result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
3078  result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
3079 
3080  source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
3081  source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
3082 
3083  result_32x4c = vmlaq_f32(result_32x4c, source_32x4a, filterFactor_32x4);
3084  result_32x4d = vmlaq_f32(result_32x4d, source_32x4b, filterFactor_32x4);
3085  }
3086  else
3087  {
3088  // we don't have a symmetric filter, so we need to handle two individual filters
3089 
3090  float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
3091  float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
3092 
3093  float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
3094  float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
3095 
3096  float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
3097  float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
3098 
3099  result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
3100  result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
3101 
3102  result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
3103  result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
3104 
3105  source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
3106  source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
3107 
3108  source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
3109  source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
3110 
3111  result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
3112  result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
3113 
3114  result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
3115  result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
3116  }
3117  }
3118 
3119  vst1q_f32(target + 0, result_32x4a);
3120  vst1q_f32(target + 4, result_32x4b);
3121  vst1q_f32(target + 8, result_32x4c);
3122  vst1q_f32(target + 12, result_32x4d);
3123 }
3124 
3125 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
3126 
3127 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
3128 OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned height, const unsigned int channels, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
3129 {
3130  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3131  ocean_assert(channels >= 1u);
3132  ocean_assert(filterSize <= height);
3133  ocean_assert(filterSize % 2u == 1u);
3134 
3135  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3136 
3137  unsigned int remainingElements = width * channels;
3138 
3139  while (remainingElements >= 16u)
3140  {
3141  filterVerticalBorderRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3142 
3143  source += 16;
3144  target += 16;
3145 
3146  remainingElements -= 16u;
3147  }
3148 
3149  while (remainingElements >= 8u)
3150  {
3151  filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3152 
3153  source += 8;
3154  target += 8;
3155 
3156  remainingElements -= 8u;
3157  }
3158 
3159  ocean_assert(width * channels >= 8u);
3160  ocean_assert(remainingElements < 8u);
3161 
3162  if (remainingElements != 0u)
3163  {
3164  const unsigned int shift = 8u - remainingElements;
3165 
3166  filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3167  }
3168 }
3169 
3170 template <typename TSource, typename TFilter, const ProcessorInstructions tProcessorInstructions>
3171 void FrameFilterSeparable::filterHorizontalSubset(const TSource* source, TFilter* target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3172 {
3173  ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3174  ocean_assert(width >= filterSize + 1u);
3175 
3176  ocean_assert(channels >= 1u && channels <= 8u);
3177  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3178 
3179  ocean_assert_and_suppress_unused(firstRow + numberRows <= height, height);
3180 
3181  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3182  const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3183 
3184  const bool isSymmetric = isFilterSymmetric(filter, filterSize);
3185 
3186  const unsigned int filterSize_2 = filterSize / 2u;
3187  const unsigned int extraPixels = filterSize_2 * 2u;
3188 
3189  const unsigned int extendedElements = (width + extraPixels) * channels;
3190 
3191  Memory extendedRowMemory = Memory::create<TSource>(extendedElements);
3192  TSource* const extendedRow = extendedRowMemory.data<TSource>();
3193  ocean_assert(extendedRow != nullptr);
3194 
3195  source += firstRow * sourceStrideElements;
3196  target += firstRow * targetStrideElements;
3197 
3198  for (unsigned int rowsProcessed = 0u; rowsProcessed < numberRows; ++rowsProcessed)
3199  {
3200  // we create an intermediate row with extended pixels left and right
3201  fillLeftExtraBorder<TSource>(source, channels, filterSize_2, extendedRow);
3202  memcpy(extendedRow + filterSize_2 * channels, source, width * channels * sizeof(TSource));
3203  fillRightExtraBorder<TSource>(source + width * channels, channels, filterSize_2, extendedRow + (width + filterSize_2) * channels);
3204 
3205  const TSource* extendedSource = extendedRow;
3206 
3207  unsigned int remainingElements = width * channels;
3208 
3209 #if (defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10) || (defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20)
3210 
3211 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3212  const ProcessorInstructions instructions = ProcessorInstructions(PI_NEON & tProcessorInstructions);
3213 #elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
3214  const ProcessorInstructions instructions = ProcessorInstructions(PI_SSE_2 & tProcessorInstructions);
3215 #endif
3216 
3217  // now we apply 8-block-elements as long as they fit into the frame
3218 
3219  while (remainingElements >= 8u)
3220  {
3221  filterHorizontalRowOneBlockWith8Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3222 
3223  extendedSource += 8;
3224  target += 8;
3225 
3226  remainingElements -= 8u;
3227  }
3228 
3229  // now we apply 4-block-elements as long as they fit into the frame
3230 
3231  while (remainingElements >= 4u)
3232  {
3233  filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3234 
3235  extendedSource += 4;
3236  target += 4;
3237 
3238  remainingElements -= 4u;
3239  }
3240 
3241  // finally, we check whether we have 1-3 elements left; in this case, we simply process some elements another time
3242 
3243  if (remainingElements != 0u)
3244  {
3245  const unsigned int shift = 4u - remainingElements;
3246 
3247  extendedSource -= shift;
3248  target -= shift;
3249 
3250  filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3251 
3252  // we do not need to shift extendedSource += 4
3253  target += 4u;
3254  }
3255 
3256 #else
3257 
3258  OCEAN_SUPPRESS_UNUSED_WARNING(extendedSource);
3259  OCEAN_SUPPRESS_UNUSED_WARNING(remainingElements);
3260  OCEAN_SUPPRESS_UNUSED_WARNING(isSymmetric);
3261 
3262 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10 || OCEAN_HARDWARE_SSE_VERSION >= 20
3263 
3264 #ifdef OCEAN_INTENSIVE_DEBUG
3265  {
3266  const TFilter* const debugTarget = target - width * channels;
3267 
3268  for (unsigned int x = 0u; x < width; ++x)
3269  {
3270  for (unsigned int n = 0u; n < channels; ++n)
3271  {
3272  float result = 0.0f;
3273 
3274  for (int xx = -int(filterSize_2); xx <= int(filterSize_2); ++xx)
3275  {
3276  const unsigned int mirroredXX = (x < filterSize_2) ? mirroredBorderLocationLeft(int(x) + xx) : mirroredBorderLocationRight((unsigned int)(int(x) + xx), width);
3277  result += float(*(source + mirroredXX * channels + int(n))) * filter[xx + int(filterSize_2)];
3278  }
3279 
3280  const TFilter targetValue = debugTarget[x * channels + n];
3281 
3282  if (std::is_same<float, TFilter>::value)
3283  {
3284  ocean_assert(NumericT<TFilter>::isWeakEqual(result, targetValue));
3285  }
3286  else
3287  {
3288  const TFilter result8_converted = (TFilter)(result);
3289  const TFilter result8_rounded = (TFilter)(result + 0.51f);
3290  ocean_assert(result8_converted == targetValue || result8_rounded == targetValue);
3291  }
3292  }
3293  }
3294  }
3295 #endif
3296 
3297  source += sourceStrideElements;
3298  target += targetPaddingElements;
3299  }
3300 }
3301 
3302 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
3303 void FrameFilterSeparable::filterVerticalSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3304 {
3305  ocean_assert(source != nullptr && target != nullptr);
3306  ocean_assert(filter != nullptr);
3307  ocean_assert(height >= filterSize / 2u + 1u);
3308  ocean_assert(channels >= 1u && channels <= 8u);
3309 
3310  ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3311 
3312  ocean_assert(firstRow + numberRows <= height);
3313  ocean_assert(width * channels >= 8u * 2u);
3314 
3315  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3316  const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3317 
3318  const bool isSymmetric = isFilterSymmetric(filter, filterSize);
3319 
3320  const unsigned int filterSize_2 = filterSize / 2u;
3321 
3322 #ifdef OCEAN_INTENSIVE_DEBUG
3323  const TSource* const debugSource = source;
3324 #endif
3325 
3326  source += firstRow * sourceStrideElements;
3327  target += firstRow * targetStrideElements;
3328 
3329  unsigned int row = firstRow;
3330 
3331  // first we check whether we are located at the top border, whether we start within the first filterSize_2 rows
3332 
3333  while (row < min(firstRow + numberRows, filterSize_2))
3334  {
3335  filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, filter, filterSize, isSymmetric, sourcePaddingElements);
3336 
3337 #ifdef OCEAN_INTENSIVE_DEBUG
3338  {
3339  for (unsigned int x = 0u; x < width * channels; ++x)
3340  {
3341  float result = 0.0f;
3342 
3343  for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3344  {
3345  const unsigned int mirroredY = mirroredBorderLocationLeft(int(row) + y);
3346  result += float(*(debugSource + mirroredY * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3347  }
3348 
3349  const TTarget targetValue = target[x];
3350 
3351  if (std::is_same<float, TTarget>::value)
3352  {
3353  ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3354  }
3355  else
3356  {
3357  ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3358  }
3359  }
3360  }
3361 #endif
3362 
3363  source += sourceStrideElements;
3364  target += targetStrideElements;
3365 
3366  ++row;
3367  }
3368 
3369  // now we proceed the rows not located at the top or bottom border of the frame
3370 
3371  while (row < min(firstRow + numberRows, height - filterSize_2))
3372  {
3373  filterVerticalCoreRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, channels, filter, filterSize, isSymmetric, sourcePaddingElements);
3374 
3375 #ifdef OCEAN_INTENSIVE_DEBUG
3376  {
3377  for (unsigned int x = 0u; x < width * channels; ++x)
3378  {
3379  float result = 0.0f;
3380 
3381  for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3382  result += float(*(debugSource + (int(row) + y) * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3383 
3384  const TTarget targetValue = target[x];
3385 
3386  ocean_assert(result >= 0.0f && result < 256.0f);
3387 
3388  if (std::is_same<float, TTarget>::value)
3389  {
3390  ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3391  }
3392  else
3393  {
3394  ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3395  }
3396  }
3397  }
3398 #endif
3399 
3400  source += sourceStrideElements;
3401  target += targetStrideElements;
3402 
3403  ++row;
3404  }
3405 
3406  // now we check whether we are located at the bottom border, whether we start within the last filterSize_2 rows (or need to process them)
3407 
3408  while (row < firstRow + numberRows)
3409  {
3410  ocean_assert(row + filterSize_2 >= height);
3411 
3412  filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, filter, filterSize, isSymmetric, sourcePaddingElements);
3413 
3414 #ifdef OCEAN_INTENSIVE_DEBUG
3415  {
3416  // we do not check the left and right corner, we simply check the middle block of the upper border
3417  for (unsigned int x = 0u; x < width * channels; ++x)
3418  {
3419  float result = 0.0f;
3420 
3421  for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3422  {
3423  const unsigned int mirroredY = mirroredBorderLocationRight((unsigned int)(int(row) + y), height);
3424  result += float(*(debugSource + mirroredY * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3425  }
3426 
3427  const TTarget targetValue = target[x];
3428 
3429  ocean_assert(result >= 0.0f && result < 256.0f);
3430 
3431  if (std::is_same<float, TTarget>::value)
3432  {
3433  ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3434  }
3435  else
3436  {
3437  ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3438  }
3439  }
3440  }
3441 #endif
3442 
3443  source += sourceStrideElements;
3444  target += targetStrideElements;
3445 
3446  ++row;
3447  }
3448 }
3449 
3450 template <typename T, typename TFilter, ProcessorInstructions tProcessorInstructions>
3451 inline void FrameFilterSeparable::filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, ReusableMemory* reusableMemory, Worker* worker)
3452 {
3453  Frame localIntermediateFrame;
3454  Frame* intermediateFrame = &localIntermediateFrame;
3455 
3456  if (reusableMemory != nullptr)
3457  {
3458  intermediateFrame = &reusableMemory->intermediateFrame_;
3459  }
3460 
3461  intermediateFrame->set(FrameType(width, height, FrameType::genericPixelFormat<TFilter>(channels), FrameType::ORIGIN_UPPER_LEFT), false /*forceOwner*/, true /*forceWritable*/);
3462 
3463  // first we apply the horizontal filtering
3464 
3465  if (worker)
3466  {
3467  worker->executeFunction(Worker::Function::createStatic(&filterHorizontalSubset<T, TFilter, tProcessorInstructions>, source, intermediateFrame->data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->paddingElements(), 0u, 0u), 0u, height);
3468  }
3469  else
3470  {
3471  filterHorizontalSubset<T, TFilter, tProcessorInstructions>(source, intermediateFrame->data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->paddingElements(), 0u, height);
3472  }
3473 
3474  // now we apply the vertical filtering
3475  // therefore, we first need to calculate the floating point filter functions (in case we use integer factors)
3476 
3477  std::vector<float> localFloatFilters;
3478  const float* verticalFloatFilter = nullptr;
3479 
3480  if (std::is_same<TFilter, float>::value)
3481  {
3482  verticalFloatFilter = (const float*)(verticalFilter);
3483  }
3484  else
3485  {
3486  ocean_assert((std::is_same<TFilter, unsigned int>::value));
3487 
3488  const TFilter sumHorizontalFilterValues = sumFilterValues(horizontalFilter, horizontalFilterSize);
3489  const TFilter sumVerticalFilterValues = sumFilterValues(verticalFilter, verticalFilterSize);
3490 
3491  const unsigned int normalizationFactor = (unsigned int)(sumHorizontalFilterValues) * (unsigned int)(sumVerticalFilterValues);
3492  ocean_assert(normalizationFactor != 0u);
3493 
3494  const float invNormalizationFactor = 1.0f / float(normalizationFactor);
3495 
3496  std::vector<float>& floatFilterBufferToUse = reusableMemory != nullptr ? reusableMemory->filterFactors_ : localFloatFilters;
3497 
3498  floatFilterBufferToUse.resize(verticalFilterSize);
3499 
3500  for (unsigned int n = 0u; n < verticalFilterSize; ++n)
3501  {
3502  floatFilterBufferToUse[n] = float(verticalFilter[n]) * invNormalizationFactor;
3503  }
3504 
3505  verticalFloatFilter = floatFilterBufferToUse.data();
3506  }
3507 
3508  if (worker)
3509  {
3510  worker->executeFunction(Worker::Function::createStatic(&filterVerticalSubset<TFilter, T, tProcessorInstructions>, intermediateFrame->constdata<TFilter>(), target, width, height, channels, (const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
3511  }
3512  else
3513  {
3514  filterVerticalSubset<TFilter, T, tProcessorInstructions>(intermediateFrame->constdata<TFilter>(), target, width, height, channels, (const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->paddingElements(), targetPaddingElements, 0u, height);
3515  }
3516 }
3517 
3518 template <typename T, typename TFilter>
3519 bool FrameFilterSeparable::filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, Worker* worker, ReusableMemory* reusableMemory, const ProcessorInstructions processorInstructions)
3520 {
3521  ocean_assert(source != nullptr && target != nullptr);
3522  ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
3523  ocean_assert(channels >= 1u);
3524 
3525  if (source == nullptr || target == nullptr || width < horizontalFilterSize || height < verticalFilterSize || channels == 0u)
3526  {
3527  return false;
3528  }
3529 
3530  OCEAN_SUPPRESS_UNUSED_WARNING(reusableMemory);
3531 
3532  if (width * channels >= 16u && width >= horizontalFilterSize + 1u)
3533  {
3534  switch (Processor::bestInstructionGroup<false>(processorInstructions))
3535  {
3537  // temporary disabled: OCEAN_APPLY_IF_AVX((filter<T, TFilter, PI_GROUP_AVX_2_SSE_4_1>(source, target, width, height, channels, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, worker)));
3538  case PI_GROUP_SSE_4_1:
3539  case PI_GROUP_AVX_2_SSE_2:
3540  case PI_GROUP_SSE_2:
3541  OCEAN_APPLY_IF_SSE((filter<T, TFilter, PI_SSE_2>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3542  return true;
3543 
3544  case PI_GROUP_NEON:
3545  OCEAN_APPLY_IF_NEON((filter<T, TFilter, PI_GROUP_NEON>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3546  return true;
3547 
3548  case PI_NONE:
3549  break;
3550 
3551  default:
3552  ocean_assert(false && "Invalid instructions!");
3553  }
3554  }
3555 
3556  if constexpr (std::is_same<float, TFilter>::value)
3557  {
3558  filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, (const float*)(horizontalFilter), horizontalFilterSize, (const float*)(verticalFilter), verticalFilterSize, worker);
3559  return true;
3560  }
3561  else
3562  {
3563  if constexpr (std::is_same<unsigned int, TFilter>::value)
3564  {
3565  const TFilter horizontalNormalization = sumFilterValues(horizontalFilter, horizontalFilterSize);
3566  ocean_assert(horizontalNormalization != TFilter(0));
3567 
3568  std::vector<float> horizontalFloatFilter(horizontalFilterSize);
3569  for (size_t n = 0; n < horizontalFloatFilter.size(); ++n)
3570  {
3571  horizontalFloatFilter[n] = float(horizontalFilter[n]) / float(horizontalNormalization);
3572  }
3573 
3574  const TFilter verticalNormalization = sumFilterValues(verticalFilter, verticalFilterSize);
3575  ocean_assert(verticalNormalization != TFilter(0));
3576 
3577  std::vector<float> verticalFloatFilter(verticalFilterSize);
3578  for (size_t n = 0; n < verticalFloatFilter.size(); ++n)
3579  {
3580  verticalFloatFilter[n] = float(verticalFilter[n]) / float(verticalNormalization);
3581  }
3582 
3583  return filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFloatFilter.data(), (unsigned int)horizontalFloatFilter.size(), verticalFloatFilter.data(), (unsigned int)verticalFloatFilter.size(), worker);
3584  }
3585  }
3586 
3587  ocean_assert(false && "Invalid combination of parameters!");
3588  return false;
3589 }
3590 
3591 template <typename T>
3592 bool FrameFilterSeparable::filterUniversal(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float* horizontalFilter, const unsigned int horizontalFilterSize, const float* verticalFilter, const unsigned int verticalFilterSize, Worker* worker)
3593 {
3594  ocean_assert(source != nullptr && target != nullptr);
3595  ocean_assert(width >= 1u && height >= 1u);
3596  ocean_assert(channels != 0u);
3597 
3598  ocean_assert(horizontalFilter != nullptr && verticalFilter != nullptr);
3599  ocean_assert(horizontalFilterSize % 2u == 1u);
3600  ocean_assert(verticalFilterSize % 2u == 1u);
3601 
3602  if (source == nullptr || target == nullptr
3603  || verticalFilter == nullptr || horizontalFilter == nullptr
3604  || horizontalFilterSize > width || verticalFilterSize > height
3605  || horizontalFilterSize % 2u != 1u || verticalFilterSize % 2u != 1u)
3606  {
3607  return false;
3608  }
3609 
3610  typedef typename FloatTyper<T>::Type TIntermediate;
3611 
3612  Frame intermediateFrame(FrameType(width, height, FrameType::genericPixelFormat<TIntermediate>(channels), FrameType::ORIGIN_UPPER_LEFT));
3613 
3614  if (worker)
3615  {
3616  worker->executeFunction(Worker::Function::createStatic(&filterUniversalHorizontalSubset<T, TIntermediate>, source, intermediateFrame.data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.paddingElements(), 0u, 0u), 0u, height);
3617  worker->executeFunction(Worker::Function::createStatic(&filterUniversalVerticalSubset<T, TIntermediate>, intermediateFrame.constdata<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
3618  }
3619  else
3620  {
3621  filterUniversalHorizontalSubset<T, TIntermediate>(source, intermediateFrame.data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.paddingElements(), 0u, height);
3622  filterUniversalVerticalSubset<T, TIntermediate>(intermediateFrame.data<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.paddingElements(), targetPaddingElements, 0u, height);
3623  }
3624 
3625  return true;
3626 }
3627 
3628 template <typename T, typename TIntermediate>
3629 void FrameFilterSeparable::filterUniversalHorizontalSubset(const T* source, TIntermediate* target, const unsigned int width, unsigned int channels, const float* horizontalFilter, unsigned int horizontalFilterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3630 {
3631  ocean_assert(source != nullptr && target != nullptr);
3632  ocean_assert(width >= 1u);
3633  ocean_assert(channels != 0u);
3634 
3635  ocean_assert(horizontalFilterSize <= size_t(width));
3636  ocean_assert(horizontalFilterSize % 2u == 1u);
3637 
3638  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3639  const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3640 
3641  const unsigned int filterSize = horizontalFilterSize;
3642  const unsigned int filterSize_2 = filterSize / 2u;
3643  ocean_assert(filterSize_2 * 2u <= width);
3644 
3645  std::vector<TIntermediate> filterCopy;
3646 
3647  if (!std::is_same<TIntermediate, float>::value)
3648  {
3649  filterCopy.resize(horizontalFilterSize);
3650  for (size_t n = 0; n < filterCopy.size(); ++n)
3651  {
3652  filterCopy[n] = TIntermediate(horizontalFilter[n]);
3653  }
3654  }
3655 
3656  const TIntermediate* const filter = filterCopy.empty() ? (const TIntermediate*)horizontalFilter : filterCopy.data();
3657 
3658  source += firstRow * sourceStrideElements;
3659  target += firstRow * targetStrideElements;
3660 
3661  TIntermediate* const targetEnd = target + numberRows * targetStrideElements;
3662 
3663  while (target != targetEnd)
3664  {
3665  ocean_assert(target < targetEnd);
3666 
3667  // left border: [0, filterSize_2 - 1]
3668 
3669  for (unsigned int x = 0u; x < filterSize_2; ++x)
3670  {
3671  for (unsigned int n = 0u; n < channels; ++n)
3672  {
3673  TIntermediate response = TIntermediate(source[channels * mirroredBorderLocationLeft(-int(filterSize_2) + int(x)) + n]) * filter[0];
3674 
3675  for (unsigned int s = 1u; s < filterSize; ++s)
3676  response += TIntermediate(source[channels * mirroredBorderLocationLeft(-int(filterSize_2) + int(x + s)) + n]) * filter[s];
3677 
3678  target[n] = response;
3679  }
3680 
3681  target += channels;
3682  // we keep the location of source
3683  }
3684 
3685  // center block: [filterSize_2, width - filterSize - 2)
3686 
3687  for (unsigned int x = filterSize_2; x < width - filterSize_2; ++x)
3688  {
3689  for (unsigned int n = 0u; n < channels; ++n)
3690  {
3691  TIntermediate response = TIntermediate(source[channels * 0u + n]) * filter[0];
3692 
3693  for (unsigned int s = 1u; s < filterSize; ++s)
3694  response += TIntermediate(source[channels * s + n]) * filter[s];
3695 
3696  target[n] = response;
3697  }
3698 
3699  target += channels;
3700  source += channels;
3701  }
3702 
3703  // right border: [width - filterSize_2, width - 1]
3704 
3705  for (unsigned int x = 0u; x < filterSize_2; ++x)
3706  {
3707  for (unsigned int n = 0u; n < channels; ++n)
3708  {
3709  TIntermediate response = TIntermediate(source[channels * mirroredBorderLocationRight(x, filterSize_2 * 2u) + n]) * filter[0];
3710 
3711  for (unsigned int s = 1u; s < filterSize; ++s)
3712  response += TIntermediate(source[channels * mirroredBorderLocationRight(x + s, filterSize_2 * 2u) + n]) * filter[s];
3713 
3714  target[n] = response;
3715  }
3716 
3717  target += channels;
3718  // we keep the location of source
3719  }
3720 
3721  source += filterSize_2 * 2u * channels + sourcePaddingElements;
3722  target += targetPaddingElements;
3723  }
3724 }
3725 
3726 template <typename T, typename TIntermediate>
3727 void FrameFilterSeparable::filterUniversalVerticalSubset(const TIntermediate* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* verticalFilter, const unsigned int verticalFilterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3728 {
3729  ocean_assert(source != nullptr && target != nullptr);
3730  ocean_assert(width >= 1u && height >= 1u);
3731  ocean_assert(channels != 0u);
3732 
3733  ocean_assert(verticalFilterSize <= height);
3734  ocean_assert(verticalFilterSize % 2u == 1u);
3735 
3736  const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3737  const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3738 
3739  const TIntermediate* const sourceStart = source;
3740 
3741  const unsigned int filterSize = verticalFilterSize;
3742  const unsigned int filterSize_2 = filterSize / 2u;
3743  ocean_assert(filterSize_2 * 2u <= height);
3744 
3745  std::vector<TIntermediate> filterCopy;
3746 
3747  if (!std::is_same<TIntermediate, float>::value)
3748  {
3749  filterCopy.resize(verticalFilterSize);
3750 
3751  for (size_t n = 0; n < filterCopy.size(); ++n)
3752  {
3753  filterCopy[n] = TIntermediate(verticalFilter[n]);
3754  }
3755  }
3756 
3757  const TIntermediate* const filter = filterCopy.empty() ? (const TIntermediate*)verticalFilter : filterCopy.data();
3758 
3759  source += max(0, int(firstRow) - int(filterSize_2)) * sourceStrideElements;
3760  target += firstRow * targetStrideElements;
3761 
3762  unsigned int y = firstRow;
3763 
3764  // top border: [0, filterSize_2 - 1]
3765 
3766  while (y < min(filterSize_2, firstRow + numberRows))
3767  {
3768  ocean_assert(source == sourceStart);
3769  const TIntermediate* sourceCopy = source;
3770 
3771  for (unsigned int x = 0u; x < width; ++x)
3772  {
3773  for (unsigned int n = 0u; n < channels; ++n)
3774  {
3775  TIntermediate response = TIntermediate(source[sourceStrideElements * mirroredBorderLocationLeft(-int(filterSize_2) + int(y)) + n]) * filter[0];
3776 
3777  for (unsigned int s = 1u; s < filterSize; ++s)
3778  response += TIntermediate(source[sourceStrideElements * mirroredBorderLocationLeft(-int(filterSize_2) + int(y + s)) + n]) * filter[s];
3779 
3780  target[n] = T(response);
3781  }
3782 
3783  target += channels;
3784  source += channels;
3785  }
3786 
3787  target += targetPaddingElements;
3788 
3789  // we set back the location of the source pointer
3790  source = sourceCopy;
3791  ++y;
3792  }
3793 
3794  // center block: [filterSize_2, height - filterSize - 2)
3795 
3796  const unsigned int centerRows = (unsigned int)max(0, int(min(firstRow + numberRows, height - filterSize_2)) - int(y));
3797 
3798  for (unsigned int yCenter = 0u; yCenter < centerRows; ++yCenter)
3799  {
3800  for (unsigned int x = 0u; x < width; ++x)
3801  {
3802  for (unsigned int c = 0u; c < channels; ++c)
3803  {
3804  TIntermediate response = TIntermediate(source[channels * 0u + c]) * filter[0];
3805 
3806  for (unsigned int s = 1u; s < filterSize; ++s)
3807  response += TIntermediate(source[sourceStrideElements * s + c]) * filter[s];
3808 
3809  target[c] = T(response);
3810  }
3811 
3812  source += channels;
3813  target += channels;
3814  }
3815 
3816  source += sourcePaddingElements;
3817  target += targetPaddingElements;
3818  }
3819 
3820  y += centerRows;
3821 
3822  // bottom border: [height - filterSize_2, height - 1]
3823 
3824  while (y < firstRow + numberRows)
3825  {
3826  ocean_assert(y >= height - filterSize_2 && y < height);
3827  source = sourceStart + (height - filterSize_2 * 2u) * sourceStrideElements;
3828 
3829  const unsigned int yy = y - (height - filterSize_2);
3830  ocean_assert(yy < filterSize_2);
3831 
3832  for (unsigned int x = 0u; x < width; ++x)
3833  {
3834  for (unsigned int n = 0u; n < channels; ++n)
3835  {
3836  TIntermediate response = TIntermediate(source[sourceStrideElements * mirroredBorderLocationRight(yy, filterSize_2 * 2u) + n]) * filter[0];
3837 
3838  for (unsigned int s = 1u; s < filterSize; ++s)
3839  {
3840  response += TIntermediate(source[sourceStrideElements * mirroredBorderLocationRight(yy + s, filterSize_2 * 2u) + n]) * filter[s];
3841  }
3842 
3843  target[n] = T(response);
3844  }
3845 
3846  target += channels;
3847  source += channels;
3848  }
3849 
3850  target += targetPaddingElements;
3851 
3852  ++y;
3853  }
3854 }
3855 
3856 inline unsigned int FrameFilterSeparable::mirroredBorderLocationLeft(const int value)
3857 {
3858  // Original: -3 -2 -1 | 0 1 2 3 4 5 6
3859  // Result: 2 1 0 | 0 1 2 3 4 5 6
3860 
3861  if (value >= 0)
3862  {
3863  return value;
3864  }
3865  else
3866  {
3867  return -value - 1;
3868  }
3869 }
3870 
3871 inline unsigned int FrameFilterSeparable::mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
3872 {
3873  ocean_assert(value < 2u * size);
3874 
3875  // Original: 4 5 6 ... s-2 s-1 | s s+1 s+2
3876  // Result: 4 5 6 ... s-2 s-1 | s-1 s-2 s-3
3877 
3878  if (value < size)
3879  {
3880  return value;
3881  }
3882  else
3883  {
3884  ocean_assert(size * 2u - value - 1u < size);
3885  return size * 2u - value - 1u;
3886  }
3887 }
3888 
3889 }
3890 
3891 }
3892 
3893 #endif // META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
This class holds re-usable memory for the filtering process.
Definition: FrameFilterSeparable.h:40
ReusableMemory()=default
Default constructor.
std::vector< float > filterFactors_
Float-based filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:56
std::vector< float > normalizedVerticalFilter_
Normalized vertical filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:62
Frame intermediateFrame_
An intermediate frame which can be re-used during filtering.
Definition: FrameFilterSeparable.h:53
std::vector< float > normalizedHorizontalFilter_
Normalized horizontal filter factors which can be re-used during filtering.
Definition: FrameFilterSeparable.h:59
This class implements a separable filter.
Definition: FrameFilterSeparable.h:33
static void filterVerticalSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows)
Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames...
Definition: FrameFilterSeparable.h:3303
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame el...
static bool filterUniversal(const T *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float *horizontalFilter, const unsigned int horizontalFilterSize, const float *verticalFilter, const unsigned int verticalFilterSize, Worker *worker=nullptr)
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
Definition: FrameFilterSeparable.h:3592
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame el...
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame ...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements...
Definition: FrameFilterSeparable.h:2556
static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4< T >::Type &value, T *target)
Writes a SIMD with four 32 bit values to (not aligned) memory.
static void filterUniversalHorizontalSubset(const T *source, TIntermediate *target, const unsigned int width, const unsigned int channels, const float *horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a horizontal filter to a subset of an image with almost arbitrary data type.
Definition: FrameFilterSeparable.h:3629
static void filterUniversalVerticalSubset(const TIntermediate *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a vertical filter to a subset of an image with almost arbitrary data type.
Definition: FrameFilterSeparable.h:3727
static void filterHorizontalSubset(const TSource *source, TFilter *target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames ...
Definition: FrameFilterSeparable.h:3171
static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int channels, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses for the inner core of a frame for one row.
Definition: FrameFilterSeparable.h:804
static T sumFilterValues(const T *filterValues, const size_t size)
Determines the sum of all elements of a given 1D filter.
Definition: FrameFilterSeparable.h:706
static bool isFilterSymmetric(const T *filterValues, const size_t size)
Returns whether a given 1D filter is symmetric.
Definition: FrameFilterSeparable.h:689
static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static void fillLeftExtraBorder(const T *source, const unsigned int channels, const unsigned int pixels, T *extendedRowLeft)
Fills the left border area of an extended row with mirrored pixel information (from the left image re...
Definition: FrameFilterSeparable.h:782
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame ...
static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses near the (vertical) border of a frame for one row.
static void fillRightExtraBorder(const T *sourceEnd, const unsigned int channels, const unsigned int pixels, T *extendedRowRight)
Fills the right border area of an extended row with mirrored pixel information (from the right image ...
Definition: FrameFilterSeparable.h:793
static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements...
Definition: FrameFilterSeparable.h:2494
static bool filter(const Frame &source, Frame &target, const std::vector< unsigned int > &horizontalFilter, const std::vector< unsigned int > &verticalFilter, Worker *worker=nullptr, ReusableMemory *reusableMemory=nullptr, const ProcessorInstructions processorInstructions=Processor::get().instructions())
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4< T >::Type &value)
Sets a given SIMD value to zero.
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition: Caller.h:2876
This class implements Ocean's image class.
Definition: Frame.h:1760
const T * constdata(const unsigned int planeIndex=0u) const
Returns a pointer to the read-only pixel data of a specific plane.
Definition: Frame.h:4136
T * data(const unsigned int planeIndex=0u)
Returns a pointer to the pixel data of a specific plane.
Definition: Frame.h:4127
bool set(const FrameType &frameType, const bool forceOwner, const bool forceWritable=false, const Indices32 &planePaddingElements=Indices32(), const Timestamp &timestamp=Timestamp(false), bool *reallocated=nullptr)
Sets a new frame type for this frame.
unsigned int paddingElements(const unsigned int planeIndex=0u) const
Returns the optional number of padding elements at the end of each row for a specific plane.
Definition: Frame.h:4010
Definition of a frame type composed by the frame dimension, pixel format and pixel origin.
Definition: Frame.h:30
@ ORIGIN_UPPER_LEFT
The first pixel lies in the upper left corner, the last pixel in the lower right corner.
Definition: Frame.h:1018
This class implements an object able to allocate memory.
Definition: base/Memory.h:22
void * data()
Returns the pointer to the writable memory which is allocated by this object.
Definition: base/Memory.h:303
This class provides basic numeric functionalities.
Definition: Numeric.h:57
static Processor & get()
Returns a reference to the unique object.
Definition: Singleton.h:115
This class implements a worker able to distribute function calls over different threads.
Definition: Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
ProcessorInstructions
Definition of individual processor instruction types.
Definition: base/Processor.h:22
static unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
Mirrors a given value at the right border if necessary.
Definition: FrameFilterSeparable.h:3871
static unsigned int mirroredBorderLocationLeft(const int value)
Mirrors a given value at the left border if necessary.
Definition: FrameFilterSeparable.h:3856
@ PI_NONE
Unknown processor instruction set.
Definition: base/Processor.h:24
@ PI_GROUP_AVX_2_SSE_2
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition: base/Processor.h:64
@ PI_GROUP_SSE_4_1
All SSE instructions between (including) SSE and SSE4.1.
Definition: base/Processor.h:60
@ PI_SSE_2
SSE2 instructions.
Definition: base/Processor.h:28
@ PI_NEON
NEON instructions.
Definition: base/Processor.h:50
@ PI_GROUP_AVX_2_SSE_4_1
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition: base/Processor.h:68
@ PI_GROUP_SSE_2
All SSE instructions between (including) SSE and SSE2.
Definition: base/Processor.h:58
@ PI_GROUP_NEON
All NEON instructions (which is currently NEON only).
Definition: base/Processor.h:66
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15
float32x4_t Type
Definition: FrameFilterSeparable.h:683
__m128 Type
Definition: FrameFilterSeparable.h:663
uint32x4_t Type
Definition: FrameFilterSeparable.h:674
__m128i Type
Definition: FrameFilterSeparable.h:654
Definition of a 128 bit SIMD data type holding four 32 bit values.
Definition: FrameFilterSeparable.h:72
DataType< uint32_t, 4u >::Type Type
Definition: FrameFilterSeparable.h:73
Default definition of a type with tBytes bytes.
Definition: DataType.h:32
float Type
The 32 bit floating point data type for any data type T but 'double'.
Definition: DataType.h:373