Ocean
Loading...
Searching...
No Matches
FrameFilterSeparable.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
9#define META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
10
11#include "ocean/cv/CV.h"
12#include "ocean/cv/NEON.h"
13#include "ocean/cv/SSE.h"
14
15#include "ocean/base/Frame.h"
16#include "ocean/base/Memory.h"
18#include "ocean/base/Worker.h"
19
20#include "ocean/math/Numeric.h"
21
22namespace Ocean
23{
24
25namespace CV
26{
27
28/**
29 * This class implements separable filters.
30 * @ingroup cv
31 */
32class OCEAN_CV_EXPORT FrameFilterSeparable
33{
34 public:
35
36 /**
37 * This class holds re-usable memory for the filtering process.
38 */
40 {
42
43 public:
44
45 /**
46 * Default constructor.
47 */
48 ReusableMemory() = default;
49
50 protected:
51
52 /// An intermediate frame which can be re-used during filtering.
54
55 /// Float-based filter factors which can be re-used during filtering.
56 std::vector<float> filterFactors_;
57
58 /// Normalized horizontal filter factors which can be re-used during filtering.
59 std::vector<float> normalizedHorizontalFilter_;
60
61 /// Normalized vertical filter factors which can be re-used during filtering.
62 std::vector<float> normalizedVerticalFilter_;
63 };
64
65 protected:
66
67 /**
68 * Definition of a 128 bit SIMD data type holding four 32 bit values.
69 */
70 template <typename T>
71 struct SIMD32x4
72 {
74 };
75
76 public:
77
78 /**
79 * Returns whether a given 1D filter is symmetric.
80 * @param filterValues The individual values of the 1D filter, must be valid
81 * @param size The size of the filter (the number of filter elements), with range [1, infinity), must be odd
82 * @return True, if so
83 * @tparam T The data type of each filter value, e.g., 'unsigned int', or 'float'
84 */
85 template <typename T>
86 static bool isFilterSymmetric(const T* filterValues, const size_t size);
87
88 /**
89 * Determines the sum of all elements of a given 1D filter.
90 * @param filterValues The individual values of the 1D filter, must be valid
91 * @param size The size of the filter (the number of filter elements), with range [1, infinity)
92 * @return The sum of all filter values
93 * @tparam T The data type of each filter value, e.g., 'unsigned int', or 'float'
94 */
95 template <typename T>
96 static T sumFilterValues(const T* filterValues, const size_t size);
97
98 /**
99 * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
100 * The filter result is stored in a target frame with zipped pixel format.
101 *
102 * The provided filter values are given with integer precision, the filter responses will be normalized automatically.<br>
103 *
104 * Here is an example showing how to use this function:
105 * @code
106 * void function(const Frame& rgbFrame)
107 * {
108 * // now let's create a simple Gaussian blur filter with kernel size 3
109 * const std::vector<unsigned int> filter = {1u, 2u, 1u};
110 *
111 * // so let's filter our frame
112 * Frame targetFrame(rgbFrame.frameType());
113 * FrameFilterSeparable::filter(rgbFrame, targetFrame, filter, filter);
114 * }
115 * @endcode
116 * @param source The source frame to be filtered, with zipped pixel format and with data type DT_UNSIGNED_INTEGER_8, or DT_SIGNED_FLOAT_32, must be valid
117 * @param target The target frame receiving the filtered results, will be set to the correct frame type, if invalid or if the type does not match the source frame
118 * @param horizontalFilter The horizontal filter, the number of filter elements must be odd, at least 1 element
119 * @param verticalFilter The vertical filter, the number of filter elements must be odd, at least 1 element
120 * @param worker Optional worker object to distribute the computation
121 * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
122 * @param processorInstructions The set of available instructions, may be any combination of instructions
123 * @see filter<T, TFilter>()
124 */
125 static bool filter(const Frame& source, Frame& target, const std::vector<unsigned int>& horizontalFilter, const std::vector<unsigned int>& verticalFilter, Worker* worker = nullptr, ReusableMemory* reusableMemory = nullptr, const ProcessorInstructions processorInstructions = Processor::get().instructions());
126
127 /**
128 * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
129 * The filter result is stored in a target frame with zipped pixel format.
130 *
131 * When providing filter values with integer precision, the filter responses will be normalized automatically.<br>
132 * In contrast, when providing filter values with floating point precision, the filter responses will not be normalized.<br>
133 * Thus, you need to provide a normalized filter already when providing floating point filters.
134 *
135 * Here is an example showing how to use this function:
136 * @code
137 * void function(const Frame& rgbFrame)
138 * {
139 * // now let's create a simple Gaussian blur filter with kernel size 3
140 * const unsigned int horizontalFilter[] = {1u, 2u, 1u};
141 * const unsigned int verticalFilter[] = {1u, 2u, 1u};
142 *
143 * // so let's filter our frame
144 * Frame targetFrame(rgbFrame.frameType());
145 * FrameFilterSeparable::filter<uint8_t, unsigned int>(rgbFrame.constdata<uint8_t>(), targetFrame.data<uint8_t>(), rgbFrame.width(), rgbFrame.height(), rgbFrame.channels(), rgbFrame.paddingElements(), targetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
146 * }
147 * @endcode
148 * @param source The source frame to be filtered, must be valid
149 * @param target The target frame receiving the filtered results, can be the same memory pointer as 'source', must be valid
150 * @param width The width of the source (and target) frame in pixel, with range [horizontalFilterSize, infinity)
151 * @param height The height of the source (and target) frame in pixel, with range [verticalFilterSize, infinity)
152 * @param channels The number of channels the source frame (and target frame) has, with range [1, infinity)
153 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
154 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
155 * @param horizontalFilter The elements of the horizontal filter, must be valid
156 * @param horizontalFilterSize The number of elements the horizontal filter has, with range [1, width], must be odd
157 * @param verticalFilter The elements of the vertical filter, must be valid
158 * @param verticalFilterSize The number of elements the vertical filter has, with range [1, height], must be odd
159 * @param worker Optional worker object to distribute the computation
160 * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
161 * @param processorInstructions The set of available instructions, may be any combination of instructions
162 * @tparam T The data type of each pixel channel of the source frame (and target frame) e.g., 'uint8_t', or 'float'
163 * @tparam TFilter The data type of each filter element, e.g., 'unsigned int', or 'float'
164 * @see filterUniversal<T>()
165 */
166 template <typename T, typename TFilter>
167 static bool filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, Worker* worker = nullptr, ReusableMemory* reusableMemory = nullptr, const ProcessorInstructions processorInstructions = Processor::get().instructions());
168
169 /**
170 * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with almost arbitrary pixel format.
171 * This function supports images with arbitrary pixel format as long as the pixel format is zipped (e.g., FrameType::FORMAT_Y8, FrameType::FORMAT_RGB24, ...).
172 *
173 * Beware: This function is not highly optimized, you may want to check whether Ocean provides a more optimized implementation for your purpose if performance matters, e.g., filter<T, TFilter>().
174 *
175 * Here is an example showing how to use this function:
176 * @code
177 * void function(const Frame& rgbFrame)
178 * {
179 * // let's say we receive a frame with FORMAT_RGB24 pixel format
180 * if (rgbFrame.pixelFormat() != FrameType::FORMAT_RGB24)
181 * {
182 * // wrong pixel format
183 * return;
184 * }
185 *
186 * // let's convert this frame to a floating point frame
187 * Frame floatFrameWith3Channels(FrameType(rgbFrame, FrameType::genericPixelFormat<float, 3u>()));
188 *
189 * FrameConverter::cast<uint8_t, float>(rgbFrame.constdata<uint8_t>(), floatFrameWith3Channels.data<float>(), rgbFrame.width(), rgbFrame.height(), rgbFrame.channels());
190 *
191 * // now let's create a simple Gaussian blur filter with kernel size 3
192 * const float horizontalFilter[] = {0.25f, 0.5f, 0.25f};
193 * const float verticalFilter[] = {0.25f, 0.5f, 0.25f};
194 *
195 * const unsigned int channels = 3u;
196 *
197 * // so let's filter our floating point frame
198 * Frame floatTargetFrame(floatFrameWith3Channels.frameType());
199 * FrameFilterSeparable::filterUniversal<float>(floatFrameWith3Channels.constdata<float>(), floatTargetFrame.data<float>(), floatTargetFrame.width(), floatTargetFrame.height(), channels, floatFrameWith3Channels.paddingElements(), floatTargetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
200 *
201 * // btw: we could also apply the same filter to our RGB24 frame (with uint8_t values)
202 * // however, this time we lose the floating point accuracy
203 * Frame rgbTargetFrame(rgbFrame.frameType());
204 * FrameFilterSeparable::filterUniversal<uint8_t>(rgbFrame.constdata<uint8_t>(), rgbTargetFrame.data<uint8_t>(), rgbFrame.width(), rgbFrame.height(), channels, rgbFrame.paddingElements(), rgbTargetFrame.paddingElements(), horizontalFilter, 3u, verticalFilter, 3u);
205 * }
206 * @endcode
207 * @param source The source frame to which the filter will be applied, must be valid
208 * @param target The target frame receiving the filtered results, can be the same memory pointer as 'source', must be valid
209 * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
210 * @param height The height of the source frame (and target frame) in pixel, with range [1, infinity)
211 * @param channels The number of channels the source and target frame have, with range [1, infinity)
212 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
213 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
214 * @param horizontalFilter The (separable) horizontal filter to be applied, must be valid
215 * @param horizontalFilterSize The number of horizontal filter elements, with range [1, width], must be odd
216 * @param verticalFilter The (separable) vertical filter to be applied, must be valid
217 * @param verticalFilterSize The number of vertical filter elements, with range [1, height], must be odd
218 * @param worker Optional worker object to distribute the computation to several CPU cores
219 * @return True, if the filter could be applied; False, if the input parameters were wrong
220 * @tparam T The data type of each pixel channel of the source and target frame, e.g., 'uint8_t', 'int', 'float', ...
221 * @see filter<T, TFilter>()
222 */
223 template <typename T>
224 static bool filterUniversal(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float* horizontalFilter, const unsigned int horizontalFilterSize, const float* verticalFilter, const unsigned int verticalFilterSize, Worker* worker = nullptr);
225
226 protected:
227
228 /**
229 * Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a horizontal 1D filter and a vertical 1D filter for frames with zipped pixel format.
230 * The filter result is stored in a target frame with zipped pixel format.
231 * @param source The source frame to be filtered, must be valid
232 * @param target The target frame receiving the filtered results, must be valid
233 * @param width The width of the source (and target) frame in pixel, with range [horizontalFilterSize, infinity)
234 * @param height The height of the source (and target) frame in pixel, with range [verticalFilterSize, infinity)
235 * @param channels The number of channels the source frame (and target frame) has, with range [1, infinity)
236 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
237 * @param targetPaddingElements The number of padding elements at the end of each target row, in elements, with range [0, infinity)
238 * @param horizontalFilter The elements of the horizontal filter, must be valid
239 * @param horizontalFilterSize The number of elements the horizontal filter has, with range [1, width], must be odd
240 * @param verticalFilter The elements of the vertical filter, must be valid
241 * @param verticalFilterSize The number of elements the vertical filter has, with range [1, height], must be odd
242 * @param reusableMemory An optional object holding reusable memory which can be used during filtering, nullptr otherwise
243 * @param worker Optional worker object to distribute the computation
244 * @tparam T The data type of each pixel channel of the source frame (and target frame) e.g., 'uint8_t', or 'float'
245 * @tparam TFilter The data type of each filter element, e.g., 'unsigned int', or 'float'
246 * @tparam tProcessorInstructions The processor instructions that can be used
247 * @see filterUniversal<T>()
248 */
249 template <typename T, typename TFilter, ProcessorInstructions tProcessorInstructions>
250 static void filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, ReusableMemory* reusableMemory = nullptr, Worker* worker = nullptr);
251
252 /**
253 * Sets a given SIMD value to zero.
254 * @param value The SIMD value to be set
255 * @tparam T The 32 bit data type of the SIMD value
256 * @tparam tProcessorInstructions The set of available instructions, may be any combination of instructions
257 */
258 template <typename T, ProcessorInstructions tProcessorInstructions>
259 static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4<T>::Type& value);
260
261 /**
262 * Writes a SIMD with four 32 bit values to (not aligned) memory.
263 * @param value The SIMD value to be written
264 * @param target The buffer receiving the values
265 * @tparam T The 32 bit data type of the SIMD value
266 * @tparam tProcessorInstructions The set of available instructions, may be any combination of instructions
267 */
268 template <typename T, ProcessorInstructions tProcessorInstructions>
269 static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4<T>::Type& value, T* target);
270
271 /**
272 * Fills the left border area of an extended row with mirrored pixel information (from the left image region).
273 * @param source The source row providing the image information to be mirrored, must be valid
274 * @param channels The number of channels the source frame has, with range [1, infinity)
275 * @param pixels The number of pixels to be mirrored, should be filterSize / 2u, with range [1, width]
276 * @param extendedRowLeft The pointer to the left border area of the extended row to which the mirrored image content will be copied, must be valid
277 * @tparam T The data type of each pixel channel, e.g., 'uint8_t', or 'float'
278 * @see fillRightExtraBorder().
279 */
280 template <typename T>
281 static void fillLeftExtraBorder(const T* source, const unsigned int channels, const unsigned int pixels, T* extendedRowLeft);
282
283 /**
284 * Fills the right border area of an extended row with mirrored pixel information (from the right image region).
285 * @param sourceEnd The end of the source row providing the image information to be mirrored (source + width * channels), must be valid
286 * @param channels The number of channels the source frame has, with range [1, infinity)
287 * @param pixels The number of pixels to be mirrored, should be filterSize / 2u, with range [1, width]
288 * @param extendedRowRight The pointer to the right border area of the extended row to which the mirrored image content will be copied, must be valid
289 * @tparam T The data type of each pixel channel, e.g., 'uint8_t', or 'float'
290 * @see fillLeftExtraBorder().
291 */
292 template <typename T>
293 static void fillRightExtraBorder(const T* sourceEnd, const unsigned int channels, const unsigned int pixels, T* extendedRowRight);
294
295 /**
296 * Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
297 * <pre>
298 * This function calculates the following:
299 * target[0] += source[0] * filterFactor
300 * ...
301 * target[3] += source[3] * filterFactor
302 * </pre>
303 * @param source The source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
304 * @param filterFactor The filter factor to be used for multiplication
305 * @param target_32x4 The four 32 bit accumulated filter response values to which the multiplication result will be added
306 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
307 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
308 * @tparam tProcessorInstructions The set of available processor instructions needed
309 * @see symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements().
310 */
311 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
312 static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource* source, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4);
313
314 /**
315 * Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
316 * This function applies a run-time known filter factor.
317 * <pre>
318 * This function calculates the following:
319 * target[0] += (sourceLeft[0] + sourceRight[0]) * filterFactor
320 * ...
321 * target[3] += (sourceLeft[3] + sourceRight[3]) * filterFactor
322 * </pre>
323 * @param sourceLeft The left source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
324 * @param sourceRight The right source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
325 * @param filterFactor The filter factor to be used for multiplication
326 * @param target_32x4 The four 32 bit accumulated filter response values to which the multiplication result will be added
327 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
328 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
329 * @tparam tProcessorInstructions The set of available processor instructions needed
330 * @see asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements().
331 */
332 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
333 static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource* sourceLeft, const TSource* sourceRight, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4);
334
335 /**
336 * Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
337 * <pre>
338 * This function calculates the following:
339 * targeta[0] += source[0] * filterFactor
340 * ...
341 * targeta[3] += source[3] * filterFactor
342 * targetb[0] += source[4] * filterFactor
343 * ...
344 * targetb[3] += source[7] * filterFactor
345 * </pre>
346 * @param source The source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
347 * @param filterFactor The filter factor to be used for multiplication
348 * @param target_32x4a The first four 32 bit accumulated filter response values to which the multiplication result will be added
349 * @param target_32x4b The second four 32 bit accumulated filter response values to which the multiplication result will be added
350 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
351 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
352 * @tparam tProcessorInstructions The set of available processor instructions needed
353 * @see symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements().
354 */
355 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
356 static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource* source, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4a, typename SIMD32x4<TFilter>::Type& target_32x4b);
357
358 /**
359 * Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame) and adds the individual results to given target elements.
360 * This function applies a run-time known filter factor.
361 * <pre>
362 * This function calculates the following:
363 * targeta[0] += (sourceLeft[0] + sourceRight[0]) * filterFactor
364 * ...
365 * targeta[3] += (sourceLeft[3] + sourceRight[3]) * filterFactor
366 * targetb[0] += (sourceLeft[4] + sourceRight[4]) * filterFactor
367 * ...
368 * targetb[3] += (sourceLeft[7] + sourceRight[7]) * filterFactor
369 * </pre>
370 * @param sourceLeft The left source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
371 * @param sourceRight The right source elements for which the filter will be applied, the buffer must have at least 8 elements, must be valid
372 * @param filterFactor The filter factor to be used for multiplication
373 * @param target_32x4a The first four 32 bit accumulated filter response values to which the multiplication result will be added
374 * @param target_32x4b The second four 32 bit accumulated filter response values to which the multiplication result will be added
375 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
376 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
377 * @tparam tProcessorInstructions The set of available processor instructions needed
378 * @see asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements().
379 */
380 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
381 static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource* sourceLeft, const TSource* sourceRight, const TFilter& filterFactor, typename SIMD32x4<TFilter>::Type& target_32x4a, typename SIMD32x4<TFilter>::Type& target_32x4b);
382
383 /**
384 * Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame).
385 * @param source The first source element for which the filter will be applied, the buffer must contain at least 4 + 'filterSize' - 1 elements, must be valid
386 * @param target The first target element receiving the filter responses, the buffer must contain at least 4 elements, must be valid
387 * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
388 * @param filter The filter factors of the horizontal filter, with 'filterSize' elements, must be valid
389 * @param filterSize The size of the given filter, with range [1, width], must be odd
390 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
391 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
392 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
393 * @tparam tProcessorInstructions The set of available processor instructions needed
394 * @see isFilterSymmetric().
395 */
396 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
397 static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric);
398
399 /**
400 * Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
401 * @param source The first source element for which the filter will be applied, the buffer must contain at least 8 + 'filterSize' - 1 elements, must be valid
402 * @param target The first target element receiving the filter responses, the buffer must contain at least 8 elements, must be valid
403 * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
404 * @param filter The filter factors of the horizontal filter, with 'filterSize' elements, must be valid
405 * @param filterSize The size of the given filter, with range [1, width], must be odd
406 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
407 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
408 * @tparam TFilter The data type of the filter elements e.g., 'unsigned int', or 'float'
409 * @tparam tProcessorInstructions The set of available processor instructions needed
410 * @see isFilterSymmetric().
411 */
412 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
413 static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric);
414
415 /**
416 * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 4 elements within one iteration (4 elements are 4 successive pixels in a Y8 frame or 1 + 1/3 pixels in a RGB24 frame).
417 * The inner core lies within the frame not covering the frame border of size of filterSize/2.<br>
418 * @param source The first source element that will be used for filtering, must be valid
419 * @param target The first target elements that will receive the filtered results, must be valid
420 * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
421 * @param filter The filter factors also containing the normalization, must be valid
422 * @param filterSize The size of the given filter, with range [1, width], must be odd
423 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
424 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
425 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
426 * @tparam tProcessorInstructions The set of available processor instructions needed
427 */
428 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
429 static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
430
431 /**
432 * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 8 elements within one iteration (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
433 * The inner core lies within the frame not covering the frame border of size of filterSize/2.<br>
434 * @param source The first source element that will be used for filtering, must be valid
435 * @param target The first target elements that will receive the filtered results, must be valid
436 * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
437 * @param filter The filter factors also containing the normalization, must be valid
438 * @param filterSize The size of the given filter, with range [1, width], must be odd
439 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
440 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
441 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
442 * @tparam tProcessorInstructions The set of available processor instructions needed
443 */
444 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
445 static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
446
447 /**
448 * Determines the vertical filter responses for the inner core of a frame for one row while processing a block of 16 elements within one iteration (16 elements are 16 successive pixels in a Y8 frame or 5 + 1/3 pixels in a RGB24 frame).
449 * The inner core lies within the frame not covering the (vertical) frame border of size of filterSize/2.<br>
450 * @param source The first source element that will be used for filtering, must be valid
451 * @param target The first target elements that will receive the filtered results, must be valid
452 * @param sourceStrideElements The stride of the frame in elements, strideElements = width * channels + paddingElements, with range [1, infinity)
453 * @param filter The filter factors also containing the normalization, must be valid
454 * @param filterSize The size of the given filter, with range [1, width], must be odd
455 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
456 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
457 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
458 * @tparam tProcessorInstructions The set of available processor instructions needed
459 */
460 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
461 static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric);
462
463 /**
464 * Determines the vertical filter responses for the inner core of a frame for one row.
465 * The inner core lies within the frame not covering the (vertical) frame border of size of filterSize/2.<br>
466 * The row is processed in blocks of 16 and 8 elements (and 4 elements if SSE is available); a remaining partial block is handled by re-processing a block which is shifted back towards the start of the row.
467 * @param source The first source element that will be used for filtering, must be valid
468 * @param target The first target elements that will receive the filtered results, must be valid
469 * @param width The width of the frame in pixel, with range [1, infinity), while width * channels must be at least 4 (SSE) or 8 (otherwise)
470 * @param channels The number of channels the source (and target) frame has, with range [1, infinity)
470 * @param filter The filter factors also containing the normalization, must be valid
471 * @param filterSize The size of the given filter, with range [1, width], must be odd
472 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
473 * @param sourcePaddingElements The number of padding elements at the end of each source row, in elements, with range [0, infinity)
474 * @tparam TSource The data type of the source elements e.g., 'unsigned int', or 'float'
475 * @tparam TTarget The data type of the target elements e.g., 'uint8_t', or 'float'
476 * @tparam tProcessorInstructions The set of available processor instructions needed
477 */
478 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
479 static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int channels, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements);
480
481 /**
482 * Determines the vertical filter responses near the (vertical) border of a frame for one row while processing a block of 8 elements within one iteration (8 elements are 8 successive pixels in a Y8 frame or 2 + 2/3 pixels in a RGB24 frame).
483 * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
484 * @param source The first source element that will be used for filtering, must be valid
485 * @param target The first target elements that will receive the filtered results, must be valid
486 * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
487 * @param height The height of the frame in pixel, with range [1, infinity)
488 * @param row The row to be handled, with range [0, height - 1]
489 * @param filter The filter factors, must be 'filterSize' individual values
490 * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
491 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
492 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
493 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
494 * @tparam tProcessorInstructions The set of available processor instructions needed
495 */
496 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
497 static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric);
498
499 /**
500 * Determines the vertical filter responses near the (vertical) border of a frame for one row while processing a block of 16 elements within one iteration (16 elements are 16 successive pixels in a Y8 frame or 5 + 1/3 pixels in a RGB24 frame).
501 * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
502 * @param source The first source element that will be used for filtering, must be valid
503 * @param target The first target elements that will receive the filtered results, must be valid
504 * @param sourceStrideElements The number of elements between two successive rows, in elements, with range [width * channels, infinity)
505 * @param height The height of the frame in pixel, with range [1, infinity)
506 * @param row The row to be handled, with range [0, height - 1]
507 * @param filter The filter factors, must be 'filterSize' individual values
508 * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
509 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
510 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
511 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
512 * @tparam tProcessorInstructions The set of available processor instructions needed
513 */
514 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
515 static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric);
516
517 /**
518 * Determines the vertical filter responses near the (vertical) border of a frame for one row.
519 * The border covers the upper and lower filterSize/2 rows of a frame as this area needs a special handling of filter locations lying outside the frame.
520 * @param source The first source element that will be used for filtering, must be valid
521 * @param target The first target elements that will receive the filtered results, must be valid
522 * @param width The width of the frame in pixel, with range [1, infinity)
523 * @param height The height of the frame in pixel, with range [1, infinity)
524 * @param channels The number of data channels both frames have, with range [1, infinity)
525 * @param row The row to be handled, with range [0, height - 1]
526 * @param filter The filter factors, must be 'filterSize' individual values
527 * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
528 * @param isSymmetric True, if the provided filter is symmetric; False, otherwise
529 * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
530 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
531 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
532 * @tparam tProcessorInstructions The set of available processor instructions needed
533 */
534 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
535 static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements);
536
537 /**
538 * Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames with zipped pixel format.
539 * The filter result is stored in a target frame with zipped pixel format and 32 bit per channel.
540 * @param source The source frame to be filtered, must be valid
541 * @param target The target frame receiving the filtered results, must be valid
542 * @param width The width of the source (and target) frame in pixel, with range [filterSize + 1, infinity)
543 * @param height The height of the source (and target) frame in pixel, with range [filterSize, infinity)
544 * @param channels The number of data channels both frames have, with range [1, infinity)
545 * @param filter The filter factors, must be 'filterSize' individual values
546 * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
547 * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
548 * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
549 * @param firstRow The first row to be handled, with range [0, height)
550 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
551 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
552 * @tparam TFilter The data type of the filter elements (which is also the data type of the target elements) e.g., 'unsigned int', or 'float'
553 * @tparam tProcessorInstructions The set of available processor instructions needed
554 */
555 template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
556 static void filterHorizontalSubset(const TSource* source, TFilter* target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
557
558 /**
559 * Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames with zipped pixel format and 32 bit per channel.
560 * The filter result is stored in a target frame with zipped pixel format and 8 bit per channel.<br>
561 * This function uses floating point filter factors ensuring the final result is normalized.
562 * @param source The source frame to be filtered, must be valid
563 * @param target The target frame receiving the filtered results, must be valid
564 * @param width The width of the source (and target) frame in pixel, with range [max(filterSize + 1, 16 / channels), infinity)
565 * @param height The height of the source (and target) frame in pixel, with range [filterSize, infinity)
566 * @param channels The number of data channels both frames have, with range [1, infinity)
567 * @param filter The filter factors, must be 'filterSize' individual values
568 * @param filterSize The number of filter factors, with range [1, width - 1], must be odd
569 * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
570 * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
571 * @param firstRow The first row to be handled, with range [0, height)
572 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
573 * @tparam TSource The data type of the source elements e.g., 'uint8_t', or 'float'
574 * @tparam TTarget The data type of the target elements e.g., 'unsigned int', or 'float'
575 * @tparam tProcessorInstructions The set of available processor instructions needed
576 */
577 template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
578 static void filterVerticalSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows);
579
580 /**
581 * Applies a horizontal filter to a subset of an image with almost arbitrary data type.
582 * The filter response is written to an intermediate target frame with element type 'TIntermediate'.
583 * @param source The source frame to which the filter will be applied, must be valid
584 * @param target The (intermediate) target frame receiving the filter response, must be valid
585 * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
586 * @param channels The number of channels the source and target frame have, with range [1, infinity)
587 * @param horizontalFilter The (separable) horizontal filter to be applied, must be valid
588 * @param filterSize The number of filter elements, must be odd, with range [1, width]
589 * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
590 * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
591 * @param firstRow The first row to be handled, with range [0, height)
592 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
593 * @tparam T The data type of each pixel channel of the source frame, e.g., 'uint8_t', 'int', 'float', ...
594 * @tparam TIntermediate The data type of the intermediate target frame, should be either 'float' or 'double'
595 */
596 template <typename T, typename TIntermediate>
597 static void filterUniversalHorizontalSubset(const T* source, TIntermediate* target, const unsigned int width, const unsigned int channels, const float* horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
597
598 /**
599 * Applies a vertical filter to a subset of an image with almost arbitrary data type.
600 * The source frame is the intermediate frame with element type 'TIntermediate', the filter response is written to a target frame with element type 'T'.
601 * @param source The (intermediate) source frame to which the filter will be applied, must be valid
602 * @param target The target frame receiving the filter response, must be valid
603 * @param width The width of the source frame (and target frame) in pixel, with range [1, infinity)
604 * @param height The height of the source frame (and target frame) in pixel, with range [1, infinity)
605 * @param channels The number of channels the source and target frame have, with range [1, infinity)
606 * @param verticalFilter The (separable) vertical filter to be applied, must be valid
607 * @param filterSize The number of filter elements, must be odd, with range [1, width]
608 * @param sourcePaddingElements Optional number of padding elements at the end of each source row, in elements, with range [0, infinity)
609 * @param targetPaddingElements Optional number of padding elements at the end of each target row, in elements, with range [0, infinity)
610 * @param firstRow The first row to be handled, with range [0, height)
611 * @param numberRows The number of rows to be handled, with range [1, height - firstRow]
612 * @tparam T The data type of each pixel channel of the target frame, e.g., 'uint8_t', 'int', 'float', ...
613 * @tparam TIntermediate The data type of the intermediate source frame, should be either 'float' or 'double'
614 */
614 template <typename T, typename TIntermediate>
615 static void filterUniversalVerticalSubset(const TIntermediate* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows);
616
617 /**
618 * Mirrors a given value at the left border if necessary.
619 * Non-negative values are returned unchanged, negative values are mirrored as shown below:<br>
620 * <pre>
621 * Original: -3 -2 -1 | 0 1 2 3 4 5 6
622 * Result: 2 1 0 | 0 1 2 3 4 5 6
623 * </pre>
624 * @param value The value to be mirrored, with range (-infinity, infinity)
625 * @return The mirrored value (see the table above)
626 * @ingroup base
627 */
628 static inline unsigned int mirroredBorderLocationLeft(const int value);
629
630 /**
631 * Mirrors a given value at the right border if necessary.
632 * The value is mirrored according to a given size parameter.<br>
633 * Values below 'size' are returned unchanged, values at or beyond 'size' are mirrored as shown below:<br>
634 * <pre>
635 * Original: 4 5 6 ... s-2 s-1 | s s+1 s+2
636 * Result: 4 5 6 ... s-2 s-1 | s-1 s-2 s-3
637 * </pre>
638 * @param value The value to be mirrored, with range [0, 2*size)
639 * @param size Specified size defining the upper mirror border, with range [1, 2147483647]
640 * @return The mirrored value (see the table above)
641 * @ingroup base
642 */
643 static inline unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size);
644};
645
646#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 10
647
648/**
649 * Definition of a 128 bit SIMD data type holding four 32 bit values.
650 */
651template <>
653{
654 typedef __m128i Type;
655};
656
657/**
658 * Definition of a 128 bit SIMD data type holding four 32 bit values.
659 */
660template <>
662{
663 typedef __m128 Type;
664};
665
666#elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
667
668/**
669 * Definition of a 128 bit SIMD data type holding four 32 bit values.
670 */
671template <>
672struct FrameFilterSeparable::SIMD32x4<unsigned int>
673{
674 typedef uint32x4_t Type;
675};
676
677/**
678 * Definition of a 128 bit SIMD data type holding four 32 bit values.
679 */
680template <>
682{
683 typedef float32x4_t Type;
684};
685
686#endif
687
688template <typename T>
689bool FrameFilterSeparable::isFilterSymmetric(const T* filterValues, const size_t size)
690{
691 ocean_assert(filterValues != nullptr);
692 ocean_assert(size >= 1 && size % 2 == 1);
693
694 for (size_t n = 0; n < size / 2; ++n)
695 {
696 if (NumericT<T>::isNotEqual(filterValues[n], filterValues[size - n - 1]))
697 {
698 return false;
699 }
700 }
701
702 return true;
703}
704
705template <typename T>
706T FrameFilterSeparable::sumFilterValues(const T* filterValues, const size_t size)
707{
708 ocean_assert(filterValues != nullptr);
709 ocean_assert(size >= 1);
710
711 T sum = filterValues[0];
712
713 for (size_t n = 1; n < size; ++n)
714 {
715 sum += filterValues[n];
716 }
717
718 return sum;
719}
720
721#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
722
723template <>
724OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<unsigned int, PI_SSE_2>(typename SIMD32x4<unsigned int>::Type& value)
725{
726 // SSE2: _mm_setzero_si128
727
728 value = _mm_setzero_si128();
729}
730
731template <>
732OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<float, PI_SSE_2>(typename SIMD32x4<float>::Type& value)
733{
734 // SSE: _mm_set_ps1
735
736 value = _mm_set_ps1(0.0f);
737}
738
739template <>
740OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<unsigned int, PI_SSE_2>(const SIMD32x4<unsigned int>::Type& value, unsigned int* target)
741{
742 _mm_storeu_si128((__m128i*)target, value);
743}
744
745template <>
746OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<float, PI_SSE_2>(const SIMD32x4<float>::Type& value, float* target)
747{
748 _mm_storeu_si128((__m128i*)target, _mm_castps_si128(value));
749}
750
751#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
752
753#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
754
755template <>
756OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<unsigned int, PI_NEON>(typename SIMD32x4<unsigned int>::Type& value)
757{
758 value = vdupq_n_u32(0u);
759}
760
761template <>
762OCEAN_FORCE_INLINE void FrameFilterSeparable::setSIMDZero<float, PI_NEON>(typename SIMD32x4<float>::Type& value)
763{
764 value = vdupq_n_f32(0.0f);
765}
766
767template <>
768OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<unsigned int, PI_NEON>(const SIMD32x4<unsigned int>::Type& value, unsigned int* target)
769{
770 vst1q_u32(target, value);
771}
772
773template <>
774OCEAN_FORCE_INLINE void FrameFilterSeparable::writeSIMD<float, PI_NEON>(const SIMD32x4<float>::Type& value, float* target)
775{
776 vst1q_f32(target, value);
777}
778
779#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
780
781template <typename T>
782void FrameFilterSeparable::fillLeftExtraBorder(const T* source, const unsigned int channels, const unsigned int pixels, T* extendedRow)
783{
784 ocean_assert(source != nullptr && extendedRow != nullptr);
785
786 for (unsigned int n = 0u; n < pixels; ++n)
787 {
788 memcpy(extendedRow + n * channels, source + (pixels - n - 1u) * channels, sizeof(T) * channels);
789 }
790}
791
792template <typename T>
793void FrameFilterSeparable::fillRightExtraBorder(const T* sourceEnd, const unsigned int channels, const unsigned int pixels, T* extendedRow)
794{
795 ocean_assert(sourceEnd != nullptr && extendedRow != nullptr);
796
797 for (unsigned int n = 0u; n < pixels; ++n)
798 {
799 memcpy(extendedRow + n * channels, sourceEnd - (n + 1u) * int(channels), sizeof(T) * channels);
800 }
801}
802
803template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
804OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned int channels, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
805{
806 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
807 ocean_assert(channels >= 1u);
808 ocean_assert(filterSize % 2u == 1u);
809
810 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
811
812 unsigned int remainingElements = width * channels;
813
814 while (remainingElements >= 16u)
815 {
816 filterVerticalCoreRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
817
818 source += 16;
819 target += 16;
820
821 remainingElements -= 16u;
822 }
823
824 while (remainingElements >= 8u)
825 {
826 filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
827
828 source += 8;
829 target += 8;
830
831 remainingElements -= 8u;
832 }
833
834#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
835
836 while (remainingElements >= 4u)
837 {
838 filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, filter, filterSize, isSymmetric);
839
840 source += 4;
841 target += 4;
842
843 remainingElements -= 4u;
844 }
845
846 ocean_assert(width * channels >= 4u);
847 ocean_assert(remainingElements < 4u);
848
849 if (remainingElements != 0u)
850 {
851 const unsigned int shift = 4u - remainingElements;
852
853 filterVerticalCoreRow4Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, filter, filterSize, isSymmetric);
854 }
855
856#else
857
858 ocean_assert(width * channels >= 8u);
859 ocean_assert(remainingElements < 8u);
860
861 if (remainingElements != 0u)
862 {
863 const unsigned int shift = 8u - remainingElements;
864
865 filterVerticalCoreRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, filter, filterSize, isSymmetric);
866 }
867
868#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
869}
870
871#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
872
873template <>
874OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
875{
876 ocean_assert(source != nullptr && target != nullptr);
877 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
878
879 /**
880 * This function uses the following SSE instructions, and needs SSE2 or higher
881 *
882 * SSE1:
883 * _mm_set_ps1
884 * _mm_mul_ps
885 * _mm_add_ps
886 * _mm_loadu_ps
887 *
888 * SSE2:
889 * _mm_loadu_si128
890 * _mm_cvtepi32_ps
891 * _mm_add_epi32
892 * _mm_cvtps_epi32
893 * _mm_packs_epi32
894 * _mm_packus_epi16
895 */
896
897 const unsigned int filterSize_2 = filterSize / 2u;
898
899 const __m128i* sourceBlock = (const __m128i*)source;
900
901 // we store one filter value in each of the four 32 bit integer values
902 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
903
904 // now we load four input values, and multiply each of them with the center kernel value
905 __m128 source128 = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock));
906 __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
907
908 // now we proceed with the remaining filter values
909 for (unsigned int i = 1u; i <= filterSize_2; ++i)
910 {
911 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
912 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
913
914 if (isSymmetric)
915 {
916 // we have a symmetric filter, so let's do some optimizations
917 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
918
919 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
920
921 __m128i source128i = _mm_add_epi32(_mm_loadu_si128(sourceMinus), _mm_loadu_si128(sourcePlus));
922
923 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128i), filterFactor_32x4));
924 }
925 else
926 {
927 // we don't have a symmetric filter, so we need to handle two individual filters
928 __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
929 __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
930
931 __m128i source128iMinus = _mm_loadu_si128(sourceMinus);
932 __m128i source128iPlus = _mm_loadu_si128(sourcePlus);
933
934 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iMinus), filterFactor128Minus));
935 result128 = _mm_add_ps(result128, _mm_mul_ps(_mm_cvtepi32_ps(source128iPlus), filterFactor128Plus));
936 }
937 }
938
939 // now we have 8 bit values in each 32 bit register
940
941 __m128i source128i = _mm_cvtps_epi32(result128);
942 source128i = _mm_packs_epi32(source128i, source128i);
943 source128i = _mm_packus_epi16(source128i, source128i);
944
945 *((unsigned int*)target) = SSE::value_u32<0u>(source128i);
946}
947
948template <>
949OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow4Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
950{
951 ocean_assert(source != nullptr && target != nullptr);
952 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
953
954 /**
955 * This function uses the following SSE instructions, and needs SSE2 or higher
956 *
957 * SSE:
958 * _mm_set_ps1
959 * _mm_mul_ps
960 * _mm_add_ps
961 *
962 * SSE2:
963 * _mm_loadu_si128
964 * _mm_castsi128_ps
965 */
966
967 const unsigned int filterSize_2 = filterSize / 2u;
968
969 const __m128i* sourceBlock = (const __m128i*)source;
970
971 // we store one filter value in each of the four 32 bit values
972 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
973
974 // now we load four input values, and multiply each of them with the center kernel value
975 __m128 source128 = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
976 __m128 result128 = _mm_mul_ps(source128, filterFactor_32x4);
977
978 // now we proceed with the remaining filter values
979 for (unsigned int i = 1u; i <= filterSize_2; ++i)
980 {
981 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
982 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
983
984 if (isSymmetric)
985 {
986 // we have a symmetric filter, so let's do some optimizations
987 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
988
989 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
990
991 source128 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus)));
992
993 result128 = _mm_add_ps(result128, _mm_mul_ps(source128, filterFactor_32x4));
994 }
995 else
996 {
997 // we don't have a symmetric filter, so we need to handle two individual filters
998 __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
999 __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1000
1001 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1002 __m128 source128Minus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus));
1003 __m128 source128Plus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus));
1004
1005 result128 = _mm_add_ps(result128, _mm_mul_ps(source128Minus, filterFactor_32x4Minus));
1006 result128 = _mm_add_ps(result128, _mm_mul_ps(source128Plus, filterFactor_32x4Plus));
1007 }
1008 }
1009
1010 writeSIMD<float, PI_SSE_2>(result128, target);
1011}
1012
1013template <>
1014OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1015{
1016 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1017 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1018
1019 /**
1020 * This function uses the following SSE instructions, and needs SSE2 or higher
1021 *
1022 * SSE1:
1023 * _mm_set_ps1
1024 * _mm_mul_ps
1025 * _mm_add_ps
1026 * _mm_loadu_ps
1027 *
1028 * SSE2:
1029 * _mm_loadu_si128
1030 * _mm_cvtepi32_ps
1031 * _mm_add_epi32
1032 * _mm_cvtps_epi32
1033 * _mm_packs_epi32
1034 * _mm_packus_epi16
1035 * _mm_storel_epi64
1036 */
1037
1038 const unsigned int filterSize_2 = filterSize / 2u;
1039
1040 const __m128i* sourceBlock = (const __m128i*)source;
1041
1042 // we store one filter value in each of the four 32 bit integer values
1043 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1044
1045 // now we load four input values, and multiply each of them with the center kernel value
1046 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1047 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1048
1049 // now we load the next four input values, ...
1050 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1051 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1052
1053 // now we proceed with the remaining filter values
1054 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1055 {
1056 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1057 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1058
1059 if (isSymmetric)
1060 {
1061 // we have a symmetric filter, so let's do some optimizations
1062 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1063
1064 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1065 __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1066 __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1067
1068 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1069 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1070 }
1071 else
1072 {
1073 // we don't have a symmetric filter, so we need to handle two individual filters
1074 __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1075 __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1076
1077 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1078 __m128i source128aiMinus =_mm_loadu_si128(sourceMinus + 0);
1079 __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1080 __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1081 __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1082
1083 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1084 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1085
1086 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1087 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
1088 }
1089 }
1090
1091 // now we have 8 bit values in each 32 bit register
1092 __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1093 result128 = _mm_packus_epi16(result128, result128);
1094
1095 _mm_storel_epi64((__m128i*)target, result128);
1096}
1097
1098template <>
1099OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1100{
1101 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1102 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1103
1104 /**
1105 * This function uses the following SSE instructions, and needs SSE2 or higher
1106 *
1107 * SSE:
1108 * _mm_set_ps1
1109 * _mm_mul_ps
1110 * _mm_add_ps
1111 *
1112 * SSE2:
1113 * _mm_loadu_si128
1114 * _mm_castsi128_ps
1115 */
1116
1117 const unsigned int filterSize_2 = filterSize / 2u;
1118
1119 const __m128i* sourceBlock = (const __m128i*)source;
1120
1121 // we store one filter value in each of the four 32 bit values
1122 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1123
1124 // now we load four input values, and multiply each of them with the center kernel value
1125 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1126 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1127
1128 // now we load the next four input values, ...
1129 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1130 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1131
1132 // now we proceed with the remaining filter values
1133 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1134 {
1135 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1136 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1137
1138 if (isSymmetric)
1139 {
1140 // we have a symmetric filter, so let's do some optimizations
1141 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1142
1143 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1144
1145 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1146 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1147
1148 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1149 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1150 }
1151 else
1152 {
1153 // we don't have a symmetric filter, so we need to handle two individual filters
1154 __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1155 __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1156
1157 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1158 __m128 source128aMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1159 __m128 source128aPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1160 __m128 source128bMinus = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1161 __m128 source128bPlus = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1162
1163 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aMinus, filterFactor_32x4Minus));
1164 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source128aPlus, filterFactor_32x4Plus));
1165
1166 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bMinus, filterFactor_32x4Minus));
1167 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source128bPlus, filterFactor_32x4Plus));
1168 }
1169 }
1170
1171 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1172 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1173}
1174
1175#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1176
1177#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1178
/**
 * NEON specialization for 'unsigned int' sources and 'uint8_t' targets: determines eight vertical filter responses of a core row.
 * The center row is weighted with filter[filterSize / 2]; for each distance i in [1, filterSize / 2] the two rows
 * 'source - i * sourceStrideElements' and 'source + i * sourceStrideElements' are read and accumulated.
 * The float responses are converted to 32 bit unsigned integers and narrowed with saturation to 8 bit.
 * @param source The first of the eight center row elements, must be valid
 * @param target The memory receiving the eight filter responses, must be valid
 * @param sourceStrideElements The number of elements between the start of two successive source rows
 * @param filter The float filter factors, must be valid
 * @param filterSize The number of filter factors, with range [1, infinity), must be odd
 * @param isSymmetric True, to read only the factors filter[filterSize / 2 + i] and to apply each of them to both rows with distance i
 */
template <>
OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
{
	// same preconditions as checked by the SSE specializations of this function
	ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
	ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);

	const unsigned int filterSize_2 = filterSize / 2u;

	// we store the center filter factor in each of the four 32 bit float lanes
	float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);

	// now we load the first four input values, convert them to float, and multiply each of them with the center filter factor
	float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
	float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);

	// the same for the next four input values
	float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
	float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);

	// now we proceed with the remaining filter values, moving from the center row towards the filter's borders
	for (unsigned int i = 1u; i <= filterSize_2; ++i)
	{
		const unsigned int* sourceMinus = source - sourceStrideElements * i;
		const unsigned int* sourcePlus = source + sourceStrideElements * i;

		if (isSymmetric)
		{
			// we have a symmetric filter, so both rows share the same filter factor
			filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);

			// we sum the integer values of the upper and the lower row first (as both will be multiplied with the same filter value)
			uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
			uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));

			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
		}
		else
		{
			// we don't have a symmetric filter, so the upper and the lower row need individual filter factors

			float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
			float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);

			uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);

			uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);

			uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);

			uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
		}
	}

	// we convert the eight float responses to 32 bit unsigned integers and narrow them with saturation to 16 bit
	// NOTE(review): vcvtq_u32_f32 truncates toward zero, while the SSE2 specialization converts with _mm_cvtps_epi32 (which rounds according to the current rounding mode) -- confirm that this platform difference is intended
	uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));

	// finally, we narrow the eight 16 bit values with saturation to 8 bit
	uint8x8_t result64 = vqmovn_u16(result128ab);

	vst1_u8(target, result64);
}
1240
/**
 * NEON specialization for float sources and float targets: determines eight vertical filter responses of a core row.
 * The center row is weighted with filter[filterSize / 2]; for each distance i in [1, filterSize / 2] the two rows
 * 'source - i * sourceStrideElements' and 'source + i * sourceStrideElements' are read and accumulated.
 * @param source The first of the eight center row elements, must be valid
 * @param target The memory receiving the eight filter responses, must be valid
 * @param sourceStrideElements The number of elements between the start of two successive source rows
 * @param filter The filter factors, must be valid
 * @param filterSize The number of filter factors, with range [1, infinity), must be odd
 * @param isSymmetric True, to read only the factors filter[filterSize / 2 + i] and to apply each of them to both rows with distance i
 */
template <>
OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
{
	// same preconditions as checked by the SSE specializations of this function
	ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
	ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);

	const unsigned int filterSize_2 = filterSize / 2u;

	// we store the center filter factor in each of the four 32 bit float lanes
	float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);

	// now we load the first four input values, and multiply each of them with the center filter factor
	float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
	float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);

	// the same for the next four input values
	float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
	float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);

	// now we proceed with the remaining filter values, moving from the center row towards the filter's borders
	for (unsigned int i = 1u; i <= filterSize_2; ++i)
	{
		const float* sourceMinus = source - sourceStrideElements * i;
		const float* sourcePlus = source + sourceStrideElements * i;

		if (isSymmetric)
		{
			// we have a symmetric filter, so both rows share the same filter factor
			filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);

			// we sum the values of the upper and the lower row first (as both will be multiplied with the same filter value)
			source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
			source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));

			result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
			result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
		}
		else
		{
			// we don't have a symmetric filter, so the upper and the lower row need individual filter factors

			float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
			float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);

			float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
			float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);

			float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
			float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);

			// the upper row is accumulated before the lower row
			result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
			result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);

			result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
			result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
		}
	}

	// the eight float responses are written without any further conversion
	vst1q_f32(target + 0, result_32x4a);
	vst1q_f32(target + 4, result_32x4b);
}
1298
1299#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1300
1301#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
1302
1303template <>
1304OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1305{
1306 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1307 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1308
1309 /**
1310 * This function uses the following SSE instructions, and needs SSE2 or higher
1311 *
1312 * SSE1:
1313 * _mm_set_ps1
1314 * _mm_mul_ps
1315 * _mm_add_ps
1316 * _mm_loadu_ps
1317 *
1318 * SSE2:
1319 * _mm_loadu_si128
1320 * _mm_cvtepi32_ps
1321 * _mm_add_epi32
1322 * _mm_cvtps_epi32
1323 * _mm_packs_epi32
1324 * _mm_packus_epi16
1325 * _mm_storeu_si128
1326 */
1327
1328 /**
1329 * We determine 16 filter responses within one loop iteration.
1330 * For a filter with size 5 for 1 channel frames we apply the following strategy:
1331 *
1332 * Source Data:
1333 * Y
1334 * Y
1335 * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1336 * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1337 * Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <------------
1338 * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1339 * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1340 * Y
1341 * Y
1342 *
1343 * Further, we use the fact that the filter kernel is symmetric so that we start at the center row (the target row) and then going to the filter's borders
1344 *
1345 * For frames with n channels the strategy stays the same.
1346 */
1347
1348 const unsigned int filterSize_2 = filterSize / 2u;
1349
1350 const __m128i* sourceBlock = (const __m128i*)source;
1351
1352 // we store one filter value in each of the four 32 bit integer values
1353 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1354
1355 // now we load four input values, and multiply each of them with the center kernel value
1356 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1357 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1358
1359 // now we load the next four input values, ...
1360 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1361 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1362
1363 __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
1364 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1365
1366 __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
1367 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
1368
1369 // now we proceed with the remaining filter values
1370 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1371 {
1372 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1373 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1374
1375 if (isSymmetric)
1376 {
1377 // we have a symmetric filter, so let's do some optimizations
1378 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1379
1380 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1381
1382 __m128i source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 0), _mm_loadu_si128(sourcePlus + 0));
1383 __m128i source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 1), _mm_loadu_si128(sourcePlus + 1));
1384
1385 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1386 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1387
1388 source128ai = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 2), _mm_loadu_si128(sourcePlus + 2));
1389 source128bi = _mm_add_epi32(_mm_loadu_si128(sourceMinus + 3), _mm_loadu_si128(sourcePlus + 3));
1390
1391 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1392 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1393 }
1394 else
1395 {
1396 // we don't have a symmetric filter, so we need to handle two individual filters
1397 __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1398 __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1399
1400 __m128i source128aiMinus = _mm_loadu_si128(sourceMinus + 0);
1401 __m128i source128aiPlus = _mm_loadu_si128(sourcePlus + 0);
1402
1403 __m128i source128biMinus = _mm_loadu_si128(sourceMinus + 1);
1404 __m128i source128biPlus = _mm_loadu_si128(sourcePlus + 1);
1405
1406 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
1407 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
1408
1409 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
1410 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
1411
1412 __m128i source128ciMinus = _mm_loadu_si128(sourceMinus + 2);
1413 __m128i source128ciPlus = _mm_loadu_si128(sourcePlus + 2);
1414
1415 __m128i source128diMinus = _mm_loadu_si128(sourceMinus + 3);
1416 __m128i source128diPlus = _mm_loadu_si128(sourcePlus + 3);
1417
1418 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
1419 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
1420
1421 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
1422 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
1423 }
1424 }
1425
1426 // now we have 8 bit values in each 32 bit register
1427 __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1428 __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
1429 __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
1430
1431 _mm_storeu_si128((__m128i*)target, result128);
1432}
1433
1434template <>
1435OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1436{
1437 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1438 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1439
1440 /**
1441 * This function uses the following SSE instructions, and needs SSE2 or higher
1442 *
1443 * SSE:
1444 * _mm_set_ps1
1445 * _mm_mul_ps
1446 * _mm_add_ps
1447 *
1448 * SSE2:
1449 * _mm_loadu_si128
1450 * _mm_castsi128_ps
1451 */
1452
1453 /**
1454 * We determine 16 filter responses within one loop iteration.
1455 * For a filter with size 5 for 1 channel frames we apply the following strategy:
1456 *
1457 * Source Data:
1458 * Y
1459 * Y
1460 * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1461 * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1462 * Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <------------
1463 * Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
1464 * Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1465 * Y
1466 * Y
1467 *
1468 * Further, we use the fact that the filter kernel is symmetric so that we start at the center row (the target row) and then going to the filter's borders
1469 *
1470 * For frames with n channels the strategy stays the same.
1471 */
1472
1473 const unsigned int filterSize_2 = filterSize / 2u;
1474
1475 const __m128i* sourceBlock = (const __m128i*)source;
1476
1477 // we store one filter value in each of the four 32 bit values
1478 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1479
1480 // now we load four input values, and multiply each of them with the center kernel value
1481 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1482 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1483
1484 // now we load the next four input values, ...
1485 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1486 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1487
1488 __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
1489 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
1490
1491 __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
1492 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
1493
1494 // now we proceed with the remaining filter values
1495 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1496 {
1497 const __m128i* sourceMinus = (const __m128i*)(source - sourceStrideElements * i);
1498 const __m128i* sourcePlus = (const __m128i*)(source + sourceStrideElements * i);
1499
1500 if (isSymmetric)
1501 {
1502 // we have a symmetric filter, so let's do some optimizations
1503 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1504
1505 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1506
1507 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0)));
1508 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1)));
1509
1510 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1511 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1512
1513 source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2)));
1514 source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3)));
1515
1516 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
1517 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
1518 }
1519 else
1520 {
1521 // we don't have a symmetric filter, so we need to handle two individual filters
1522 __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1523 __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1524
1525 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 0));
1526 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1527
1528 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 1));
1529 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1530
1531 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 2));
1532 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
1533
1534 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceMinus + 3));
1535 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
1536
1537 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 0));
1538 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1539
1540 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 1));
1541 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
1542
1543 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 2));
1544 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
1545
1546 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourcePlus + 3));
1547 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
1548 }
1549 }
1550
1551 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1552 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1553 writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
1554 writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
1555}
1556
1557#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1558
1559#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1560
/**
 * NEON specialization for 'unsigned int' sources and 'uint8_t' targets: determines 16 vertical filter responses of a core row.
 * The center row is weighted with filter[filterSize / 2]; for each distance i in [1, filterSize / 2] the two rows
 * 'source - i * sourceStrideElements' and 'source + i * sourceStrideElements' are read and accumulated.
 * The float responses are converted to 32 bit unsigned integers and narrowed with saturation to 8 bit.
 * @param source The first of the 16 center row elements, must be valid
 * @param target The memory receiving the 16 filter responses, must be valid
 * @param sourceStrideElements The number of elements between the start of two successive source rows
 * @param filter The float filter factors, must be valid
 * @param filterSize The number of filter factors, with range [1, infinity), must be odd
 * @param isSymmetric True, to read only the factors filter[filterSize / 2 + i] and to apply each of them to both rows with distance i
 */
template <>
OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
{
	// same preconditions as checked by the SSE specializations of this function
	ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
	ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);

	const unsigned int filterSize_2 = filterSize / 2u;

	// we store the center filter factor in each of the four 32 bit float lanes
	float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);

	// now we load four times four input values, convert them to float, and multiply each of them with the center filter factor
	float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
	float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);

	float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
	float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);

	float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
	float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);

	float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
	float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);

	// now we proceed with the remaining filter values, moving from the center row towards the filter's borders
	for (unsigned int i = 1u; i <= filterSize_2; ++i)
	{
		const unsigned int* sourceMinus = source - sourceStrideElements * i;
		const unsigned int* sourcePlus = source + sourceStrideElements * i;

		if (isSymmetric)
		{
			// we have a symmetric filter, so both rows share the same filter factor
			filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);

			// we sum the integer values of the upper and the lower row first (as both will be multiplied with the same filter value)
			uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
			uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));

			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);

			// the same for the elements [8, 15]
			source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
			source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));

			result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
			result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
		}
		else
		{
			// we don't have a symmetric filter, so the upper and the lower row need individual filter factors

			float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
			float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);

			uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);

			uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);

			uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
			result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);

			uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
			result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);

			uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
			result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);

			uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
			result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);

			uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
			result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);

			uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
			result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
		}
	}

	// we convert the 16 float responses to 32 bit unsigned integers and narrow them with saturation to 16 bit
	// NOTE(review): vcvtq_u32_f32 truncates toward zero, while the SSE2 specialization converts with _mm_cvtps_epi32 (which rounds according to the current rounding mode) -- confirm that this platform difference is intended
	uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
	uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));

	// finally, we narrow the 16 values with saturation to 8 bit
	uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));

	vst1q_u8(target, result128);
}
1647
/**
 * NEON specialization for float sources and float targets: determines 16 vertical filter responses of a core row.
 * The center row is weighted with filter[filterSize / 2]; for each distance i in [1, filterSize / 2] the two rows
 * 'source - i * sourceStrideElements' and 'source + i * sourceStrideElements' are read and accumulated.
 * @param source The first of the 16 center row elements, must be valid
 * @param target The memory receiving the 16 filter responses, must be valid
 * @param sourceStrideElements The number of elements between the start of two successive source rows
 * @param filter The filter factors, must be valid
 * @param filterSize The number of filter factors, with range [1, infinity), must be odd
 * @param isSymmetric True, to read only the factors filter[filterSize / 2 + i] and to apply each of them to both rows with distance i
 */
template <>
OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalCoreRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const float* filter, const unsigned int filterSize, const bool isSymmetric)
{
	// same preconditions as checked by the SSE specializations of this function
	ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
	ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);

	const unsigned int filterSize_2 = filterSize / 2u;

	// we store the center filter factor in each of the four 32 bit float lanes
	float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);

	// now we load four times four input values, and multiply each of them with the center filter factor
	float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
	float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);

	float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
	float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);

	float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
	float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);

	float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
	float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);

	// now we proceed with the remaining filter values, moving from the center row towards the filter's borders
	for (unsigned int i = 1u; i <= filterSize_2; ++i)
	{
		const float* sourceMinus = source - sourceStrideElements * i;
		const float* sourcePlus = source + sourceStrideElements * i;

		if (isSymmetric)
		{
			// we have a symmetric filter, so both rows share the same filter factor
			filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);

			// we sum the values of the upper and the lower row first (as both will be multiplied with the same filter value)
			source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
			source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));

			result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
			result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);

			// the same for the elements [8, 15]
			source_32x4c = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
			source_32x4d = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));

			result_32x4c = vmlaq_f32(result_32x4c, source_32x4c, filterFactor_32x4);
			result_32x4d = vmlaq_f32(result_32x4d, source_32x4d, filterFactor_32x4);
		}
		else
		{
			// we don't have a symmetric filter, so the upper and the lower row need individual filter factors

			float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
			float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);

			// elements [0, 7], the upper row is accumulated before the lower row
			float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
			float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);

			float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
			float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);

			result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
			result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);

			result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
			result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);

			// elements [8, 15], the upper row is accumulated before the lower row
			float32x4_t source128cMinus = vld1q_f32(sourceMinus + 4 * 2);
			float32x4_t source128cPlus = vld1q_f32(sourcePlus + 4 * 2);

			float32x4_t source128dMinus = vld1q_f32(sourceMinus + 4 * 3);
			float32x4_t source128dPlus = vld1q_f32(sourcePlus + 4 * 3);

			result_32x4c = vmlaq_f32(result_32x4c, source128cMinus, filterFactor128Minus);
			result_32x4d = vmlaq_f32(result_32x4d, source128dMinus, filterFactor128Minus);

			result_32x4c = vmlaq_f32(result_32x4c, source128cPlus, filterFactor128Plus);
			result_32x4d = vmlaq_f32(result_32x4d, source128dPlus, filterFactor128Plus);
		}
	}

	// the 16 float responses are written without any further conversion
	vst1q_f32(target + 0, result_32x4a);
	vst1q_f32(target + 4, result_32x4b);
	vst1q_f32(target + 8, result_32x4c);
	vst1q_f32(target + 12, result_32x4d);
}
1731
1732#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
1733
1734#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
1735
1736template <>
1737OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1738{
1739 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1740 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1741
1742 /**
1743 * This function uses the following SSE instructions, and needs SSE2 or higher
1744 *
1745 * SSE1:
1746 * _mm_set_ps1
1747 * _mm_mul_ps
1748 * _mm_add_ps
1749 * _mm_loadu_ps
1750 *
1751 * SSE2:
1752 * _mm_loadu_si128
1753 * _mm_cvtepi32_ps
1754 * _mm_add_epi32
1755 * _mm_cvtps_epi32
1756 * _mm_packs_epi32
1757 * _mm_packus_epi16
1758 * _mm_storel_epi64
1759 */
1760
1761 const unsigned int filterSize_2 = filterSize / 2u;
1762
1763 // the border covers row ids within the range [0, filterSize_2)
1764 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1765
1766 const __m128i* sourceBlock = (const __m128i*)source;
1767
1768 // we store one filter value in each of the four 32 bit integer values
1769 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1770
1771 // now we load four input values, and multiply each of them with the center kernel value
1772 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
1773 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1774
1775 // now we load the next four input values, ...
1776 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
1777 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1778
1779 __m128i source128ai, source128bi;
1780
1781 // now we proceed with the remaining filter values
1782 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1783 {
1784 // we determine the mirrored locations (and the row offset in relation to the current row)
1785 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1786 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1787
1788 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1789 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1790 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1791
1792 if (isSymmetric)
1793 {
1794 // we have a symmetric filter, so let's do some optimizations
1795 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1796
1797 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1798
1799 source128ai = _mm_add_epi32(_mm_loadu_si128((const __m128i*)sourceMinus + 0), _mm_loadu_si128((const __m128i*)sourcePlus + 0));
1800 source128bi = _mm_add_epi32(_mm_loadu_si128((const __m128i*)sourceMinus + 1), _mm_loadu_si128((const __m128i*)sourcePlus + 1));
1801
1802 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
1803 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
1804 }
1805 else
1806 {
1807 // we don't have a symmetric filter, so we need to handle two individual filters
1808
1809 __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1810 __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1811
1812 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1813
1814 source128ai = _mm_loadu_si128((const __m128i*)sourceMinus + 0);
1815 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Minus));
1816
1817 source128bi = _mm_loadu_si128((const __m128i*)sourceMinus + 1);
1818 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Minus));
1819
1820 source128ai = _mm_loadu_si128((const __m128i*)sourcePlus + 0);
1821 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor128Plus));
1822
1823 source128bi = _mm_loadu_si128((const __m128i*)sourcePlus + 1);
1824 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor128Plus));
1825 }
1826 }
1827
1828 // now we have 8 bit values in each 32 bit register
1829 __m128i result128 = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
1830 result128 = _mm_packus_epi16(result128, result128);
1831
1832 _mm_storel_epi64((__m128i*)target, result128);
1833}
1834
1835template <>
1836OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1837{
1838 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1839 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
1840
1841 /**
1842 * This function uses the following SSE instructions, and needs SSE2 or higher
1843 *
1844 * SSE:
1845 * _mm_set_ps1
1846 * _mm_mul_ps
1847 * _mm_add_ps
1848 *
1849 * SSE2:
1850 * _mm_loadu_si128
1851 * _mm_castsi128_ps
1852 */
1853
1854 const unsigned int filterSize_2 = filterSize / 2u;
1855
1856 // the border covers row ids within the range [0, filterSize_2)
1857 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1858
1859 const __m128i* sourceBlock = (const __m128i*)source;
1860
1861 // we store one filter value in each of the four 32 bit values
1862 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
1863
1864 // now we load four input values, and multiply each of them with the center kernel value
1865 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
1866 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
1867
1868 // now we load the next four input values, ...
1869 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
1870 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
1871
1872 // now we proceed with the remaining filter values
1873 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1874 {
1875 // we determine the mirrored locations (and the row offset in relation to the current row)
1876 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1877 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1878
1879 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1880 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1881 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1882
1883 if (isSymmetric)
1884 {
1885 // we have a symmetric filter, so let's do some optimizations
1886 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
1887
1888 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1889
1890 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0)));
1891 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1)));
1892
1893 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
1894 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
1895 }
1896 else
1897 {
1898 // we don't have a symmetric filter, so we need to handle two individual filters
1899 __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
1900 __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
1901
1902 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0));
1903 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
1904
1905 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1));
1906 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
1907
1908 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0));
1909 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
1910
1911 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1));
1912 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
1913 }
1914 }
1915
1916 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
1917 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
1918}
1919
1920#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
1921
1922#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
1923
1924template <>
1925OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
1926{
1927 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
1928 ocean_assert(filterSize % 2u == 1u);
1929
1930 const unsigned int filterSize_2 = filterSize / 2u;
1931
1932 // the border covers row ids within the range [0, filterSize_2)
1933 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
1934
1935 // we store one filter value in each of the four 32 bit integer values
1936 float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
1937
1938 // now we load four input values, and multiply each of them with the center kernel value
1939 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
1940 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
1941
1942 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
1943 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
1944
1945 // now we proceed with the remaining filter values
1946 for (unsigned int i = 1u; i <= filterSize_2; ++i)
1947 {
1948 // we determine the mirrored locations (and the row offset in relation to the current row)
1949 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
1950 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
1951
1952 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
1953 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
1954 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
1955
1956 if (isSymmetric)
1957 {
1958 // we have a symmetric filter, so let's do some optimizations
1959 filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
1960
1961 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
1962
1963 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
1964 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
1965
1966 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
1967 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
1968 }
1969 else
1970 {
1971 // we don't have a symmetric filter, so we need to handle two individual filters
1972
1973 float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
1974 float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
1975
1976 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
1977 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
1978
1979 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
1980 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
1981
1982 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
1983 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
1984
1985 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
1986 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
1987 }
1988 }
1989
1990 // now we have 8 bit values in each 32 bit register
1991 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
1992
1993 uint8x8_t result64 = vqmovn_u16(result128ab);
1994
1995 vst1_u8(target, result64);
1996}
1997
1998template <>
1999OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow8Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2000{
2001 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2002 ocean_assert(filterSize % 2u == 1u);
2003
2004 const unsigned int filterSize_2 = filterSize / 2u;
2005
2006 // the border covers row ids within the range [0, filterSize_2)
2007 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2008
2009 // we store one filter value in each of the four 32 bit integer values
2010 float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
2011
2012 // now we load four input values, and multiply each of them with the center kernel value
2013 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
2014 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
2015
2016 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
2017 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
2018
2019 // now we proceed with the remaining filter values
2020 for (unsigned int i = 1u; i <= filterSize_2; ++i)
2021 {
2022 // we determine the mirrored locations (and the row offset in relation to the current row)
2023 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2024 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2025
2026 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2027 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2028 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2029
2030 if (isSymmetric)
2031 {
2032 // we have a symmetric filter, so let's do some optimizations
2033 filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
2034
2035 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2036
2037 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
2038 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
2039
2040 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
2041 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
2042 }
2043 else
2044 {
2045 // we don't have a symmetric filter, so we need to handle two individual filters
2046
2047 float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
2048 float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
2049
2050 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
2051 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
2052
2053 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
2054 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
2055
2056 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
2057 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
2058
2059 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
2060 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
2061 }
2062 }
2063
2064 vst1q_f32(target + 0, result_32x4a);
2065 vst1q_f32(target + 4, result_32x4b);
2066}
2067
2068#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2069
2070#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2071
2072template <>
2073OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2074{
2075 /**
2076 * This function uses the following SSE instructions, and needs SSE2 or higher
2077 *
2078 * SSE2:
2079 * _mm_set1_epi32
2080 * _mm_unpacklo_epi8
2081 * _mm_unpackhi_epi16
2082 * _mm_setzero_si128
2083 * _mm_madd_epi16
2084 * _mm_add_epi32
2085 */
2086
2087 // we store one filter value in each of the four 32 bit integer values
2088 __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2089
2090 // we load four source values into the lower 32 bit of our 128 bit register
2091 __m128i source128 = _mm_set1_epi32(*((const int*)source));
2092
2093 // we separate the source values to receive 16 bit integers
2094 source128 = _mm_unpacklo_epi8(source128, _mm_setzero_si128());
2095
2096 // we separate the 16 bit values further so that we receive 32 bit integers
2097 source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2098
2099 // we multiply each value with the same filter factor, and sum the result
2100 source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2101
2102 // we add the local result to the sum parameters
2103 target = _mm_add_epi32(target, source128);
2104}
2105
2106template <>
2107OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2108{
2109 /**
2110 * This function uses the following SSE instructions, and needs SSE2 or higher
2111 *
2112 * SSE:
2113 * _mm_set_ps1
2114 * _mm_mul_ps
2115 * _mm_add_ps
2116 *
2117 * SSE2:
2118 * _mm_loadu_si128
2119 * _mm_castsi128_ps
2120 */
2121
2122 // we store one filter value in each of the four 32 bit values
2123 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2124
2125 // we load 8 source values into two 128 bit registers
2126 __m128 source_32x4 = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source));
2127
2128 // we multiply each value with the same filter factor
2129 source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2130
2131 // we add the local result to the sum parameters
2132 target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2133}
2134
2135template <>
2136OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target)
2137{
2138 /**
2139 * This function uses the following SSE instructions, and needs SSE2 or higher
2140 *
2141 * SSE2:
2142 * _mm_set1_epi32
2143 * _mm_unpacklo_epi8
2144 * _mm_unpackhi_epi16
2145 * _mm_setzero_si128
2146 * _mm_madd_epi16
2147 * _mm_add_epi32
2148 */
2149
2150 // we store one filter value in each of the four 32 bit integer values
2151 __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2152
2153 // we load 4 source values from the left side and 4 source values from the right side, we separate the values to receive 16 bit integers and add them together
2154 __m128i source128 = _mm_add_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(*((const int*)sourceLeft)), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_set1_epi32(*((const int*)sourceRight)), _mm_setzero_si128()));
2155
2156 // we separate the 16 bit values further so that we receive 32 bit integers
2157 source128 = _mm_unpackhi_epi16(source128, _mm_setzero_si128());
2158
2159 // we multiply each value with the same filter factor, and sum the result
2160 source128 = _mm_madd_epi16(source128, filterFactor_32x4);
2161
2162 // we add the local result to the sum parameters
2163 target = _mm_add_epi32(target, source128);
2164}
2165
2166template <>
2167OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2168{
2169 /**
2170 * This function uses the following SSE instructions, and needs SSE2 or higher
2171 *
2172 * SSE:
2173 * _mm_set_ps1
2174 * _mm_mul_ps
2175 * _mm_add_ps
2176 *
2177 * SSE2:
2178 * _mm_loadu_si128
2179 * _mm_castsi128_ps
2180 */
2181
2182 // we store one filter value in each of the four 32 bit values
2183 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2184
2185 // we load 4 * 2 source values and add them together
2186 __m128 source_32x4 = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight)));
2187
2188 // we multiply each value with the same filter factor
2189 source_32x4 = _mm_mul_ps(source_32x4, filterFactor_32x4);
2190
2191 // we add the local result to the sum parameters
2192 target_32x4 = _mm_add_ps(target_32x4, source_32x4);
2193}
2194
2195template <>
2196OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2197{
2198 /**
2199 * This function uses the following SSE instructions, and needs SSE2 or higher
2200 *
2201 * SSE2:
2202 * _mm_set1_epi32
2203 * _mm_loadl_epi64
2204 * _mm_unpacklo_epi8
2205 * _mm_unpackhi_epi16
2206 * _mm_unpacklo_epi16
2207 * _mm_setzero_si128
2208 * _mm_madd_epi16
2209 * _mm_add_epi32
2210 */
2211
2212 // we store one filter value in each of the four 32 bit integer values
2213 __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2214
2215 // we load eight source values into the lower 64 bit of our 128 bit register
2216 __m128i source_32x4a = _mm_loadl_epi64((const __m128i*)source);
2217
2218 // we separate the source values to receive 16 bit integers
2219 source_32x4a = _mm_unpacklo_epi8(source_32x4a, _mm_setzero_si128());
2220
2221 // we separate the 16 bit values further so that we receive 32 bit integers
2222 __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2223 source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2224
2225 // we multiply each value with the same filter factor, and sum the result
2226 source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2227 source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2228
2229 // we add the local result to the sum parameters
2230 target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2231 target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2232}
2233
2234template <>
2235OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2236{
2237 /**
2238 * This function uses the following SSE instructions, and needs SSE2 or higher
2239 *
2240 * SSE:
2241 * _mm_set_ps1
2242 * _mm_mul_ps
2243 * _mm_add_ps
2244 *
2245 * SSE2:
2246 * _mm_loadu_si128
2247 * _mm_castsi128_ps
2248 */
2249
2250 // we store one filter value in each of the four 32 bit values
2251 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2252
2253 // we load 8 source values into two 128 bit registers
2254 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source + 0));
2255 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)source + 1));
2256
2257 // we multiply each value with the same filter factor
2258 source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2259 source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2260
2261 // we add the local result to the sum parameters
2262 target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2263 target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2264}
2265
2266template <>
2267OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_SSE_2>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2268{
2269 /**
2270 * This function uses the following SSE instructions, and needs SSE2 or higher
2271 *
2272 * SSE2:
2273 * _mm_set1_epi32
2274 * _mm_loadl_epi64
2275 * _mm_unpacklo_epi8
2276 * _mm_unpackhi_epi16
2277 * _mm_unpacklo_epi16
2278 * _mm_setzero_si128
2279 * _mm_madd_epi16
2280 * _mm_add_epi32
2281 */
2282
2283 // we store one filter value in each of the four 32 bit integer values
2284 __m128i filterFactor_32x4 = _mm_set1_epi32(int(filterFactor));
2285
2286 // we load 8 source values from the left side and 8 source values from the right side, we separate the values to receive 16 bit integers and add them together
2287 __m128i source_32x4a = _mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)sourceLeft), _mm_setzero_si128()), _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)sourceRight), _mm_setzero_si128()));
2288
2289 // we separate the 16 bit values further so that we receive 32 bit integers
2290 __m128i source_32x4b = _mm_unpackhi_epi16(source_32x4a, _mm_setzero_si128());
2291 source_32x4a = _mm_unpacklo_epi16(source_32x4a, _mm_setzero_si128());
2292
2293 // we multiply each value with the same filter factor, and sum the result
2294 source_32x4a = _mm_madd_epi16(source_32x4a, filterFactor_32x4);
2295 source_32x4b = _mm_madd_epi16(source_32x4b, filterFactor_32x4);
2296
2297 // we add the local result to the sum parameters
2298 target_32x4a = _mm_add_epi32(target_32x4a, source_32x4a);
2299 target_32x4b = _mm_add_epi32(target_32x4b, source_32x4b);
2300}
2301
2302template <>
2303OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_SSE_2>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2304{
2305 /**
2306 * This function uses the following SSE instructions, and needs SSE2 or higher
2307 *
2308 * SSE:
2309 * _mm_set_ps1
2310 * _mm_mul_ps
2311 * _mm_add_ps
2312 *
2313 * SSE2:
2314 * _mm_loadu_si128
2315 * _mm_castsi128_ps
2316 */
2317
2318 // we store one filter value in each of the four 32 bit values
2319 __m128 filterFactor_32x4 = _mm_set_ps1(filterFactor);
2320
2321 // we load 4 * 2 source values and add them together
2322 __m128 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight + 0)));
2323 __m128 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceLeft + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceRight + 1)));
2324
2325 // we multiply each value with the same filter factor
2326 source_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2327 source_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2328
2329 // we add the local result to the sum parameters
2330 target_32x4a = _mm_add_ps(target_32x4a, source_32x4a);
2331 target_32x4b = _mm_add_ps(target_32x4b, source_32x4b);
2332}
2333
2334#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2335
2336#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2337
2338template <>
2339OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2340{
2341 ocean_assert(filterFactor <= 0xFFFFu);
2342
2343 // we store the same filter value in each of the four 16 bit values
2344 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2345
2346#if defined(__aarch64__)
2347
2348 // we load four 8bit source values and we convert them to 16 bit values afterwards
2349 const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)source))));
2350
2351#else
2352
2353 uint32_t sourceValue;
2354 ((uint8_t*)&sourceValue)[0] = source[0];
2355 ((uint8_t*)&sourceValue)[1] = source[1];
2356 ((uint8_t*)&sourceValue)[2] = source[2];
2357 ((uint8_t*)&sourceValue)[3] = source[3];
2358
2359 const uint16x8_t source16_8 = vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValue)));
2360
2361#endif // __aarch64__
2362
2363 // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2364 target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2365}
2366
2367template <>
2368OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2369{
2370 // we store the same filter value in each of the four 32 bit values
2371 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2372
2373 // we load four 32 bit source values
2374 const float32x4_t source128 = vld1q_f32(source);
2375
2376 // we multiply each value with the same filter factor, and sum the result
2377 target_32x4 = vmlaq_f32(target_32x4, source128, filterFactor_32x4);
2378}
2379
2380template <>
2381OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4)
2382{
2383 ocean_assert(filterFactor <= 0xFFFFu);
2384
2385 // we store the same filter value in each of the four 16 bit values
2386 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2387
2388#if defined(__aarch64__)
2389
2390 // we load eight 8bit source values and we convert them to 16 bit values afterwards
2391 const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)sourceLeft))), vreinterpret_u8_u32(vdup_n_u32(*((const uint32_t*)sourceRight))));
2392
2393#else
2394
2395 uint32_t sourceValueLeft;
2396 ((uint8_t*)&sourceValueLeft)[0] = sourceLeft[0];
2397 ((uint8_t*)&sourceValueLeft)[1] = sourceLeft[1];
2398 ((uint8_t*)&sourceValueLeft)[2] = sourceLeft[2];
2399 ((uint8_t*)&sourceValueLeft)[3] = sourceLeft[3];
2400
2401 uint32_t sourceValueRight;
2402 ((uint8_t*)&sourceValueRight)[0] = sourceRight[0];
2403 ((uint8_t*)&sourceValueRight)[1] = sourceRight[1];
2404 ((uint8_t*)&sourceValueRight)[2] = sourceRight[2];
2405 ((uint8_t*)&sourceValueRight)[3] = sourceRight[3];
2406
2407 // we load eight 8bit source values and we convert them to 16 bit values afterwards
2408 const uint16x8_t source16_8 = vaddl_u8(vreinterpret_u8_u32(vdup_n_u32(sourceValueLeft)), vreinterpret_u8_u32(vdup_n_u32(sourceValueRight)));
2409
2410#endif // __aarch64__
2411
2412 // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2413 target_32x4 = vmlal_u16(target_32x4, vget_low_u16(source16_8), filterFactor16_4);
2414}
2415
2416template <>
2417OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<float, float, ProcessorInstructions::PI_NEON>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4)
2418{
2419 // we store the same filter value in each of the four 32 bit values
2420 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2421
2422 // we load eight 8bit source values and we convert them to 16 bit values afterwards
2423 const float32x4_t source_32x4 = vaddq_f32(vld1q_f32(sourceLeft), vld1q_f32(sourceRight));
2424
2425 // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2426 target_32x4 = vmlaq_f32(target_32x4, source_32x4, filterFactor_32x4);
2427}
2428
2429template <>
2430OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* source, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2431{
2432 ocean_assert(filterFactor <= 0xFFFFu);
2433
2434 // we store the same filter value in each of the four 16 bit values
2435 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2436
2437 // we load eight 8bit source values and we convert them to 16 bit values afterwards
2438 const uint16x8_t source16_8 = vmovl_u8(vld1_u8(source));
2439
2440 // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2441 target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2442 target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2443}
2444
2445template <>
2446OCEAN_FORCE_INLINE void FrameFilterSeparable::asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(const float* source, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2447{
2448 // we store the same filter value in each of the four 32 bit values
2449 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2450
2451 // we load eight 32 bit source values
2452 const float32x4_t source_32x4a = vld1q_f32(source + 0);
2453 const float32x4_t source_32x4b = vld1q_f32(source + 4);
2454
2455 // we multiply each value with the same filter factor, and sum the result
2456 target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2457 target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
2458}
2459
2460template <>
2461OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<uint8_t, unsigned int, ProcessorInstructions::PI_NEON>(const uint8_t* sourceLeft, const uint8_t* sourceRight, const unsigned int& filterFactor, SIMD32x4<unsigned int>::Type& target_32x4a, SIMD32x4<unsigned int>::Type& target_32x4b)
2462{
2463 ocean_assert(filterFactor <= 0xFFFFu);
2464
2465 // we store the same filter value in each of the four 16 bit values
2466 const uint16x4_t filterFactor16_4 = vdup_n_u16(uint16_t(filterFactor));
2467
2468 // we load eight 8bit source values and we convert them to 16 bit values afterwards
2469 const uint16x8_t source16_8 = vaddl_u8(vld1_u8(sourceLeft), vld1_u8(sourceRight));
2470
2471 // we multiply each 16 bit value with the same 16 bit filter factor and add the 32 bit results to the given values
2472 target_32x4a = vmlal_u16(target_32x4a, vget_low_u16(source16_8), filterFactor16_4);
2473 target_32x4b = vmlal_u16(target_32x4b, vget_high_u16(source16_8), filterFactor16_4);
2474}
2475
2476template <>
2477OCEAN_FORCE_INLINE void FrameFilterSeparable::symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<float, float, ProcessorInstructions::PI_NEON>(const float* sourceLeft, const float* sourceRight, const float& filterFactor, SIMD32x4<float>::Type& target_32x4a, SIMD32x4<float>::Type& target_32x4b)
2478{
2479 // we store the same filter value in each of the four 16 bit values
2480 const float32x4_t filterFactor_32x4 = vdupq_n_f32(filterFactor);
2481
2482 // we load eight 32 bit source values
2483 const float32x4_t source_32x4a = vaddq_f32(vld1q_f32(sourceLeft + 0), vld1q_f32(sourceRight + 0));
2484 const float32x4_t source_32x4b = vaddq_f32(vld1q_f32(sourceLeft + 4), vld1q_f32(sourceRight + 4));
2485
2486 // we multiply each value with the same filter factor, and sum the result
2487 target_32x4a = vmlaq_f32(target_32x4a, source_32x4a, filterFactor_32x4);
2488 target_32x4b = vmlaq_f32(target_32x4b, source_32x4b, filterFactor_32x4);
2489}
2490
2491#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
2492
2493template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
2494OCEAN_FORCE_INLINE void FrameFilterSeparable::filterHorizontalRowOneBlockWith4Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric)
2495{
2496 /*
2497 * We determine 4 filter responses within one loop iteration.
2498 * For a filter with size 5 for 1 channel frames we apply the following strategy:
2499 *
2500 * Source Data: Y Y Y Y Y Y Y Y Y (if the source data has a Y8 pixel format)
2501 * 1 4 6 4 1 .
2502 * 1 4 6 4 1
2503 * 1 4 6 4 1
2504 * . 1 4 6 4 1
2505 * . .
2506 * Target Data: - - Y Y Y Y - -
2507 *
2508 *
2509 * For a filter with size 5 for 3 channel frames we apply the following strategy:
2510 *
2511 * Source Data: R G B R G B R G B R G B R G B R G B R G B R G B (if the source data has a RGB24 pixel format)
2512 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2513 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2514 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2515 * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2516 * . . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2517 * . .
2518 * Target Data: - - - - - - R G B R - - - - - - - - - - - - - - - -
2519 *
2520 */
2521
2522 ocean_assert(source != nullptr && filter != nullptr);
2523 ocean_assert(channels >= 1u);
2524 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2525
2526 typename SIMD32x4<TFilter>::Type target_32x4;
2527
2528 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4);
2529
2530 if (isSymmetric)
2531 {
2532 const unsigned int filterSize_2 = filterSize / 2u;
2533
2534 // we iterate over the first half of filter factors [0, filterSize_2)
2535 for (unsigned int n = 0u; n < filterSize_2; ++n)
2536 {
2537 symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, filter[n], target_32x4);
2538 }
2539
2540 // we handle the center filter factor at filterSize_2
2541 asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, filter[filterSize_2], target_32x4);
2542 }
2543 else
2544 {
2545 // we iterate over the first half of filter factors [0, filterSize_2)
2546 for (unsigned int n = 0u; n < filterSize; ++n)
2547 {
2548 asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, filter[n], target_32x4);
2549 }
2550 }
2551
2552 writeSIMD<TFilter, tProcessorInstructions>(target_32x4, target);
2553}
2554
2555template <typename TSource, typename TFilter, ProcessorInstructions tProcessorInstructions>
2556OCEAN_FORCE_INLINE void FrameFilterSeparable::filterHorizontalRowOneBlockWith8Elements(const TSource* const source, TFilter* const target, const unsigned int channels, const TFilter* const filter, const unsigned int filterSize, const bool isSymmetric)
2557{
2558 /*
2559 * We determine 8 filter responses within one loop iteration.
2560 * For a filter with size 5 for 1 channel frames we apply the following strategy:
2561 *
2562 * Source Data: Y Y Y Y Y Y Y Y Y Y Y Y (if the source data has a Y8 pixel format)
2563 * 1 4 6 4 1 .
2564 * 1 4 6 4 1 .
2565 * 1 4 6 4 1 .
2566 * . 1 4 6 4 1 .
2567 * . 1 4 6 4 1 .
2568 * . 1 4 6 4 1
2569 * . 1 4 6 4 1
2570 * . 1 4 6 4 1
2571 * . .
2572 * Target Data: - - Y Y Y Y Y Y Y Y - -
2573 *
2574 *
2575 * For a filter with size 5 for 3 channel frames we apply the following strategy:
2576 *
2577 * Source Data: R G B R G B R G B R G B R G B R G B R G B R G B (if the source data has a RGB24 pixel format)
2578 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2579 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2580 * 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2581 * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2582 * . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2583 * . . 1 1 1 4 4 4 6 6 6 4 4 4 1 1 1
2584 * . .
2585 * Target Data: - - - - - - R G B R G B R G - - - - - - - - - - - - - - - -
2586 *
2587 */
2588
2589 ocean_assert(source != nullptr && filter != nullptr);
2590 ocean_assert(channels >= 1u);
2591 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2592
2593 typename SIMD32x4<TFilter>::Type target_32x4a, target_32x4b;
2594
2595 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4a);
2596 setSIMDZero<TFilter, tProcessorInstructions>(target_32x4b);
2597
2598 if (isSymmetric)
2599 {
2600 const unsigned int filterSize_2 = filterSize / 2u;
2601
2602 // we iterate over the first half of filter factors [0, filterSize_2)
2603 for (unsigned int n = 0u; n < filterSize_2; ++n)
2604 {
2605 symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, source + (filterSize - n - 1) * channels, filter[n], target_32x4a, target_32x4b);
2606 }
2607
2608 // we handle the center filter factor at filterSize_2
2609 asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + filterSize_2 * channels, filter[filterSize_2], target_32x4a, target_32x4b);
2610 }
2611 else
2612 {
2613 // we iterate over the first half of filter factors [0, filterSize_2)
2614 for (unsigned int n = 0u; n < filterSize; ++n)
2615 {
2616 asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements<TSource, TFilter, tProcessorInstructions>(source + n * channels, filter[n], target_32x4a, target_32x4b);
2617 }
2618 }
2619
2620 writeSIMD<TFilter, tProcessorInstructions>(target_32x4a, target + 0);
2621 writeSIMD<TFilter, tProcessorInstructions>(target_32x4b, target + 4);
2622}
2623
2624#if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
2625
2626template <>
2627OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_SSE_2>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2628{
2629 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2630 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2631
2632 /*
2633 * This function uses the following SSE instructions, and needs SSE2 or higher
2634 *
2635 * SSE1:
2636 * _mm_set_ps1
2637 * _mm_mul_ps
2638 * _mm_add_ps
2639 * _mm_loadu_ps
2640 *
2641 * SSE2:
2642 * _mm_loadu_si128
2643 * _mm_cvtepi32_ps
2644 * _mm_add_epi32
2645 * _mm_cvtps_epi32
2646 * _mm_packs_epi32
2647 * _mm_packus_epi16
2648 * _mm_storeu_si128
2649 */
2650
2651 /*
2652 * We determine 16 filter responses within one loop iteration.
2653 * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2654 *
2655 * Source Data:
2656 * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2657 * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2658 * ---------------------------------
2659 * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2660 * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2661 * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2662 * 3 Y
2663 * 4 Y
2664 *
2665 * For frames with n channels the strategy stays the same.
2666 */
2667
2668 const unsigned int filterSize_2 = filterSize / 2u;
2669
2670 // the border covers row ids within the range [0, filterSize_2)
2671 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2672
2673 const __m128i* sourceBlock = (const __m128i*)source;
2674
2675 // we store one filter value in each of the four 32 bit integer values
2676 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
2677
2678 // now we load four input values, and multiply each of them with the center kernel value
2679 __m128 source_32x4a = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 0));
2680 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2681
2682 // now we load the next four input values, ...
2683 __m128 source_32x4b = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 1));
2684 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2685
2686 __m128 source_32x4c = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 2));
2687 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2688
2689 __m128 source_32x4d = _mm_cvtepi32_ps(_mm_loadu_si128(sourceBlock + 3));
2690 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
2691
2692 __m128i source128ai, source128bi;
2693
2694 // now we proceed with the remaining filter values
2695 for (unsigned int i = 1u; i <= filterSize_2; ++i)
2696 {
2697 // we determine the mirrored locations (and the row offset in relation to the current row)
2698 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2699 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2700
2701 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2702 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2703 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2704
2705 if (isSymmetric)
2706 {
2707 // we have a symmetric filter, so let's do some optimizations
2708 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
2709
2710 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2711
2712 source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 0), _mm_loadu_si128((__m128i*)sourcePlus + 0));
2713 source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 1), _mm_loadu_si128((__m128i*)sourcePlus + 1));
2714
2715 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2716 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
2717
2718 source128ai = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 2), _mm_loadu_si128((__m128i*)sourcePlus + 2));
2719 source128bi = _mm_add_epi32(_mm_loadu_si128((__m128i*)sourceMinus + 3), _mm_loadu_si128((__m128i*)sourcePlus + 3));
2720
2721 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ai), filterFactor_32x4));
2722 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128bi), filterFactor_32x4));
2723 }
2724 else
2725 {
2726 // we don't have a symmetric filter, so we need to handle two individual filters
2727 __m128 filterFactor128Minus = _mm_set_ps1(filter[filterSize_2 - i]);
2728 __m128 filterFactor128Plus = _mm_set_ps1(filter[filterSize_2 + i]);
2729
2730 __m128i source128aiMinus = _mm_loadu_si128((__m128i*)sourceMinus + 0);
2731 __m128i source128aiPlus = _mm_loadu_si128((__m128i*)sourcePlus + 0);
2732
2733 __m128i source128biMinus = _mm_loadu_si128((__m128i*)sourceMinus + 1);
2734 __m128i source128biPlus = _mm_loadu_si128((__m128i*)sourcePlus + 1);
2735
2736 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiMinus), filterFactor128Minus));
2737 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(_mm_cvtepi32_ps(source128aiPlus), filterFactor128Plus));
2738
2739 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biMinus), filterFactor128Minus));
2740 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(_mm_cvtepi32_ps(source128biPlus), filterFactor128Plus));
2741
2742 __m128i source128ciMinus = _mm_loadu_si128((__m128i*)sourceMinus + 2);
2743 __m128i source128ciPlus = _mm_loadu_si128((__m128i*)sourcePlus + 2);
2744
2745 __m128i source128diMinus = _mm_loadu_si128((__m128i*)sourceMinus + 3);
2746 __m128i source128diPlus = _mm_loadu_si128((__m128i*)sourcePlus + 3);
2747
2748 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciMinus), filterFactor128Minus));
2749 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(_mm_cvtepi32_ps(source128ciPlus), filterFactor128Plus));
2750
2751 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diMinus), filterFactor128Minus));
2752 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(_mm_cvtepi32_ps(source128diPlus), filterFactor128Plus));
2753 }
2754 }
2755
2756 // now we have 8 bit values in each 32 bit register
2757 __m128i result128ab = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4a), _mm_cvtps_epi32(result_32x4b));
2758 __m128i result128cd = _mm_packs_epi32(_mm_cvtps_epi32(result_32x4c), _mm_cvtps_epi32(result_32x4d));
2759 __m128i result128 = _mm_packus_epi16(result128ab, result128cd);
2760
2761 _mm_storeu_si128((__m128i*)target, result128);
2762}
2763
2764template <>
2765OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_SSE_2>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2766{
2767 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2768 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2769
2770 /**
2771 * This function uses the following SSE instructions, and needs SSE2 or higher
2772 *
2773 * SSE:
2774 * _mm_set_ps1
2775 * _mm_mul_ps
2776 * _mm_add_ps
2777 *
2778 * SSE2:
2779 * _mm_loadu_si128
2780 * _mm_castsi128_ps
2781 */
2782
2783 /*
2784 * We determine 16 filter responses within one loop iteration.
2785 * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2786 *
2787 * Source Data:
2788 * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2789 * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2790 * ---------------------------------
2791 * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2792 * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2793 * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2794 * 3 Y
2795 * 4 Y
2796 *
2797 * For frames with n channels the strategy stays the same.
2798 */
2799
2800 const unsigned int filterSize_2 = filterSize / 2u;
2801
2802 // the border covers row ids within the range [0, filterSize_2)
2803 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2804
2805 const __m128i* sourceBlock = (const __m128i*)source;
2806
2807 // we store one filter value in each of the four 32 bit values
2808 __m128 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2]);
2809
2810 // now we load four input values, and multiply each of them with the center kernel value
2811 __m128 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 0));
2812 __m128 result_32x4a = _mm_mul_ps(source_32x4a, filterFactor_32x4);
2813
2814 // now we load the next four input values, ...
2815 __m128 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 1));
2816 __m128 result_32x4b = _mm_mul_ps(source_32x4b, filterFactor_32x4);
2817
2818 __m128 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 2));
2819 __m128 result_32x4c = _mm_mul_ps(source_32x4c, filterFactor_32x4);
2820
2821 __m128 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128(sourceBlock + 3));
2822 __m128 result_32x4d = _mm_mul_ps(source_32x4d, filterFactor_32x4);
2823
2824 // now we proceed with the remaining filter values
2825 for (unsigned int i = 1u; i <= filterSize_2; ++i)
2826 {
2827 // we determine the mirrored locations (and the row offset in relation to the current row)
2828 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2829 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2830
2831 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2832 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2833 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2834
2835 if (isSymmetric)
2836 {
2837 // we have a symmetric filter, so let's do some optimizations
2838 filterFactor_32x4 = _mm_set_ps1(filter[filterSize_2 + i]);
2839
2840 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2841
2842 source_32x4a = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0)));
2843 source_32x4b = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1)));
2844
2845 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4));
2846 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4));
2847
2848 source_32x4c = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 2)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 2)));
2849 source_32x4d = _mm_add_ps(_mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 3)), _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 3)));
2850
2851 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4));
2852 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4));
2853 }
2854 else
2855 {
2856 // we don't have a symmetric filter, so we need to handle two individual filters
2857 __m128 filterFactor_32x4Minus = _mm_set_ps1(filter[filterSize_2 - i]);
2858 __m128 filterFactor_32x4Plus = _mm_set_ps1(filter[filterSize_2 + i]);
2859
2860 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 0));
2861 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Minus));
2862
2863 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 1));
2864 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Minus));
2865
2866 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 2));
2867 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Minus));
2868
2869 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourceMinus + 3));
2870 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Minus));
2871
2872 source_32x4a = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 0));
2873 result_32x4a = _mm_add_ps(result_32x4a, _mm_mul_ps(source_32x4a, filterFactor_32x4Plus));
2874
2875 source_32x4b = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 1));
2876 result_32x4b = _mm_add_ps(result_32x4b, _mm_mul_ps(source_32x4b, filterFactor_32x4Plus));
2877
2878 source_32x4c = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 2));
2879 result_32x4c = _mm_add_ps(result_32x4c, _mm_mul_ps(source_32x4c, filterFactor_32x4Plus));
2880
2881 source_32x4d = _mm_castsi128_ps(_mm_loadu_si128((const __m128i*)sourcePlus + 3));
2882 result_32x4d = _mm_add_ps(result_32x4d, _mm_mul_ps(source_32x4d, filterFactor_32x4Plus));
2883 }
2884 }
2885
2886 writeSIMD<float, PI_SSE_2>(result_32x4a, target + 0);
2887 writeSIMD<float, PI_SSE_2>(result_32x4b, target + 4);
2888 writeSIMD<float, PI_SSE_2>(result_32x4c, target + 8);
2889 writeSIMD<float, PI_SSE_2>(result_32x4d, target + 12);
2890}
2891
2892#endif // OCEAN_HARDWARE_SSE_VERSION >= 20
2893
2894#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
2895
2896template <>
2897OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<unsigned int, uint8_t, PI_NEON>(const unsigned int* source, uint8_t* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
2898{
2899 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
2900 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
2901
2902 /*
2903 * We determine 16 filter responses within one loop iteration.
2904 * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
2905 *
2906 * Source Data:
2907 * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2908 * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2909 * ---------------------------------
2910 * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
2911 * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
2912 * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2913 * 3 Y
2914 * 4 Y
2915 *
2916 * For frames with n channels the strategy stays the same.
2917 */
2918
2919 const unsigned int filterSize_2 = filterSize / 2u;
2920
2921 // the border covers row ids within the range [0, filterSize_2)
2922 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
2923
2924 // we store one filter value in each of the four 32 bit integer values
2925 float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
2926
2927 // now we load four input values, and multiply each of them with the center kernel value
2928 float32x4_t source_32x4a = vcvtq_f32_u32(vld1q_u32(source + 4 * 0));
2929 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
2930
2931 float32x4_t source_32x4b = vcvtq_f32_u32(vld1q_u32(source + 4 * 1));
2932 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
2933
2934 float32x4_t source_32x4c = vcvtq_f32_u32(vld1q_u32(source + 4 * 2));
2935 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
2936
2937 float32x4_t source_32x4d = vcvtq_f32_u32(vld1q_u32(source + 4 * 3));
2938 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
2939
2940 // now we proceed with the remaining filter values
2941 for (unsigned int i = 1u; i <= filterSize_2; ++i)
2942 {
2943 // we determine the mirrored locations (and the row offset in relation to the current row)
2944 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
2945 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
2946
2947 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
2948 const unsigned int* sourceMinus = source + offsetMinus * int(sourceStrideElements);
2949 const unsigned int* sourcePlus = source + offsetPlus * int(sourceStrideElements);
2950
2951 if (isSymmetric)
2952 {
2953 // we have a symmetric filter, so let's do some optimizations
2954 filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
2955
2956 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
2957
2958 uint32x4_t source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 0), vld1q_u32(sourcePlus + 4 * 0));
2959 uint32x4_t source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 1), vld1q_u32(sourcePlus + 4 * 1));
2960
2961 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2962 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128bi), filterFactor_32x4);
2963
2964 source128ai = vaddq_u32(vld1q_u32(sourceMinus + 4 * 2), vld1q_u32(sourcePlus + 4 * 2));
2965 source128bi = vaddq_u32(vld1q_u32(sourceMinus + 4 * 3), vld1q_u32(sourcePlus + 4 * 3));
2966
2967 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ai), filterFactor_32x4);
2968 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128bi), filterFactor_32x4);
2969 }
2970 else
2971 {
2972 // we don't have a symmetric filter, so we need to handle two individual filters
2973
2974 float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
2975 float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
2976
2977 uint32x4_t source128aiMinus = vld1q_u32(sourceMinus + 4 * 0);
2978 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiMinus), filterFactor128Minus);
2979
2980 uint32x4_t source128biMinus = vld1q_u32(sourceMinus + 4 * 1);
2981 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biMinus), filterFactor128Minus);
2982
2983 uint32x4_t source128aiPlus = vld1q_u32(sourcePlus + 4 * 0);
2984 result_32x4a = vmlaq_f32(result_32x4a, vcvtq_f32_u32(source128aiPlus), filterFactor128Plus);
2985
2986 uint32x4_t source128biPlus = vld1q_u32(sourcePlus + 4 * 1);
2987 result_32x4b = vmlaq_f32(result_32x4b, vcvtq_f32_u32(source128biPlus), filterFactor128Plus);
2988
2989 uint32x4_t source128ciMinus = vld1q_u32(sourceMinus + 4 * 2);
2990 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciMinus), filterFactor128Minus);
2991
2992 uint32x4_t source128diMinus = vld1q_u32(sourceMinus + 4 * 3);
2993 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diMinus), filterFactor128Minus);
2994
2995 uint32x4_t source128ciPlus = vld1q_u32(sourcePlus + 4 * 2);
2996 result_32x4c = vmlaq_f32(result_32x4c, vcvtq_f32_u32(source128ciPlus), filterFactor128Plus);
2997
2998 uint32x4_t source128diPlus = vld1q_u32(sourcePlus + 4 * 3);
2999 result_32x4d = vmlaq_f32(result_32x4d, vcvtq_f32_u32(source128diPlus), filterFactor128Plus);
3000 }
3001 }
3002
3003 // now we have 8 bit values in each 32 bit register
3004 uint16x8_t result128ab = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4a)), vqmovn_u32(vcvtq_u32_f32(result_32x4b)));
3005 uint16x8_t result128cd = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(result_32x4c)), vqmovn_u32(vcvtq_u32_f32(result_32x4d)));
3006
3007 uint8x16_t result128 = vcombine_u8(vqmovn_u16(result128ab), vqmovn_u16(result128cd));
3008
3009 vst1q_u8(target, result128);
3010}
3011
3012template <>
3013OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow16Elements32BitPerChannelFloat<float, float, PI_NEON>(const float* source, float* target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric)
3014{
3015 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3016 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3017
3018 /*
3019 * We determine 16 filter responses within one loop iteration.
3020 * For a filter with size 5 for 1 channel frames, with row = 0, we apply the following mirroring strategy:
3021 *
3022 * Source Data:
3023 * 1 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3024 * 0 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
3025 * ---------------------------------
3026 * 0 Y 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 <---------
3027 * 1 Y 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
3028 * 2 Y 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3029 * 3 Y
3030 * 4 Y
3031 *
3032 * For frames with n channels the strategy stays the same.
3033 */
3034
3035 const unsigned int filterSize_2 = filterSize / 2u;
3036
3037 // the border covers row ids within the range [0, filterSize_2)
3038 ocean_assert(row < filterSize_2 || row + filterSize_2 >= height);
3039
3040 // we store one filter value in each of the four 32 bit integer values
3041 float32x4_t filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2]);
3042
3043 // now we load four input values, and multiply each of them with the center kernel value
3044 float32x4_t source_32x4a = vld1q_f32(source + 4 * 0);
3045 float32x4_t result_32x4a = vmulq_f32(source_32x4a, filterFactor_32x4);
3046
3047 float32x4_t source_32x4b = vld1q_f32(source + 4 * 1);
3048 float32x4_t result_32x4b = vmulq_f32(source_32x4b, filterFactor_32x4);
3049
3050 float32x4_t source_32x4c = vld1q_f32(source + 4 * 2);
3051 float32x4_t result_32x4c = vmulq_f32(source_32x4c, filterFactor_32x4);
3052
3053 float32x4_t source_32x4d = vld1q_f32(source + 4 * 3);
3054 float32x4_t result_32x4d = vmulq_f32(source_32x4d, filterFactor_32x4);
3055
3056 // now we proceed with the remaining filter values
3057 for (unsigned int i = 1u; i <= filterSize_2; ++i)
3058 {
3059 // we determine the mirrored locations (and the row offset in relation to the current row)
3060 const int offsetMinus = int(mirroredBorderLocationLeft(int(row) - int(i))) - int(row);
3061 const int offsetPlus = int(mirroredBorderLocationRight(row + i, height)) - int(row);
3062
3063 // depending on whether we are at the top border or at the bottom border we change the minus and plus source values
3064 const float* sourceMinus = source + offsetMinus * int(sourceStrideElements);
3065 const float* sourcePlus = source + offsetPlus * int(sourceStrideElements);
3066
3067 if (isSymmetric)
3068 {
3069 // we have a symmetric filter, so let's do some optimizations
3070 filterFactor_32x4 = vdupq_n_f32(filter[filterSize_2 + i]);
3071
3072 // we sum the values of the upper and the lower row (as both will be multiplied with the same filter value)
3073
3074 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 0), vld1q_f32(sourcePlus + 4 * 0));
3075 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 1), vld1q_f32(sourcePlus + 4 * 1));
3076
3077 result_32x4a = vmlaq_f32(result_32x4a, source_32x4a, filterFactor_32x4);
3078 result_32x4b = vmlaq_f32(result_32x4b, source_32x4b, filterFactor_32x4);
3079
3080 source_32x4a = vaddq_f32(vld1q_f32(sourceMinus + 4 * 2), vld1q_f32(sourcePlus + 4 * 2));
3081 source_32x4b = vaddq_f32(vld1q_f32(sourceMinus + 4 * 3), vld1q_f32(sourcePlus + 4 * 3));
3082
3083 result_32x4c = vmlaq_f32(result_32x4c, source_32x4a, filterFactor_32x4);
3084 result_32x4d = vmlaq_f32(result_32x4d, source_32x4b, filterFactor_32x4);
3085 }
3086 else
3087 {
3088 // we don't have a symmetric filter, so we need to handle two individual filters
3089
3090 float32x4_t filterFactor128Minus = vdupq_n_f32(filter[filterSize_2 - i]);
3091 float32x4_t filterFactor128Plus = vdupq_n_f32(filter[filterSize_2 + i]);
3092
3093 float32x4_t source128aMinus = vld1q_f32(sourceMinus + 4 * 0);
3094 float32x4_t source128aPlus = vld1q_f32(sourcePlus + 4 * 0);
3095
3096 float32x4_t source128bMinus = vld1q_f32(sourceMinus + 4 * 1);
3097 float32x4_t source128bPlus = vld1q_f32(sourcePlus + 4 * 1);
3098
3099 result_32x4a = vmlaq_f32(result_32x4a, source128aMinus, filterFactor128Minus);
3100 result_32x4b = vmlaq_f32(result_32x4b, source128bMinus, filterFactor128Minus);
3101
3102 result_32x4a = vmlaq_f32(result_32x4a, source128aPlus, filterFactor128Plus);
3103 result_32x4b = vmlaq_f32(result_32x4b, source128bPlus, filterFactor128Plus);
3104
3105 source128aMinus = vld1q_f32(sourceMinus + 4 * 2);
3106 source128aPlus = vld1q_f32(sourcePlus + 4 * 2);
3107
3108 source128bMinus = vld1q_f32(sourceMinus + 4 * 3);
3109 source128bPlus = vld1q_f32(sourcePlus + 4 * 3);
3110
3111 result_32x4c = vmlaq_f32(result_32x4c, source128aMinus, filterFactor128Minus);
3112 result_32x4d = vmlaq_f32(result_32x4d, source128bMinus, filterFactor128Minus);
3113
3114 result_32x4c = vmlaq_f32(result_32x4c, source128aPlus, filterFactor128Plus);
3115 result_32x4d = vmlaq_f32(result_32x4d, source128bPlus, filterFactor128Plus);
3116 }
3117 }
3118
3119 vst1q_f32(target + 0, result_32x4a);
3120 vst1q_f32(target + 4, result_32x4b);
3121 vst1q_f32(target + 8, result_32x4c);
3122 vst1q_f32(target + 12, result_32x4d);
3123}
3124
3125#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
3126
3127template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
3128OCEAN_FORCE_INLINE void FrameFilterSeparable::filterVerticalBorderRow32BitPerChannelFloat(const TSource* source, TTarget* target, const unsigned int width, const unsigned height, const unsigned int channels, const unsigned int row, const float* filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
3129{
3130 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3131 ocean_assert(channels >= 1u);
3132 ocean_assert(filterSize <= height);
3133 ocean_assert(filterSize % 2u == 1u);
3134
3135 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3136
3137 unsigned int remainingElements = width * channels;
3138
3139 while (remainingElements >= 16u)
3140 {
3141 filterVerticalBorderRow16Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3142
3143 source += 16;
3144 target += 16;
3145
3146 remainingElements -= 16u;
3147 }
3148
3149 while (remainingElements >= 8u)
3150 {
3151 filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3152
3153 source += 8;
3154 target += 8;
3155
3156 remainingElements -= 8u;
3157 }
3158
3159 ocean_assert(width * channels >= 8u);
3160 ocean_assert(remainingElements < 8u);
3161
3162 if (remainingElements != 0u)
3163 {
3164 const unsigned int shift = 8u - remainingElements;
3165
3166 filterVerticalBorderRow8Elements32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source - shift, target - shift, sourceStrideElements, height, row, filter, filterSize, isSymmetric);
3167 }
3168}
3169
3170template <typename TSource, typename TFilter, const ProcessorInstructions tProcessorInstructions>
3171void FrameFilterSeparable::filterHorizontalSubset(const TSource* source, TFilter* target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3172{
3173 ocean_assert(source != nullptr && target != nullptr && filter != nullptr);
3174 ocean_assert(width >= filterSize + 1u);
3175
3176 ocean_assert(channels >= 1u && channels <= 8u);
3177 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3178
3179 ocean_assert_and_suppress_unused(firstRow + numberRows <= height, height);
3180
3181 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3182 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3183
3184 const bool isSymmetric = isFilterSymmetric(filter, filterSize);
3185
3186 const unsigned int filterSize_2 = filterSize / 2u;
3187 const unsigned int extraPixels = filterSize_2 * 2u;
3188
3189 const unsigned int extendedElements = (width + extraPixels) * channels;
3190
3191 Memory extendedRowMemory = Memory::create<TSource>(extendedElements);
3192 TSource* const extendedRow = extendedRowMemory.data<TSource>();
3193 ocean_assert(extendedRow != nullptr);
3194
3195 source += firstRow * sourceStrideElements;
3196 target += firstRow * targetStrideElements;
3197
3198 for (unsigned int rowsProcessed = 0u; rowsProcessed < numberRows; ++rowsProcessed)
3199 {
3200 // we create an intermediate row with extended pixels left and right
3201 fillLeftExtraBorder<TSource>(source, channels, filterSize_2, extendedRow);
3202 memcpy(extendedRow + filterSize_2 * channels, source, width * channels * sizeof(TSource));
3203 fillRightExtraBorder<TSource>(source + width * channels, channels, filterSize_2, extendedRow + (width + filterSize_2) * channels);
3204
3205 const TSource* extendedSource = extendedRow;
3206
3207 unsigned int remainingElements = width * channels;
3208
3209#if (defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10) || (defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20)
3210
3211#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
3212 const ProcessorInstructions instructions = ProcessorInstructions(PI_NEON & tProcessorInstructions);
3213#elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 20
3214 const ProcessorInstructions instructions = ProcessorInstructions(PI_SSE_2 & tProcessorInstructions);
3215#endif
3216
3217 // now we apply 8-block-elements as long as they fit into the frame
3218
3219 while (remainingElements >= 8u)
3220 {
3221 filterHorizontalRowOneBlockWith8Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3222
3223 extendedSource += 8;
3224 target += 8;
3225
3226 remainingElements -= 8u;
3227 }
3228
3229 // now we apply 4-block-elements as long as they fit into the frame
3230
3231 while (remainingElements >= 4u)
3232 {
3233 filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3234
3235 extendedSource += 4;
3236 target += 4;
3237
3238 remainingElements -= 4u;
3239 }
3240
3241 // finally, we check whether we have 1-3 elements left; in this case, we simply process some elements another time
3242
3243 if (remainingElements != 0u)
3244 {
3245 const unsigned int shift = 4u - remainingElements;
3246
3247 extendedSource -= shift;
3248 target -= shift;
3249
3250 filterHorizontalRowOneBlockWith4Elements<TSource, TFilter, instructions>(extendedSource, target, channels, filter, filterSize, isSymmetric);
3251
3252 // we do not need to shift extendedSource += 4
3253 target += 4u;
3254 }
3255
3256#else
3257
3258 OCEAN_SUPPRESS_UNUSED_WARNING(extendedSource);
3259 OCEAN_SUPPRESS_UNUSED_WARNING(remainingElements);
3260 OCEAN_SUPPRESS_UNUSED_WARNING(isSymmetric);
3261
3262#endif // OCEAN_HARDWARE_NEON_VERSION >= 10 || OCEAN_HARDWARE_SSE_VERSION >= 20
3263
3264#ifdef OCEAN_INTENSIVE_DEBUG
3265 {
3266 const TFilter* const debugTarget = target - width * channels;
3267
3268 for (unsigned int x = 0u; x < width; ++x)
3269 {
3270 for (unsigned int n = 0u; n < channels; ++n)
3271 {
3272 float result = 0.0f;
3273
3274 for (int xx = -int(filterSize_2); xx <= int(filterSize_2); ++xx)
3275 {
3276 const unsigned int mirroredXX = (x < filterSize_2) ? mirroredBorderLocationLeft(int(x) + xx) : mirroredBorderLocationRight((unsigned int)(int(x) + xx), width);
3277 result += float(*(source + mirroredXX * channels + int(n))) * filter[xx + int(filterSize_2)];
3278 }
3279
3280 const TFilter targetValue = debugTarget[x * channels + n];
3281
3282 if (std::is_same<float, TFilter>::value)
3283 {
3284 ocean_assert(NumericT<TFilter>::isWeakEqual(result, targetValue));
3285 }
3286 else
3287 {
3288 const TFilter result8_converted = (TFilter)(result);
3289 const TFilter result8_rounded = (TFilter)(result + 0.51f);
3290 ocean_assert(result8_converted == targetValue || result8_rounded == targetValue);
3291 }
3292 }
3293 }
3294 }
3295#endif
3296
3297 source += sourceStrideElements;
3298 target += targetPaddingElements;
3299 }
3300}
3301
3302template <typename TSource, typename TTarget, ProcessorInstructions tProcessorInstructions>
3303void FrameFilterSeparable::filterVerticalSubset(const TSource* source, TTarget* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3304{
3305 ocean_assert(source != nullptr && target != nullptr);
3306 ocean_assert(filter != nullptr);
3307 ocean_assert(height >= filterSize / 2u + 1u);
3308 ocean_assert(channels >= 1u && channels <= 8u);
3309
3310 ocean_assert(filterSize >= 1u && (filterSize % 2u) == 1u);
3311
3312 ocean_assert(firstRow + numberRows <= height);
3313 ocean_assert(width * channels >= 8u * 2u);
3314
3315 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3316 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3317
3318 const bool isSymmetric = isFilterSymmetric(filter, filterSize);
3319
3320 const unsigned int filterSize_2 = filterSize / 2u;
3321
3322#ifdef OCEAN_INTENSIVE_DEBUG
3323 const TSource* const debugSource = source;
3324#endif
3325
3326 source += firstRow * sourceStrideElements;
3327 target += firstRow * targetStrideElements;
3328
3329 unsigned int row = firstRow;
3330
3331 // first we check whether we are located at the top border, whether we start within the first filterSize_2 rows
3332
3333 while (row < min(firstRow + numberRows, filterSize_2))
3334 {
3335 filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, filter, filterSize, isSymmetric, sourcePaddingElements);
3336
3337#ifdef OCEAN_INTENSIVE_DEBUG
3338 {
3339 for (unsigned int x = 0u; x < width * channels; ++x)
3340 {
3341 float result = 0.0f;
3342
3343 for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3344 {
3345 const unsigned int mirroredY = mirroredBorderLocationLeft(int(row) + y);
3346 result += float(*(debugSource + mirroredY * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3347 }
3348
3349 const TTarget targetValue = target[x];
3350
3351 if (std::is_same<float, TTarget>::value)
3352 {
3353 ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3354 }
3355 else
3356 {
3357 ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3358 }
3359 }
3360 }
3361#endif
3362
3363 source += sourceStrideElements;
3364 target += targetStrideElements;
3365
3366 ++row;
3367 }
3368
3369 // now we proceed the rows not located at the top or bottom border of the frame
3370
3371 while (row < min(firstRow + numberRows, height - filterSize_2))
3372 {
3373 filterVerticalCoreRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, channels, filter, filterSize, isSymmetric, sourcePaddingElements);
3374
3375#ifdef OCEAN_INTENSIVE_DEBUG
3376 {
3377 for (unsigned int x = 0u; x < width * channels; ++x)
3378 {
3379 float result = 0.0f;
3380
3381 for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3382 result += float(*(debugSource + (int(row) + y) * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3383
3384 const TTarget targetValue = target[x];
3385
3386 ocean_assert(result >= 0.0f && result < 256.0f);
3387
3388 if (std::is_same<float, TTarget>::value)
3389 {
3390 ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3391 }
3392 else
3393 {
3394 ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3395 }
3396 }
3397 }
3398#endif
3399
3400 source += sourceStrideElements;
3401 target += targetStrideElements;
3402
3403 ++row;
3404 }
3405
3406 // now we check whether we are located at the bottom border, whether we start within the last filterSize_2 rows (or need to process them)
3407
3408 while (row < firstRow + numberRows)
3409 {
3410 ocean_assert(row + filterSize_2 >= height);
3411
3412 filterVerticalBorderRow32BitPerChannelFloat<TSource, TTarget, tProcessorInstructions>(source, target, width, height, channels, row, filter, filterSize, isSymmetric, sourcePaddingElements);
3413
3414#ifdef OCEAN_INTENSIVE_DEBUG
3415 {
3416 // we do not check the left and right corner, we simply check the middle block of the upper border
3417 for (unsigned int x = 0u; x < width * channels; ++x)
3418 {
3419 float result = 0.0f;
3420
3421 for (int y = -int(filterSize_2); y <= int(filterSize_2); ++y)
3422 {
3423 const unsigned int mirroredY = mirroredBorderLocationRight((unsigned int)(int(row) + y), height);
3424 result += float(*(debugSource + mirroredY * int(sourceStrideElements) + int(x))) * filter[y + int(filterSize_2)];
3425 }
3426
3427 const TTarget targetValue = target[x];
3428
3429 ocean_assert(result >= 0.0f && result < 256.0f);
3430
3431 if (std::is_same<float, TTarget>::value)
3432 {
3433 ocean_assert(NumericT<TTarget>::isWeakEqual(result, targetValue));
3434 }
3435 else
3436 {
3437 ocean_assert(NumericT<TTarget>::isEqual((TTarget)(result), targetValue, TTarget(2)));
3438 }
3439 }
3440 }
3441#endif
3442
3443 source += sourceStrideElements;
3444 target += targetStrideElements;
3445
3446 ++row;
3447 }
3448}
3449
3450template <typename T, typename TFilter, ProcessorInstructions tProcessorInstructions>
3451inline void FrameFilterSeparable::filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, ReusableMemory* reusableMemory, Worker* worker)
3452{
3453 Frame localIntermediateFrame;
3454 Frame* intermediateFrame = &localIntermediateFrame;
3455
3456 if (reusableMemory != nullptr)
3457 {
3458 intermediateFrame = &reusableMemory->intermediateFrame_;
3459 }
3460
3461 intermediateFrame->set(FrameType(width, height, FrameType::genericPixelFormat<TFilter>(channels), FrameType::ORIGIN_UPPER_LEFT), false /*forceOwner*/, true /*forceWritable*/);
3462
3463 // first we apply the horizontal filtering
3464
3465 if (worker)
3466 {
3467 worker->executeFunction(Worker::Function::createStatic(&filterHorizontalSubset<T, TFilter, tProcessorInstructions>, source, intermediateFrame->data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->paddingElements(), 0u, 0u), 0u, height);
3468 }
3469 else
3470 {
3471 filterHorizontalSubset<T, TFilter, tProcessorInstructions>(source, intermediateFrame->data<TFilter>(), width, height, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame->paddingElements(), 0u, height);
3472 }
3473
3474 // now we apply the vertical filtering
3475 // therefore, we first need to calculate the floating point filter functions (in case we use integer factors)
3476
3477 std::vector<float> localFloatFilters;
3478 const float* verticalFloatFilter = nullptr;
3479
3480 if (std::is_same<TFilter, float>::value)
3481 {
3482 verticalFloatFilter = (const float*)(verticalFilter);
3483 }
3484 else
3485 {
3486 ocean_assert((std::is_same<TFilter, unsigned int>::value));
3487
3488 const TFilter sumHorizontalFilterValues = sumFilterValues(horizontalFilter, horizontalFilterSize);
3489 const TFilter sumVerticalFilterValues = sumFilterValues(verticalFilter, verticalFilterSize);
3490
3491 const unsigned int normalizationFactor = (unsigned int)(sumHorizontalFilterValues) * (unsigned int)(sumVerticalFilterValues);
3492 ocean_assert(normalizationFactor != 0u);
3493
3494 const float invNormalizationFactor = 1.0f / float(normalizationFactor);
3495
3496 std::vector<float>& floatFilterBufferToUse = reusableMemory != nullptr ? reusableMemory->filterFactors_ : localFloatFilters;
3497
3498 floatFilterBufferToUse.resize(verticalFilterSize);
3499
3500 for (unsigned int n = 0u; n < verticalFilterSize; ++n)
3501 {
3502 floatFilterBufferToUse[n] = float(verticalFilter[n]) * invNormalizationFactor;
3503 }
3504
3505 verticalFloatFilter = floatFilterBufferToUse.data();
3506 }
3507
3508 if (worker)
3509 {
3510 worker->executeFunction(Worker::Function::createStatic(&filterVerticalSubset<TFilter, T, tProcessorInstructions>, intermediateFrame->constdata<TFilter>(), target, width, height, channels, (const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
3511 }
3512 else
3513 {
3514 filterVerticalSubset<TFilter, T, tProcessorInstructions>(intermediateFrame->constdata<TFilter>(), target, width, height, channels, (const float*)(verticalFloatFilter), verticalFilterSize, intermediateFrame->paddingElements(), targetPaddingElements, 0u, height);
3515 }
3516}
3517
3518template <typename T, typename TFilter>
3519bool FrameFilterSeparable::filter(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const TFilter* horizontalFilter, const unsigned int horizontalFilterSize, const TFilter* verticalFilter, const unsigned int verticalFilterSize, Worker* worker, ReusableMemory* reusableMemory, const ProcessorInstructions processorInstructions)
3520{
3521 ocean_assert(source != nullptr && target != nullptr);
3522 ocean_assert(width >= horizontalFilterSize && height >= verticalFilterSize);
3523 ocean_assert(channels >= 1u);
3524
3525 if (source == nullptr || target == nullptr || width < horizontalFilterSize || height < verticalFilterSize || channels == 0u)
3526 {
3527 return false;
3528 }
3529
3530 OCEAN_SUPPRESS_UNUSED_WARNING(reusableMemory);
3531
3532 if (width * channels >= 16u && width >= horizontalFilterSize + 1u)
3533 {
3534 switch (Processor::bestInstructionGroup<false>(processorInstructions))
3535 {
3537 // temporary disabled: OCEAN_APPLY_IF_AVX((filter<T, TFilter, PI_GROUP_AVX_2_SSE_4_1>(source, target, width, height, channels, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, worker)));
3538 case PI_GROUP_SSE_4_1:
3540 case PI_GROUP_SSE_2:
3541 OCEAN_APPLY_IF_SSE((filter<T, TFilter, PI_SSE_2>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3542 return true;
3543
3544 case PI_GROUP_NEON:
3545 OCEAN_APPLY_IF_NEON((filter<T, TFilter, PI_GROUP_NEON>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFilter, horizontalFilterSize, verticalFilter, verticalFilterSize, reusableMemory, worker)));
3546 return true;
3547
3548 case PI_NONE:
3549 break;
3550
3551 default:
3552 ocean_assert(false && "Invalid instructions!");
3553 }
3554 }
3555
3556 if constexpr (std::is_same<float, TFilter>::value)
3557 {
3558 filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, (const float*)(horizontalFilter), horizontalFilterSize, (const float*)(verticalFilter), verticalFilterSize, worker);
3559 return true;
3560 }
3561 else
3562 {
3563 if constexpr (std::is_same<unsigned int, TFilter>::value)
3564 {
3565 const TFilter horizontalNormalization = sumFilterValues(horizontalFilter, horizontalFilterSize);
3566 ocean_assert(horizontalNormalization != TFilter(0));
3567
3568 std::vector<float> horizontalFloatFilter(horizontalFilterSize);
3569 for (size_t n = 0; n < horizontalFloatFilter.size(); ++n)
3570 {
3571 horizontalFloatFilter[n] = float(horizontalFilter[n]) / float(horizontalNormalization);
3572 }
3573
3574 const TFilter verticalNormalization = sumFilterValues(verticalFilter, verticalFilterSize);
3575 ocean_assert(verticalNormalization != TFilter(0));
3576
3577 std::vector<float> verticalFloatFilter(verticalFilterSize);
3578 for (size_t n = 0; n < verticalFloatFilter.size(); ++n)
3579 {
3580 verticalFloatFilter[n] = float(verticalFilter[n]) / float(verticalNormalization);
3581 }
3582
3583 return filterUniversal<T>(source, target, width, height, channels, sourcePaddingElements, targetPaddingElements, horizontalFloatFilter.data(), (unsigned int)horizontalFloatFilter.size(), verticalFloatFilter.data(), (unsigned int)verticalFloatFilter.size(), worker);
3584 }
3585 }
3586
3587 ocean_assert(false && "Invalid combination of parameters!");
3588 return false;
3589}
3590
3591template <typename T>
3592bool FrameFilterSeparable::filterUniversal(const T* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float* horizontalFilter, const unsigned int horizontalFilterSize, const float* verticalFilter, const unsigned int verticalFilterSize, Worker* worker)
3593{
3594 ocean_assert(source != nullptr && target != nullptr);
3595 ocean_assert(width >= 1u && height >= 1u);
3596 ocean_assert(channels != 0u);
3597
3598 ocean_assert(horizontalFilter != nullptr && verticalFilter != nullptr);
3599 ocean_assert(horizontalFilterSize % 2u == 1u);
3600 ocean_assert(verticalFilterSize % 2u == 1u);
3601
3602 if (source == nullptr || target == nullptr
3603 || verticalFilter == nullptr || horizontalFilter == nullptr
3604 || horizontalFilterSize > width || verticalFilterSize > height
3605 || horizontalFilterSize % 2u != 1u || verticalFilterSize % 2u != 1u)
3606 {
3607 return false;
3608 }
3609
3610 typedef typename FloatTyper<T>::Type TIntermediate;
3611
3612 Frame intermediateFrame(FrameType(width, height, FrameType::genericPixelFormat<TIntermediate>(channels), FrameType::ORIGIN_UPPER_LEFT));
3613
3614 if (worker)
3615 {
3616 worker->executeFunction(Worker::Function::createStatic(&filterUniversalHorizontalSubset<T, TIntermediate>, source, intermediateFrame.data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.paddingElements(), 0u, 0u), 0u, height);
3617 worker->executeFunction(Worker::Function::createStatic(&filterUniversalVerticalSubset<T, TIntermediate>, intermediateFrame.constdata<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.paddingElements(), targetPaddingElements, 0u, 0u), 0u, height);
3618 }
3619 else
3620 {
3621 filterUniversalHorizontalSubset<T, TIntermediate>(source, intermediateFrame.data<TIntermediate>(), width, channels, horizontalFilter, horizontalFilterSize, sourcePaddingElements, intermediateFrame.paddingElements(), 0u, height);
3622 filterUniversalVerticalSubset<T, TIntermediate>(intermediateFrame.data<TIntermediate>(), target, width, height, channels, verticalFilter, verticalFilterSize, intermediateFrame.paddingElements(), targetPaddingElements, 0u, height);
3623 }
3624
3625 return true;
3626}
3627
3628template <typename T, typename TIntermediate>
3629void FrameFilterSeparable::filterUniversalHorizontalSubset(const T* source, TIntermediate* target, const unsigned int width, unsigned int channels, const float* horizontalFilter, unsigned int horizontalFilterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3630{
3631 ocean_assert(source != nullptr && target != nullptr);
3632 ocean_assert(width >= 1u);
3633 ocean_assert(channels != 0u);
3634
3635 ocean_assert(horizontalFilterSize <= size_t(width));
3636 ocean_assert(horizontalFilterSize % 2u == 1u);
3637
3638 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3639 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3640
3641 const unsigned int filterSize = horizontalFilterSize;
3642 const unsigned int filterSize_2 = filterSize / 2u;
3643 ocean_assert(filterSize_2 * 2u <= width);
3644
3645 std::vector<TIntermediate> filterCopy;
3646
3647 if (!std::is_same<TIntermediate, float>::value)
3648 {
3649 filterCopy.resize(horizontalFilterSize);
3650 for (size_t n = 0; n < filterCopy.size(); ++n)
3651 {
3652 filterCopy[n] = TIntermediate(horizontalFilter[n]);
3653 }
3654 }
3655
3656 const TIntermediate* const filter = filterCopy.empty() ? (const TIntermediate*)horizontalFilter : filterCopy.data();
3657
3658 source += firstRow * sourceStrideElements;
3659 target += firstRow * targetStrideElements;
3660
3661 TIntermediate* const targetEnd = target + numberRows * targetStrideElements;
3662
3663 while (target != targetEnd)
3664 {
3665 ocean_assert(target < targetEnd);
3666
3667 // left border: [0, filterSize_2 - 1]
3668
3669 for (unsigned int x = 0u; x < filterSize_2; ++x)
3670 {
3671 for (unsigned int n = 0u; n < channels; ++n)
3672 {
3673 TIntermediate response = TIntermediate(source[channels * mirroredBorderLocationLeft(-int(filterSize_2) + int(x)) + n]) * filter[0];
3674
3675 for (unsigned int s = 1u; s < filterSize; ++s)
3676 response += TIntermediate(source[channels * mirroredBorderLocationLeft(-int(filterSize_2) + int(x + s)) + n]) * filter[s];
3677
3678 target[n] = response;
3679 }
3680
3681 target += channels;
3682 // we keep the location of source
3683 }
3684
3685 // center block: [filterSize_2, width - filterSize - 2)
3686
3687 for (unsigned int x = filterSize_2; x < width - filterSize_2; ++x)
3688 {
3689 for (unsigned int n = 0u; n < channels; ++n)
3690 {
3691 TIntermediate response = TIntermediate(source[channels * 0u + n]) * filter[0];
3692
3693 for (unsigned int s = 1u; s < filterSize; ++s)
3694 response += TIntermediate(source[channels * s + n]) * filter[s];
3695
3696 target[n] = response;
3697 }
3698
3699 target += channels;
3700 source += channels;
3701 }
3702
3703 // right border: [width - filterSize_2, width - 1]
3704
3705 for (unsigned int x = 0u; x < filterSize_2; ++x)
3706 {
3707 for (unsigned int n = 0u; n < channels; ++n)
3708 {
3709 TIntermediate response = TIntermediate(source[channels * mirroredBorderLocationRight(x, filterSize_2 * 2u) + n]) * filter[0];
3710
3711 for (unsigned int s = 1u; s < filterSize; ++s)
3712 response += TIntermediate(source[channels * mirroredBorderLocationRight(x + s, filterSize_2 * 2u) + n]) * filter[s];
3713
3714 target[n] = response;
3715 }
3716
3717 target += channels;
3718 // we keep the location of source
3719 }
3720
3721 source += filterSize_2 * 2u * channels + sourcePaddingElements;
3722 target += targetPaddingElements;
3723 }
3724}
3725
3726template <typename T, typename TIntermediate>
3727void FrameFilterSeparable::filterUniversalVerticalSubset(const TIntermediate* source, T* target, const unsigned int width, const unsigned int height, const unsigned int channels, const float* verticalFilter, const unsigned int verticalFilterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
3728{
3729 ocean_assert(source != nullptr && target != nullptr);
3730 ocean_assert(width >= 1u && height >= 1u);
3731 ocean_assert(channels != 0u);
3732
3733 ocean_assert(verticalFilterSize <= height);
3734 ocean_assert(verticalFilterSize % 2u == 1u);
3735
3736 const unsigned int sourceStrideElements = width * channels + sourcePaddingElements;
3737 const unsigned int targetStrideElements = width * channels + targetPaddingElements;
3738
3739 const TIntermediate* const sourceStart = source;
3740
3741 const unsigned int filterSize = verticalFilterSize;
3742 const unsigned int filterSize_2 = filterSize / 2u;
3743 ocean_assert(filterSize_2 * 2u <= height);
3744
3745 std::vector<TIntermediate> filterCopy;
3746
3747 if (!std::is_same<TIntermediate, float>::value)
3748 {
3749 filterCopy.resize(verticalFilterSize);
3750
3751 for (size_t n = 0; n < filterCopy.size(); ++n)
3752 {
3753 filterCopy[n] = TIntermediate(verticalFilter[n]);
3754 }
3755 }
3756
3757 const TIntermediate* const filter = filterCopy.empty() ? (const TIntermediate*)verticalFilter : filterCopy.data();
3758
3759 source += max(0, int(firstRow) - int(filterSize_2)) * sourceStrideElements;
3760 target += firstRow * targetStrideElements;
3761
3762 unsigned int y = firstRow;
3763
3764 // top border: [0, filterSize_2 - 1]
3765
3766 while (y < min(filterSize_2, firstRow + numberRows))
3767 {
3768 ocean_assert(source == sourceStart);
3769 const TIntermediate* sourceCopy = source;
3770
3771 for (unsigned int x = 0u; x < width; ++x)
3772 {
3773 for (unsigned int n = 0u; n < channels; ++n)
3774 {
3775 TIntermediate response = TIntermediate(source[sourceStrideElements * mirroredBorderLocationLeft(-int(filterSize_2) + int(y)) + n]) * filter[0];
3776
3777 for (unsigned int s = 1u; s < filterSize; ++s)
3778 response += TIntermediate(source[sourceStrideElements * mirroredBorderLocationLeft(-int(filterSize_2) + int(y + s)) + n]) * filter[s];
3779
3780 target[n] = T(response);
3781 }
3782
3783 target += channels;
3784 source += channels;
3785 }
3786
3787 target += targetPaddingElements;
3788
3789 // we set back the location of the source pointer
3790 source = sourceCopy;
3791 ++y;
3792 }
3793
3794 // center block: [filterSize_2, height - filterSize - 2)
3795
3796 const unsigned int centerRows = (unsigned int)max(0, int(min(firstRow + numberRows, height - filterSize_2)) - int(y));
3797
3798 for (unsigned int yCenter = 0u; yCenter < centerRows; ++yCenter)
3799 {
3800 for (unsigned int x = 0u; x < width; ++x)
3801 {
3802 for (unsigned int c = 0u; c < channels; ++c)
3803 {
3804 TIntermediate response = TIntermediate(source[channels * 0u + c]) * filter[0];
3805
3806 for (unsigned int s = 1u; s < filterSize; ++s)
3807 response += TIntermediate(source[sourceStrideElements * s + c]) * filter[s];
3808
3809 target[c] = T(response);
3810 }
3811
3812 source += channels;
3813 target += channels;
3814 }
3815
3816 source += sourcePaddingElements;
3817 target += targetPaddingElements;
3818 }
3819
3820 y += centerRows;
3821
3822 // bottom border: [height - filterSize_2, height - 1]
3823
3824 while (y < firstRow + numberRows)
3825 {
3826 ocean_assert(y >= height - filterSize_2 && y < height);
3827 source = sourceStart + (height - filterSize_2 * 2u) * sourceStrideElements;
3828
3829 const unsigned int yy = y - (height - filterSize_2);
3830 ocean_assert(yy < filterSize_2);
3831
3832 for (unsigned int x = 0u; x < width; ++x)
3833 {
3834 for (unsigned int n = 0u; n < channels; ++n)
3835 {
3836 TIntermediate response = TIntermediate(source[sourceStrideElements * mirroredBorderLocationRight(yy, filterSize_2 * 2u) + n]) * filter[0];
3837
3838 for (unsigned int s = 1u; s < filterSize; ++s)
3839 {
3840 response += TIntermediate(source[sourceStrideElements * mirroredBorderLocationRight(yy + s, filterSize_2 * 2u) + n]) * filter[s];
3841 }
3842
3843 target[n] = T(response);
3844 }
3845
3846 target += channels;
3847 source += channels;
3848 }
3849
3850 target += targetPaddingElements;
3851
3852 ++y;
3853 }
3854}
3855
3856inline unsigned int FrameFilterSeparable::mirroredBorderLocationLeft(const int value)
3857{
3858 // Original: -3 -2 -1 | 0 1 2 3 4 5 6
3859 // Result: 2 1 0 | 0 1 2 3 4 5 6
3860
3861 if (value >= 0)
3862 {
3863 return value;
3864 }
3865 else
3866 {
3867 return -value - 1;
3868 }
3869}
3870
3871inline unsigned int FrameFilterSeparable::mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
3872{
3873 ocean_assert(value < 2u * size);
3874
3875 // Original: 4 5 6 ... s-2 s-1 | s s+1 s+2
3876 // Result: 4 5 6 ... s-2 s-1 | s-1 s-2 s-3
3877
3878 if (value < size)
3879 {
3880 return value;
3881 }
3882 else
3883 {
3884 ocean_assert(size * 2u - value - 1u < size);
3885 return size * 2u - value - 1u;
3886 }
3887}
3888
3889}
3890
3891}
3892
3893#endif // META_OCEAN_CV_FRAME_FILTER_SEPARABLE_H
This class holds re-usable memory for the filtering process.
Definition FrameFilterSeparable.h:40
ReusableMemory()=default
Default constructor.
std::vector< float > filterFactors_
Float-based filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:56
std::vector< float > normalizedVerticalFilter_
Normalized vertical filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:62
Frame intermediateFrame_
An intermediate frame which can be re-used during filtering.
Definition FrameFilterSeparable.h:53
std::vector< float > normalizedHorizontalFilter_
Normalized horizontal filter factors which can be re-used during filtering.
Definition FrameFilterSeparable.h:59
This class implements separable filter.
Definition FrameFilterSeparable.h:33
static void filterVerticalSubset(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, unsigned int firstRow, const unsigned int numberRows)
Applies the vertical filtering for a subset of the frame with a specified 1D filter kernel for frames...
Definition FrameFilterSeparable.h:3303
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of a symmetric filter for 4 successive frame el...
static bool filterUniversal(const T *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const float *horizontalFilter, const unsigned int horizontalFilterSize, const float *verticalFilter, const unsigned int verticalFilterSize, Worker *worker=nullptr)
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
Definition FrameFilterSeparable.h:3592
static OCEAN_FORCE_INLINE void symmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *sourceLeft, const TSource *sourceRight, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of a symmetric filter for 8 successive frame el...
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor8Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4a, typename SIMD32x4< TFilter >::Type &target_32x4b)
Determines the filter responses for one filter factor of an asymmetric filter for 8 successive frame ...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith8Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 8 successive frame elements (8 elements...
Definition FrameFilterSeparable.h:2556
static OCEAN_FORCE_INLINE void writeSIMD(const typename SIMD32x4< T >::Type &value, T *target)
Writes a SIMD with four 32 bit values to (not aligned) memory.
static void filterUniversalHorizontalSubset(const T *source, TIntermediate *target, const unsigned int width, const unsigned int channels, const float *horizontalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a horizontal filter to a subset of an image with almost arbitrary data type.
Definition FrameFilterSeparable.h:3629
static void filterUniversalVerticalSubset(const TIntermediate *source, T *target, const unsigned int width, const unsigned int height, const unsigned int channels, const float *verticalFilter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies a vertical filter to a subset of an image with almost arbitrary data type.
Definition FrameFilterSeparable.h:3727
static void filterHorizontalSubset(const TSource *source, TFilter *target, const unsigned int width, const unsigned int height, const unsigned int channels, const TFilter *filter, const unsigned int filterSize, const unsigned int sourcePaddingElements, const unsigned int targetPaddingElements, const unsigned int firstRow, const unsigned int numberRows)
Applies the horizontal filtering in a subset of a frame with a specified 1D filter kernel for frames ...
Definition FrameFilterSeparable.h:3171
static OCEAN_FORCE_INLINE void filterVerticalCoreRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int channels, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses for the inner core of a frame for one row.
Definition FrameFilterSeparable.h:804
static T sumFilterValues(const T *filterValues, const size_t size)
Determines the sum of all elements of a given 1D filter.
Definition FrameFilterSeparable.h:706
static bool isFilterSymmetric(const T *filterValues, const size_t size)
Returns whether a given 1D filter is symmetric.
Definition FrameFilterSeparable.h:689
static OCEAN_FORCE_INLINE void filterVerticalBorderRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow4Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static OCEAN_FORCE_INLINE void filterVerticalCoreRow16Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses for the inner core of a frame for one row while processing a...
static void fillLeftExtraBorder(const T *source, const unsigned int channels, const unsigned int pixels, T *extendedRowLeft)
Fills the left border area of an extended row with mirrored pixel information (from the left image re...
Definition FrameFilterSeparable.h:782
static OCEAN_FORCE_INLINE void asymmetricFilterHorizontalRowMultiplyOneFilterFactor4Elements(const TSource *source, const TFilter &filterFactor, typename SIMD32x4< TFilter >::Type &target_32x4)
Determines the filter responses for one filter factor of an asymmetric filter for 4 successive frame ...
static OCEAN_FORCE_INLINE void filterVerticalBorderRow32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int width, const unsigned int height, const unsigned int channels, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric, const unsigned int sourcePaddingElements)
Determines the vertical filter responses near the (vertical) border of a frame for one row.
static void fillRightExtraBorder(const T *sourceEnd, const unsigned int channels, const unsigned int pixels, T *extendedRowRight)
Fills the right border area of an extended row with mirrored pixel information (from the right image ...
Definition FrameFilterSeparable.h:793
static OCEAN_FORCE_INLINE void filterVerticalBorderRow8Elements32BitPerChannelFloat(const TSource *source, TTarget *target, const unsigned int sourceStrideElements, const unsigned int height, const unsigned int row, const float *filter, const unsigned int filterSize, const bool isSymmetric)
Determines the vertical filter responses near the (vertical) border of a frame for one row while proc...
static OCEAN_FORCE_INLINE void filterHorizontalRowOneBlockWith4Elements(const TSource *const source, TFilter *const target, const unsigned int channels, const TFilter *const filter, const unsigned int filterSize, const bool isSymmetric)
Determines the horizontal filter responses for one block with 4 successive frame elements (4 elements...
Definition FrameFilterSeparable.h:2494
static bool filter(const Frame &source, Frame &target, const std::vector< unsigned int > &horizontalFilter, const std::vector< unsigned int > &verticalFilter, Worker *worker=nullptr, ReusableMemory *reusableMemory=nullptr, const ProcessorInstructions processorInstructions=Processor::get().instructions())
Applies a horizontal and vertical filtering with a (separable) 2D filter kernel separated into a hori...
static OCEAN_FORCE_INLINE void setSIMDZero(typename SIMD32x4< T >::Type &value)
Sets a given SIMD value to zero.
static Caller< void > createStatic(typename StaticFunctionPointerMaker< void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a static function with no function parameter.
Definition Caller.h:2876
This class implements Ocean's image class.
Definition Frame.h:1808
const T * constdata(const unsigned int planeIndex=0u) const
Returns a pointer to the read-only pixel data of a specific plane.
Definition Frame.h:4248
T * data(const unsigned int planeIndex=0u)
Returns a pointer to the pixel data of a specific plane.
Definition Frame.h:4239
bool set(const FrameType &frameType, const bool forceOwner, const bool forceWritable=false, const Indices32 &planePaddingElements=Indices32(), const Timestamp &timestamp=Timestamp(false), bool *reallocated=nullptr)
Sets a new frame type for this frame.
unsigned int paddingElements(const unsigned int planeIndex=0u) const
Returns the optional number of padding elements at the end of each row for a specific plane.
Definition Frame.h:4122
Definition of a frame type composed by the frame dimension, pixel format and pixel origin.
Definition Frame.h:30
@ ORIGIN_UPPER_LEFT
The first pixel lies in the upper left corner, the last pixel in the lower right corner.
Definition Frame.h:1050
This class implements an object able to allocate memory.
Definition base/Memory.h:22
void * data()
Returns the pointer to the writable memory which is allocated by this object.
Definition base/Memory.h:303
This class provides basic numeric functionalities.
Definition Numeric.h:57
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
ProcessorInstructions
Definition of individual processor instruction types.
Definition base/Processor.h:22
static unsigned int mirroredBorderLocationRight(const unsigned int value, const unsigned int size)
Mirrors a given value at the right border if necessary.
Definition FrameFilterSeparable.h:3871
static unsigned int mirroredBorderLocationLeft(const int value)
Mirrors a given value at the left border if necessary.
Definition FrameFilterSeparable.h:3856
@ PI_NONE
Unknown processor instruction set.
Definition base/Processor.h:24
@ PI_GROUP_AVX_2_SSE_2
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition base/Processor.h:64
@ PI_GROUP_SSE_4_1
All SSE instructions between (including) SSE and SSE4.1.
Definition base/Processor.h:60
@ PI_SSE_2
SSE2 instructions.
Definition base/Processor.h:28
@ PI_NEON
NEON instructions.
Definition base/Processor.h:50
@ PI_GROUP_AVX_2_SSE_4_1
All AVX instructions between (including) AVX and AVX2 and SSE instructions between (including) SSE an...
Definition base/Processor.h:68
@ PI_GROUP_SSE_2
All SSE instructions between (including) SSE and SSE2.
Definition base/Processor.h:58
@ PI_GROUP_NEON
All NEON instructions (which is currently NEON only).
Definition base/Processor.h:66
The namespace covering the entire Ocean framework.
Definition Accessor.h:15
float32x4_t Type
Definition FrameFilterSeparable.h:683
__m128 Type
Definition FrameFilterSeparable.h:663
uint32x4_t Type
Definition FrameFilterSeparable.h:674
__m128i Type
Definition FrameFilterSeparable.h:654
Definition of a 128 bit SIMD data type holding four 32 bit values.
Definition FrameFilterSeparable.h:72
DataType< uint32_t, 4u >::Type Type
Definition FrameFilterSeparable.h:73
Default definition of a type with tBytes bytes.
Definition DataType.h:32
float Type
The 32 bit floating point data type for any data type T but 'double'.
Definition DataType.h:373