Ocean
Loading...
Searching...
No Matches
SumAbsoluteDifferencesNEON.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
9#define META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
10
11#include "ocean/cv/CV.h"
12
14
15#if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
16
17#include "ocean/cv/NEON.h"
18
19namespace Ocean
20{
21
22namespace CV
23{
24
/**
 * This class implements functions calculating the sum of absolute differences with NEON instructions.
 * All functions operate on 8 bit (uint8_t) elements and return the sum as an unsigned 32 bit integer.
 * @ingroup cv
 */
class SumAbsoluteDifferencesNEON
{
	public:

		/**
		 * Returns the sum of absolute differences between two memory buffers.
		 * @param buffer0 The first memory buffer, holding at least `tSize` elements, must be valid
		 * @param buffer1 The second memory buffer, holding at least `tSize` elements, must be valid
		 * @return The resulting sum of absolute differences
		 * @tparam tSize The size of the buffers in elements, with range [1, infinity)
		 */
		template <unsigned int tSize>
		static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);

		/**
		 * Returns the sum of absolute differences between two patches within an image.
		 * @param patch0 The top left start position of the first image patch, must be valid
		 * @param patch1 The top left start position of the second image patch, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of absolute differences
		 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);

		/**
		 * Returns the sum of absolute differences between an image patch and a buffer.
		 * The buffer is expected to store the patch rows back-to-back, so that the buffer's row stride is `tChannels * tPatchSize` elements.
		 * @param patch0 The top left start position of the image patch, must be valid
		 * @param buffer1 The memory buffer, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of absolute differences
		 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
};
68
69template <unsigned int tSize>
70inline uint32_t SumAbsoluteDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
71{
72 static_assert(tSize >= 1u, "Invalid buffer size!");
73
74 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
75
76 // first, we handle blocks with 16 elements
77
78 constexpr unsigned int blocks16 = tSize / 16u;
79
80 for (unsigned int n = 0u; n < blocks16; ++n)
81 {
82 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
83 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
84
85 const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
86
87 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
88
89 buffer0 += 16;
90 buffer1 += 16;
91 }
92
93 // we may handle at most one block with 8 elements
94
95 constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
96 static_assert(blocks8 <= 1u, "Invalid number of blocks!");
97
98 if (blocks8 == 1u)
99 {
100 // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
101 const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(buffer0), vld1_u8(buffer1));
102
103 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
104
105 buffer0 += 8;
106 buffer1 += 8;
107 }
108
109 constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
110 static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
111
112 uint32_t result = NEON::sumHorizontal_u_32x4(sum_u_32x4);
113
114 // we apply the remaining elements (at most 7)
115
116 for (unsigned int n = 0u; n < remainingElements; ++n)
117 {
118 result += uint32_t(abs(int32_t(buffer0[n]) - int32_t(buffer1[n])));
119 }
120
121 return result;
122}
123
124template <unsigned int tChannels, unsigned int tPatchSize>
125inline uint32_t SumAbsoluteDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
126{
127 static_assert(tChannels >= 1u, "Invalid channel number!");
128 static_assert(tPatchSize >= 5u, "Invalid patch size!");
129
130 ocean_assert(patch0 != nullptr && patch1 != nullptr);
131
132 ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
133 ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
134
135 constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
136
137 constexpr unsigned int blocks16 = patchWidthElements / 16u;
138 constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
139 constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
140
141 static_assert(blocks1 <= 7u, "Invalid block size!");
142
143 const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
144 const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
145
146 uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
147
148 uint32_t sumIndividual = 0u;
149
150 for (unsigned int y = 0u; y < tPatchSize; ++y)
151 {
152 for (unsigned int n = 0u; n < blocks16; ++n)
153 {
154 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
155 const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
156
157 const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
158
159 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
160
161 patch0 += 16;
162 patch1 += 16;
163 }
164
165 for (unsigned int n = 0u; n < blocks8; ++n)
166 {
167 // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
168 const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(patch0), vld1_u8(patch1));
169
170 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
171
172 patch0 += 8;
173 patch1 += 8;
174 }
175
176 if constexpr (blocks1 != 0u)
177 {
178 if (blocks1 >= 3u)
179 {
180 // we have enough elements left so that using NEON is still faster than handling each element individually
181
182 if (y < tPatchSize - 1u)
183 {
184 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
185 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
186
187 const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
188
189 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
190 }
191 else
192 {
193 constexpr unsigned int overlapElements = 8u - blocks1;
194 static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
195
196 const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
197 const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
198
199 const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
200
201 sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
202 }
203 }
204 else
205 {
206 for (unsigned int n = 0u; n < blocks1; ++n)
207 {
208 sumIndividual += uint32_t(abs(int32_t(patch0[n]) - int32_t(patch1[n])));
209 }
210 }
211
212 patch0 += blocks1;
213 patch1 += blocks1;
214 }
215
216 patch0 += patch0StrideElements - patchWidthElements;
217 patch1 += patch1StrideElements - patchWidthElements;
218 }
219
220 return NEON::sumHorizontal_u_32x4(sum_u_32x4) + sumIndividual;
221}
222
223template <unsigned int tChannels, unsigned int tPatchSize>
224inline uint32_t SumAbsoluteDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
225{
226 return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
227}
228
229}
230
231}
232
233#endif // OCEAN_HARDWARE_NEON_VERSION >= 10
234
#endif // META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
static OCEAN_FORCE_INLINE uint32_t sumHorizontal_u_32x4(const uint32x4_t &value)
Horizontally sums the four 32 bit values and returns the result.
Definition NEON.h:1084
This class implements functions calculating the sum of absolute differences with NEON instructions.
Definition SumAbsoluteDifferencesNEON.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of absolute differences between an image patch and a buffer.
Definition SumAbsoluteDifferencesNEON.h:224
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of absolute differences between two patches within an image.
Definition SumAbsoluteDifferencesNEON.h:125
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of absolute differences between two memory buffers.
Definition SumAbsoluteDifferencesNEON.h:70
The namespace covering the entire Ocean framework.
Definition Accessor.h:15