Ocean
SumAbsoluteDifferencesNEON.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
9 #define META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
10 
11 #include "ocean/cv/CV.h"
12 
13 #include "ocean/base/Utilities.h"
14 
15 #if defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
16 
17 #include "ocean/cv/NEON.h"
18 
19 namespace Ocean
20 {
21 
22 namespace CV
23 {
24 
/**
 * This class implements functions calculating the sum of absolute differences (SAD) with NEON instructions.
 * All functions are static templates; buffer sizes, channel numbers, and patch sizes are compile-time constants.
 * @ingroup cv
 */
class SumAbsoluteDifferencesNEON
{
	public:

		/**
		 * Returns the sum of absolute differences between two memory buffers.
		 * @param buffer0 The first memory buffer, must be valid
		 * @param buffer1 The second memory buffer, must be valid
		 * @return The resulting sum of absolute differences
		 * @tparam tSize The size of the buffers in elements, with range [1, infinity)
		 */
		template <unsigned int tSize>
		static inline uint32_t buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1);

		/**
		 * Returns the sum of absolute differences between two patches within an image.
		 * @param patch0 The top left start position of the first image patch, must be valid
		 * @param patch1 The top left start position of the second image patch, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the first patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @param patch1StrideElements The number of elements between two rows for the second patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of absolute differences
		 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements);

		/**
		 * Returns the sum of absolute differences between an image patch and a buffer.
		 * The buffer is expected to hold the patch rows back to back, with tChannels * tPatchSize elements per row.
		 * @param patch0 The top left start position of the image patch, must be valid
		 * @param buffer1 The memory buffer, must be valid
		 * @param patch0StrideElements The number of elements between two rows for the image patch, in elements, with range [tChannels * tPatchSize, infinity)
		 * @return The resulting sum of absolute differences
		 * @tparam tChannels The number of channels for the given frames, with range [1, infinity)
		 * @tparam tPatchSize The size of the square patch (the edge length) in pixel, with range [5, infinity), must be odd
		 */
		template <unsigned int tChannels, unsigned int tPatchSize>
		static inline uint32_t patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements);
};
68 
69 template <unsigned int tSize>
70 inline uint32_t SumAbsoluteDifferencesNEON::buffer8BitPerChannel(const uint8_t* buffer0, const uint8_t* buffer1)
71 {
72  static_assert(tSize >= 1u, "Invalid buffer size!");
73 
74  uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
75 
76  // first, we handle blocks with 16 elements
77 
78  constexpr unsigned int blocks16 = tSize / 16u;
79 
80  for (unsigned int n = 0u; n < blocks16; ++n)
81  {
82  // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
83  const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(buffer0), vld1q_u8(buffer1));
84 
85  const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
86 
87  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
88 
89  buffer0 += 16;
90  buffer1 += 16;
91  }
92 
93  // we may handle at most one block with 8 elements
94 
95  constexpr unsigned int blocks8 = (tSize % 16u) / 8u;
96  static_assert(blocks8 <= 1u, "Invalid number of blocks!");
97 
98  if (blocks8 == 1u)
99  {
100  // [|buffer0[0] - buffer1[0]|, |buffer0[1] - buffer1[1]|, ..]
101  const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(buffer0), vld1_u8(buffer1));
102 
103  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
104 
105  buffer0 += 8;
106  buffer1 += 8;
107  }
108 
109  uint32_t results[4];
110  vst1q_u32(results, sum_u_32x4);
111 
112  constexpr unsigned int remainingElements = tSize - blocks16 * 16u - blocks8 * 8u;
113  static_assert(remainingElements < 8u, "Invalid number of remaining elements!");
114 
115  uint32_t result = results[0] + results[1] + results[2] + results[3];
116 
117  // we apply the remaining elements (at most 7)
118 
119  for (unsigned int n = 0u; n < remainingElements; ++n)
120  {
121  result += uint32_t(abs(int32_t(buffer0[n]) - int32_t(buffer1[n])));
122  }
123 
124  return result;
125 }
126 
127 template <unsigned int tChannels, unsigned int tPatchSize>
128 inline uint32_t SumAbsoluteDifferencesNEON::patch8BitPerChannel(const uint8_t* patch0, const uint8_t* patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
129 {
130  static_assert(tChannels >= 1u, "Invalid channel number!");
131  static_assert(tPatchSize >= 5u, "Invalid patch size!");
132 
133  ocean_assert(patch0 != nullptr && patch1 != nullptr);
134 
135  ocean_assert(patch0StrideElements >= tChannels * tPatchSize);
136  ocean_assert(patch1StrideElements >= tChannels * tPatchSize);
137 
138  constexpr unsigned int patchWidthElements = tChannels * tPatchSize;
139 
140  constexpr unsigned int blocks16 = patchWidthElements / 16u;
141  constexpr unsigned int blocks8 = (patchWidthElements - blocks16 * 16u) / 8u;
142  constexpr unsigned int blocks1 = patchWidthElements - blocks16 * 16u - blocks8 * 8u;
143 
144  static_assert(blocks1 <= 7u, "Invalid block size!");
145 
146  const uint8x8_t maskRight_u_8x8 = vcreate_u8(uint64_t(-1) >> (8u - blocks1) * 8u);
147  const uint8x8_t maskLeft_u_8x8 = vcreate_u8(uint64_t(-1) << (8u - blocks1) * 8u);
148 
149  uint32x4_t sum_u_32x4 = vdupq_n_u32(0u);
150 
151  uint32_t sumIndividual = 0u;
152 
153  for (unsigned int y = 0u; y < tPatchSize; ++y)
154  {
155  for (unsigned int n = 0u; n < blocks16; ++n)
156  {
157  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
158  const uint8x16_t absDifference_u_8x16 = vabdq_u8(vld1q_u8(patch0), vld1q_u8(patch1));
159 
160  const uint16x8_t absDifference_u_16x8 = vaddl_u8(vget_low_u8(absDifference_u_8x16), vget_high_u8(absDifference_u_8x16));
161 
162  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
163 
164  patch0 += 16;
165  patch1 += 16;
166  }
167 
168  for (unsigned int n = 0u; n < blocks8; ++n)
169  {
170  // [|patch0[0] - patch1[0]|, |patch0[1] - patch1[1]|, ..]
171  const uint16x8_t absDifference_u_16x8 = vabdl_u8(vld1_u8(patch0), vld1_u8(patch1));
172 
173  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
174 
175  patch0 += 8;
176  patch1 += 8;
177  }
178 
179  if constexpr (blocks1 != 0u)
180  {
181  if (blocks1 >= 3u)
182  {
183  // we have enough elements left so that using NEON is still faster than handling each element individually
184 
185  if (y < tPatchSize - 1u)
186  {
187  const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0), maskRight_u_8x8);
188  const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1), maskRight_u_8x8);
189 
190  const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
191 
192  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
193  }
194  else
195  {
196  constexpr unsigned int overlapElements = 8u - blocks1;
197  static_assert(overlapElements >= 1u && overlapElements < 8u, "Invalid number!");
198 
199  const uint8x8_t remaining0_u_8x8 = vand_u8(vld1_u8(patch0 - overlapElements), maskLeft_u_8x8);
200  const uint8x8_t remaining1_u_8x8 = vand_u8(vld1_u8(patch1 - overlapElements), maskLeft_u_8x8);
201 
202  const uint16x8_t absDifference_u_16x8 = vabdl_u8(remaining0_u_8x8, remaining1_u_8x8);
203 
204  sum_u_32x4 = vpadalq_u16(sum_u_32x4, absDifference_u_16x8);
205  }
206  }
207  else
208  {
209  for (unsigned int n = 0u; n < blocks1; ++n)
210  {
211  sumIndividual += uint32_t(abs(int32_t(patch0[n]) - int32_t(patch1[n])));
212  }
213  }
214 
215  patch0 += blocks1;
216  patch1 += blocks1;
217  }
218 
219  patch0 += patch0StrideElements - patchWidthElements;
220  patch1 += patch1StrideElements - patchWidthElements;
221  }
222 
223  uint32_t results[4];
224  vst1q_u32(results, sum_u_32x4);
225 
226  return results[0] + results[1] + results[2] + results[3] + sumIndividual;
227 }
228 
229 template <unsigned int tChannels, unsigned int tPatchSize>
230 inline uint32_t SumAbsoluteDifferencesNEON::patchBuffer8BitPerChannel(const uint8_t* patch0, const uint8_t* buffer1, const unsigned int patch0StrideElements)
231 {
232  return patch8BitPerChannel<tChannels, tPatchSize>(patch0, buffer1, patch0StrideElements, tChannels * tPatchSize);
233 }
234 
235 }
236 
237 }
238 
239 #endif // OCEAN_HARDWARE_NEON_VERSION >= 10
240 
241 #endif // META_OCEAN_CV_SUM_ABSOLUTE_DIFFERENCES_NEON_H
This class implements functions calculating the sum of absolute differences with NEON instructions.
Definition: SumAbsoluteDifferencesNEON.h:30
static uint32_t patchBuffer8BitPerChannel(const uint8_t *patch0, const uint8_t *buffer1, const unsigned int patch0StrideElements)
Returns the sum of absolute differences between an image patch and a buffer.
Definition: SumAbsoluteDifferencesNEON.h:230
static uint32_t patch8BitPerChannel(const uint8_t *patch0, const uint8_t *patch1, const unsigned int patch0StrideElements, const unsigned int patch1StrideElements)
Returns the sum of absolute differences between two patches within an image.
Definition: SumAbsoluteDifferencesNEON.h:128
static uint32_t buffer8BitPerChannel(const uint8_t *buffer0, const uint8_t *buffer1)
Returns the sum of absolute differences between two memory buffers.
Definition: SumAbsoluteDifferencesNEON.h:70
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15