Ocean
cv/detector/Descriptor.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_CV_DETECTOR_DESCRIPTOR_H
9 #define META_OCEAN_CV_DETECTOR_DESCRIPTOR_H
10 
12 
13 #include "ocean/cv/NEON.h"
14 #include "ocean/cv/SSE.h"
15 
16 #include <bitset>
17 
18 namespace Ocean
19 {
20 
21 namespace CV
22 {
23 
24 namespace Detector
25 {
26 
27 /**
28  * This class implements the abstract base for arbitrary descriptors and provides functions determining the hamming distance between binary descriptors.
29  * @ingroup cvdetector
30  */
31 class OCEAN_CV_DETECTOR_EXPORT Descriptor
32 {
33  public:
34 
35  /**
36  * Creates a new descriptor object.
37  */
38  inline Descriptor();
39 
40  /**
41  * Determines the hamming distance between two binary descriptors.
42  * @param descriptorA The first descriptor, must be valid and must provide at least tBits / 8 bytes
43  * @param descriptorB The second descriptor, must be valid and must provide at least tBits / 8 bytes
44  * @return The hamming distance between both descriptors (the number of not identical corresponding bits), with range [0, tBits]
45  * @tparam tBits The number of bits both descriptors have, with range [128, infinity), must be a multiple of 128
46  */
47  template <unsigned int tBits>
48  static OCEAN_FORCE_INLINE unsigned int calculateHammingDistance(const void* descriptorA, const void* descriptorB);
49 
50  protected:
51 
52 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 30
53 
54  /**
55  * Calculates the pop count of an m128i register individually for each of the sixteen 8 bit groups.
56  * @param value The 128 bit value for which the number of set bits will be counted
57  * @return The sixteen pop counts, one 8 bit counter for each byte of the given value, each with range [0, 8]
58  */
59  static OCEAN_FORCE_INLINE __m128i popcount8(const __m128i value);
60 
61  /**
62  * Calculates the pop count of an m128i register individually for each of the two 64 bit groups.
63  * @param value The 128 bit value for which the number of set bits will be counted
64  * @return The two pop counts, one counter in each 64 bit half of the resulting register, each with range [0, 64]
65  */
66  static OCEAN_FORCE_INLINE __m128i popcount64(const __m128i value);
67 
68 #endif
69 
70 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 42
71 
72  /**
73  * Calculates the pop count of an entire m128i register (all 128 bits).
74  * @param value The 128 bit value for which the number of set bits will be counted
75  * @return The number of set bits in the given value, with range [0, 128]
76  */
77  static OCEAN_FORCE_INLINE unsigned int popcount128(const __m128i value);
78 
79 #endif
80 };
81 
83 {
84  // nothing to here
85 }
86 
87 template <>
88 OCEAN_FORCE_INLINE unsigned int Descriptor::calculateHammingDistance<128u>(const void* descriptorA, const void* descriptorB)
89 {
90 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 42
91 
92  // the following code uses the following SSE instructions, and needs SSE4.2 or higher
93 
94  // SSE2:
95  // _mm_lddqu_si128
96  // _mm_xor_si128
97 
98  // see also popcount128()
99 
100  const __m128i descriptorA_m128 = SSE::load128i(descriptorA);
101  const __m128i descriptorB_m128 = SSE::load128i(descriptorB);
102 
103  const __m128i xor_m128 = _mm_xor_si128(descriptorA_m128, descriptorB_m128);
104 
105  return popcount128(xor_m128);
106 
107 #elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 30
108 
109  // the following code uses the following SSE instructions, and needs SSE3 or higher
110 
111  // SSE2:
112  // _mm_load1_pd
113  // _mm_loadu_pd
114  // _mm_mul_pd
115  // _mm_add_pd
116  // _mm_storeu_pd
117 
118  // see also popcount64()
119 
120  const __m128i descriptorA_m128 = SSE::load128i(descriptorA);
121  const __m128i descriptorB_m128 = SSE::load128i(descriptorB);
122 
123  const __m128i xor_m128 = _mm_xor_si128(descriptorA_m128, descriptorB_m128);
124 
125  const __m128i countLowHigh_m128_64 = popcount64(xor_m128);
126  const __m128i countHigh_m128_64 = _mm_unpackhi_epi64(countLowHigh_m128_64, countLowHigh_m128_64);
127  const __m128i count_m128 = _mm_add_epi32(countLowHigh_m128_64, countHigh_m128_64);
128 
129  return _mm_cvtsi128_si32(count_m128);
130 
131 #elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
132 
133  const uint8x16_t descriptorA_u_8x16 = vld1q_u8((const uint8_t*)(descriptorA));
134  const uint8x16_t descriptorB_u_8x16 = vld1q_u8((const uint8_t*)(descriptorB));
135 
136  const uint8x16_t xor_u_8x16 = veorq_u8(descriptorA_u_8x16, descriptorB_u_8x16);
137 
138  const uint8x16_t count_u_8x16 = vcntq_u8(xor_u_8x16);
139  const uint16x8_t count_u_16x8 = vpaddlq_u8(count_u_8x16);
140  const uint32x4_t count_u_32x4 = vpaddlq_u16(count_u_16x8);
141 
142  return NEON::sum32x4ByLanes(count_u_32x4);
143 
144 #else
145 
146  typedef std::bitset<128> Bitset;
147  static_assert(sizeof(Bitset) == 128u / 8u, "Invalid data type!");
148 
149  Bitset bitsetA, bitsetB;
150 
151  memcpy(&bitsetA, descriptorA, sizeof(Bitset));
152  memcpy(&bitsetB, descriptorB, sizeof(Bitset));
153 
154  return (unsigned int)(bitsetA ^ bitsetB).count();
155 
156 #endif
157 }
158 
159 template <>
160 OCEAN_FORCE_INLINE unsigned int Descriptor::calculateHammingDistance<256u>(const void* descriptorA, const void* descriptorB)
161 {
162 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 42
163 
164  // the following code uses the following SSE instructions, and needs SSE4.2 or higher
165 
166  // SSE2:
167  // _mm_lddqu_si128
168  // _mm_xor_si128
169 
170  // see also popcount128()
171 
172  const __m128i descriptorA_m128_0 = SSE::load128i(((const __m128i*)descriptorA) + 0);
173  const __m128i descriptorA_m128_1 = SSE::load128i(((const __m128i*)descriptorA) + 1);
174 
175  const __m128i descriptorB_m128_0 = SSE::load128i(((const __m128i*)descriptorB) + 0);
176  const __m128i descriptorB_m128_1 = SSE::load128i(((const __m128i*)descriptorB) + 1);
177 
178  const __m128i xor_m128_0 = _mm_xor_si128(descriptorA_m128_0, descriptorB_m128_0);
179  const __m128i xor_m128_1 = _mm_xor_si128(descriptorA_m128_1, descriptorB_m128_1);
180 
181  return popcount128(xor_m128_0) + popcount128(xor_m128_1);
182 
183 #elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 30
184 
185  // the following code uses the following SSE instructions, and needs SSE3 or higher
186 
187  // SSE2:
188  // _mm_load1_pd
189  // _mm_loadu_pd
190  // _mm_mul_pd
191  // _mm_add_pd
192  // _mm_storeu_pd
193 
194  // see also popcount64()
195 
196  const __m128i descriptorA_m128_0 = SSE::load128i(((const __m128i*)descriptorA) + 0);
197  const __m128i descriptorA_m128_1 = SSE::load128i(((const __m128i*)descriptorA) + 1);
198 
199  const __m128i descriptorB_m128_0 = SSE::load128i(((const __m128i*)descriptorB) + 0);
200  const __m128i descriptorB_m128_1 = SSE::load128i(((const __m128i*)descriptorB) + 1);
201 
202  const __m128i xor_m128_0 = _mm_xor_si128(descriptorA_m128_0, descriptorB_m128_0);
203  const __m128i xor_m128_1 = _mm_xor_si128(descriptorA_m128_1, descriptorB_m128_1);
204 
205  const __m128i countLowHigh_m128_64_0 = popcount64(xor_m128_0);
206  const __m128i countHigh_m128_64_0 = _mm_unpackhi_epi64(countLowHigh_m128_64_0, countLowHigh_m128_64_0);
207  const __m128i count_m128_0 = _mm_add_epi32(countLowHigh_m128_64_0, countHigh_m128_64_0);
208  const unsigned int hammingDistance_0 = _mm_cvtsi128_si32(count_m128_0);
209 
210  const __m128i countLowHigh_m128_64_1 = popcount64(xor_m128_1);
211  const __m128i countHigh_m128_64_1 = _mm_unpackhi_epi64(countLowHigh_m128_64_1, countLowHigh_m128_64_1);
212  const __m128i count_m128_1 = _mm_add_epi32(countLowHigh_m128_64_1, countHigh_m128_64_1);
213  const unsigned int hammingDistance_1 = _mm_cvtsi128_si32(count_m128_1);
214 
215  return hammingDistance_0 + hammingDistance_1;
216 
217 #elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
218 
219  const uint8x16_t descriptorA_u_8x16_0 = vld1q_u8((const uint8_t*)(descriptorA) + 0);
220  const uint8x16_t descriptorA_u_8x16_1 = vld1q_u8((const uint8_t*)(descriptorA) + 16);
221 
222  const uint8x16_t descriptorB_u_8x16_0 = vld1q_u8((const uint8_t*)(descriptorB) + 0);
223  const uint8x16_t descriptorB_u_8x16_1 = vld1q_u8((const uint8_t*)(descriptorB) + 16);
224 
225  const uint8x16_t xor_u_8x16_0 = veorq_u8(descriptorA_u_8x16_0, descriptorB_u_8x16_0);
226  const uint8x16_t xor_u_8x16_1 = veorq_u8(descriptorA_u_8x16_1, descriptorB_u_8x16_1);
227 
228  const uint8x16_t count_u_8x16 = vaddq_u8(vcntq_u8(xor_u_8x16_0), vcntq_u8(xor_u_8x16_1));
229  const uint16x8_t count_u_16x8 = vpaddlq_u8(count_u_8x16);
230  const uint32x4_t count_u_32x4 = vpaddlq_u16(count_u_16x8);
231 
232  return NEON::sum32x4ByLanes(count_u_32x4);
233 
234 #else
235 
236  typedef std::bitset<256> Bitset;
237  static_assert(sizeof(Bitset) == 256u / 8u, "Invalid data type!");
238 
239  Bitset bitsetA, bitsetB;
240 
241  memcpy(&bitsetA, descriptorA, sizeof(Bitset));
242  memcpy(&bitsetB, descriptorB, sizeof(Bitset));
243 
244  return (unsigned int)(bitsetA ^ bitsetB).count();
245 
246 #endif
247 }
248 
249 template <unsigned int tBits>
250 OCEAN_FORCE_INLINE unsigned int Descriptor::calculateHammingDistance(const void* descriptorA, const void* descriptorB)
251 {
252  static_assert(tBits >= 128u && tBits % 128u == 0u, "Invalid bit number!");
253 
254 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 42
255 
256  // the following code uses the following SSE instructions, and needs SSE4.2 or higher
257 
258  // SSE2:
259  // _mm_lddqu_si128
260  // _mm_xor_si128
261 
262  // see also popcount128()
263 
264  unsigned int result = 0u;
265 
266  for (unsigned int n = 0u; n < tBits / 128u; ++n)
267  {
268  const __m128i descriptorA_m128 = SSE::load128i(((const __m128i*)descriptorA) + n);
269  const __m128i descriptorB_m128 = SSE::load128i(((const __m128i*)descriptorB) + n);
270 
271  const __m128i xor_m128 = _mm_xor_si128(descriptorA_m128, descriptorB_m128);
272 
273  result += popcount128(xor_m128);
274  }
275 
276  return result;
277 
278 #elif defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 30
279 
280  // the following code uses the following SSE instructions, and needs SSE3 or higher
281 
282  // SSE2:
283  // _mm_load1_pd
284  // _mm_loadu_pd
285  // _mm_mul_pd
286  // _mm_add_pd
287  // _mm_storeu_pd
288 
289  // see also popcount64()
290 
291  unsigned int result = 0u;
292 
293  for (unsigned int n = 0u; n < tBits / 128u; ++n)
294  {
295  const __m128i descriptorA_m128 = SSE::load128i(((const __m128i*)descriptorA) + n);
296  const __m128i descriptorB_m128 = SSE::load128i(((const __m128i*)descriptorB) + n);
297 
298  const __m128i xor_m128 = _mm_xor_si128(descriptorA_m128, descriptorB_m128);
299 
300  const __m128i countLowHigh_m128_64 = popcount64(xor_m128);
301  const __m128i countHigh_m128_64 = _mm_unpackhi_epi64(countLowHigh_m128_64, countLowHigh_m128_64);
302  const __m128i count_m128 = _mm_add_epi32(countLowHigh_m128_64, countHigh_m128_64);
303 
304  result += _mm_cvtsi128_si32(count_m128);
305  }
306 
307  return result;
308 
309 #elif defined(OCEAN_HARDWARE_NEON_VERSION) && OCEAN_HARDWARE_NEON_VERSION >= 10
310 
311  uint32x4_t result_u_32x4 = vdupq_n_u32(0u);
312 
313  for (unsigned int n = 0u; n < tBits / 128u; ++n)
314  {
315  const uint8x16_t descriptorA_u_8x16 = vld1q_u8((const uint8_t*)(descriptorA) + 16u * n);
316  const uint8x16_t descriptorB_u_8x16 = vld1q_u8((const uint8_t*)(descriptorB) + 16u * n);
317 
318  const uint8x16_t xor_u_8x16 = veorq_u8(descriptorA_u_8x16, descriptorB_u_8x16);
319 
320  const uint8x16_t count_u_8x16 = vcntq_u8(xor_u_8x16);
321  const uint16x8_t count_u_16x8 = vpaddlq_u8(count_u_8x16);
322  const uint32x4_t count_u_32x4 = vpaddlq_u16(count_u_16x8);
323 
324  result_u_32x4 = vaddq_u32(result_u_32x4, count_u_32x4);
325  }
326 
327  return NEON::sum32x4ByLanes(result_u_32x4);
328 
329 #else
330 
331  typedef std::bitset<tBits> Bitset;
332  static_assert(sizeof(Bitset) == tBits / 8u, "Invalid data type!");
333 
334  Bitset bitsetA, bitsetB;
335 
336  memcpy(&bitsetA, descriptorA, sizeof(Bitset));
337  memcpy(&bitsetB, descriptorB, sizeof(Bitset));
338 
339  return (unsigned int)(bitsetA ^ bitsetB).count();
340 
341 #endif
342 }
343 
344 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 30
345 
346 OCEAN_FORCE_INLINE __m128i Descriptor::popcount8(const __m128i value)
347 {
348  // the following code uses the following SSE instructions, and needs SSE3 or higher
349 
350  // SSE2:
351  // _mm_set1_epi8
352  // _mm_setr_epi8
353  // _mm_and_si128
354  // _mm_add_epi8
355 
356  // SSE3:
357  // _mm_shuffle_epi8
358 
359  const __m128i popcount_mask = _mm_set1_epi8(0x0F);
360  const __m128i popcount_table = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
361  const __m128i pcnt0 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(value, popcount_mask));
362  const __m128i pcnt1 = _mm_shuffle_epi8(popcount_table, _mm_and_si128(_mm_srli_epi16(value, 4), popcount_mask));
363  return _mm_add_epi8(pcnt0, pcnt1);
364 }
365 
366 OCEAN_FORCE_INLINE __m128i Descriptor::popcount64(const __m128i value)
367 {
368  // the following code uses the following SSE instructions, and needs SSE3 or higher
369 
370  // SSE2:
371  // _mm_sad_epu8
372  // _mm_setzero_si128
373 
374  // see also popcount8()
375 
376  const __m128i cnt8 = popcount8(value);
377  return _mm_sad_epu8(cnt8, _mm_setzero_si128());
378 }
379 
380 #endif
381 
382 #if defined(OCEAN_HARDWARE_SSE_VERSION) && OCEAN_HARDWARE_SSE_VERSION >= 42
383 
384 OCEAN_FORCE_INLINE unsigned int Descriptor::popcount128(const __m128i value)
385 {
386  // the following code uses the following SSE instructions, and needs SSE4.2 or higher
387 
388  // SSE2:
389  // _mm_cvtsi128_si64
390  // _mm_srli_si128
391 
392  // SSE4.2:
393  // __popcnt64
394  // __popcnt
395 
396 #if defined(_WIN64) || TARGET_OS_MAC == 1
397 
398  return (unsigned int)__popcnt64(_mm_cvtsi128_si64(value)) + (unsigned int)__popcnt64(_mm_cvtsi128_si64(_mm_srli_si128(value, 8)));
399 
400 #else
401 
402  return (unsigned int)__popcnt(_mm_cvtsi128_si32(value)) + (unsigned int)__popcnt(_mm_cvtsi128_si32(_mm_srli_si128(value, 4))) + (unsigned int)__popcnt(_mm_cvtsi128_si32(_mm_srli_si128(value, 4))) + (unsigned int)__popcnt(_mm_cvtsi128_si32(_mm_srli_si128(value, 4)));
403 
404 #endif
405 }
406 
407 #endif
408 
409 }
410 
411 }
412 
413 }
414 
415 #endif // META_OCEAN_CV_DETECTOR_DESCRIPTOR_H
This class implements the abstract base for arbitrary descriptors.
Definition: cv/detector/Descriptor.h:32
static OCEAN_FORCE_INLINE __m128i popcount8(const __m128i value)
Calculates a pop count of an m128i register in 8 bit groups.
Definition: cv/detector/Descriptor.h:346
static OCEAN_FORCE_INLINE unsigned int popcount128(const __m128i value)
Calculates the pop count of an entire m128i register (all 128 bits).
Definition: cv/detector/Descriptor.h:384
Descriptor()
Creates a new descriptor object.
Definition: cv/detector/Descriptor.h:82
static OCEAN_FORCE_INLINE __m128i popcount64(const __m128i value)
Calculates a pop count of an m128i register in 64 bit groups.
Definition: cv/detector/Descriptor.h:366
static OCEAN_FORCE_INLINE unsigned int calculateHammingDistance(const void *descriptorA, const void *descriptorB)
Determines the hamming distance between two binary descriptors.
Definition: cv/detector/Descriptor.h:250
static OCEAN_FORCE_INLINE unsigned int sum32x4ByLanes(const uint32x4_t &value)
Sums the four 32 bit values and returns the result.
Definition: NEON.h:1085
static __m128i load128i(const void *const buffer)
Loads a 128i value from the memory.
Definition: SSE.h:3619
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15