Ocean
ClusteringKMeans.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #ifndef META_OCEAN_MATH_CLUSTERING_K_MEANS_H
9 #define META_OCEAN_MATH_CLUSTERING_K_MEANS_H
10 
11 #include "ocean/math/Math.h"
12 
13 #include "ocean/base/RandomI.h"
15 #include "ocean/base/Utilities.h"
16 #include "ocean/base/Worker.h"
17 
18 #include <climits>
19 
20 namespace Ocean
21 {
22 
23 /**
24  * This class implements the base class for all classes providing clustering algorithms.
25  * The data (the observations) that will be distributed into individual clusters can be provided with two different modes.<br>
26  * The first mode clusters the data elements by their indices, thus the data elements must be provided as joined data block (an array of elements).<br>
27  * Use the first mode by setting tUseIndices to 'True'.<br>
28  * The second mode clusters the data elements by their pointers, thus the data elements may lie at arbitrary positions in the memory.<br>
29  * Use the second mode by setting tUseIndices to 'False'.
30  * @tparam tUseIndices True,
31  * @ingroup math
32  */
33 template <bool tUseIndices>
35 {
36  public:
37 
38  /**
39  * This class implements the abstract data object which will be specialized for both data modes toggled by tUseIndices.
40  * @tparam T The data type of each element of an observation
41  * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
42  */
43  template <typename T, size_t tDimension>
44  class Data
45  {
46  // nothing to do here
47  };
48 };
49 
50 /**
51  * Specialization for Clustering<true>::Data<T, tDimension>.
52  * This data class implements the first data mode of the clustering class identifying observations by their indices.<br>
53  * @tparam T The data type of each element of an observation
54  * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
55  */
56 template <>
57 template <typename T, size_t tDimension>
58 class Clustering<true>::Data
59 {
60  public:
61 
62  /**
63  * Definition of an observation object.
64  */
66 
67  /**
68  * Definition of an index that addresses one specific observation element in the data object that stores all observations.
69  */
70  typedef size_t DataIndex;
71 
72  /**
73  * Definition of a vector holding indices to the data object.
74  */
75  typedef std::vector<DataIndex> DataIndices;
76 
77  public:
78 
79  /**
80  * Creates a new empty data object.
81  */
82  Data() = default;
83 
84  /**
85  * Creates a new data object by observations lying in a joined memory block as array.
86  * Due to performance issues: The given observations can be copied or used directly without any memory copy.<br>
87  * Beware: If no copy of the observations is created, the given observations must exist as long as this data object (or the corresponding clustering object) exists.
88  * @param observations The first observation in the given joined memory block
89  * @param numberObservations The number of observations that are provided, with range [1, infinity)
90  * @param copyObservations True, to copy the given observations; False, to simply use the given observations as reference
91  */
92  Data(const Observation* observations, const size_t numberObservations, const bool copyObservations = false);
93 
94  /**
95  * Move constructor for an data object.
96  * @param data The data object to be moved
97  */
98  inline Data(Data<T, tDimension>&& data) noexcept;
99 
100  /**
101  * Returns one specific observation of this data object specified by the data-index of this observation.
102  * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
103  * @return The specified observation
104  * @see isValidDataIndex().
105  */
106  inline const Observation& observation(const DataIndex& dataIndex) const;
107 
108  /**
109  * Returns the number of observations that are stored by this data object.
110  * @return Number of observations
111  */
112  inline size_t numberObservations() const;
113 
114  /**
115  * Returns whether a given data-index is valid and has a corresponding observation stored in this data object.
116  * @param dataIndex The data-index that will be checked
117  * @return True, if so
118  */
119  inline bool isValidDataIndex(const DataIndex& dataIndex) const;
120 
121  /**
122  * Returns one specific observation of this data object specified by the data-index of this observation.
123  * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
124  * @return The specified observation
125  * @see isValidDataIndex(), observation()..
126  */
127  inline const Observation& operator[](const DataIndex& dataIndex) const;
128 
129  /**
130  * Move operator.
131  * @param data The data object that will be moved to this object.
132  * @return Reference to this object.
133  */
135 
136  /**
137  * Returns whether this data object holds at least one observation.
138  * @return True, if so
139  */
140  explicit inline operator bool() const;
141 
142  protected:
143 
144  /// The optional observations that are stored as copy.
145  std::vector<Observation> copyObservations_;
146 
147  /// The observation objects of this data object.
148  const Observation* observations_ = nullptr;
149 
150  /// The number of observation elements of this data object.
151  size_t numberObservations_ = 0;
152 };
153 
154 /**
155  * Specialization for Clustering<true>::Data<T, tDimension>.
156  * This data class implements the second data mode of the clustering class identifying observations by their pointers.<br>
157  * @tparam T The data type of each element of an observation
158  * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
159  */
160 template <>
161 template <typename T, size_t tDimension>
162 class Clustering<false>::Data
163 {
164  public:
165 
166  /**
167  * Definition of an observation object.
168  */
170 
171  /**
172  * Definition of an index that addresses one specific observation element in the data object that stores all observations.
173  */
174  typedef size_t DataIndex;
175 
176  /**
177  * Definition of a vector holding indices to the data object.
178  */
179  typedef std::vector<DataIndex> DataIndices;
180 
181  public:
182 
183  /**
184  * Creates a new empty data object.
185  */
186  Data() = default;
187 
188  /**
189  * Creates a new data object by observations lying at individual memory positions.
190  * Due to performance issues: The given observation !pointers! can be copied or used directly without any copy.<br>
191  * Beware: If no copy of the observation pointers is created, the given observation pointers must exist as long as this data object (or the corresponding clustering object) exists.<br>
192  * Beware: In any case, the observations (not the pointers) must exist as long as this data object (or the corresponding clustering object) exists.<br>
193  * @param observationPointers The first pointer of the observations
194  * @param numberObservations The number of observations/pointers that are provided, with range [1, infinity)
195  * @param copyPointers True, to copy the given observation pointers (not the observations); False, to simply use the given observation pointers as reference
196  */
197  Data(const Observation** observationPointers, const size_t numberObservations, const bool copyPointers = false);
198 
199  /**
200  * Move constructor for an data object.
201  * @param data The data object to be moved
202  */
203  inline Data(Data<T, tDimension>&& data) noexcept;
204 
205  /**
206  * Returns one specific observation of this data object specified by the data-index of this observation.
207  * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
208  * @return The specified observation
209  * @see isValidDataIndex().
210  */
211  inline const Observation& observation(const DataIndex& dataIndex) const;
212 
213  /**
214  * Returns the number of observations that are stored by this data object.
215  * @return Number of observations
216  */
217  inline size_t numberObservations() const;
218 
219  /**
220  * Returns whether a given data-index is valid and has a corresponding observation stored in this data object.
221  * @param dataIndex The data-index that will be checked
222  * @return True, if so
223  */
224  inline bool isValidDataIndex(const DataIndex& dataIndex) const;
225 
226  /**
227  * Returns one specific observation of this data object specified by the data-index of this observation.
228  * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
229  * @return The specified observation
230  * @see isValidDataIndex(), observation()..
231  */
232  inline const Observation& operator[](const DataIndex& dataIndex) const;
233 
234  /**
235  * Move operator.
236  * @param data The data object that will be moved to this object.
237  * @return Reference to this object.
238  */
240 
241  /**
242  * Returns whether this data object holds at least one observation.
243  * @return True, if so
244  */
245  explicit inline operator bool() const;
246 
247  protected:
248 
249  /// The optional observation pointers that are stored as copy.
250  std::vector<const Observation*> copyObservationPointers_;
251 
252  /// The observation pointers of this data object.
253  const Observation** observationPointers_ = nullptr;
254 
255  /// The number of observation elements of this data object.
256  size_t numberObservations_;
257 };
258 
259 /**
260  * This class implements a k-means clustering algorithm.
261  * Beware: Due to performance issues, this class does not copy the given observation values, this expects that the given observation values exist as long as the KMean object exists.<br>
262  * @tparam T The data type of each element of an observation
263  * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
264  * @tparam TSum The data type of the intermediate sum values, that is necessary to determine e.g. the mean parameters
265  * @tparam TSquareDistance The data type of the square distance value, might be different from T
266  * @ingroup math
267  */
268 template <typename T, size_t tDimension, typename TSum = T, typename TSquareDistance = T, bool tUseIndices = true>
269 class ClusteringKMeans : public Clustering<tUseIndices>
270 {
271  public:
272 
273  /**
274  * Definition of individual initialization strategies.
275  */
277  {
278  /// The first cluster is determined by selection of the (euclidean) smallest observation, the remaining clusters are defined by observations with largest distance to the already existing clusters.
280  /// All clusters are selected randomly.
281  IS_RANDOM
282  };
283 
284  /**
285  * (Re-)Definition of a data object providing the data which will be clustered.
286  */
288 
289  /**
290  * (Re-)Definition of an index that addresses one specific observation element in the data object that stores all observations.
291  */
292  typedef typename Data::DataIndex DataIndex;
293 
294  /**
295  * (Re-)Definition of a vector holding (size_t) indices.
296  */
297  typedef typename Data::DataIndices DataIndices;
298 
299  /**
300  * (Re-)Definition of an observation object.
301  */
302  typedef typename Data::Observation Observation;
303 
304  /**
305  * This class implements one cluster that holds the mean values of all observations belonging to this cluster and the indices of all observations belonging to this cluster.
306  */
307  class Cluster
308  {
309  // Two friend class declaration, necessary as the constructors of this class are all protected.
310  friend class ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>;
311  friend class std::allocator<Cluster>;
312 
313  public:
314 
315  /**
316  * Move constructor for another cluster object.
317  * @param cluster The cluster object to be moved
318  */
319  inline Cluster(Cluster&& cluster) noexcept;
320 
321  /**
322  * Returns the mean observation value of this cluster.
323  * @return The cluster's mean observation value
324  */
325  inline const Observation& mean() const;
326 
327  /**
328  * Returns the indices of the observations that belong to this cluster.
329  * @return The cluster's observation indices
330  */
331  inline const DataIndices& dataIndices() const;
332 
333  /**
334  * Returns the square distance between a given observation and this cluster (the mean observation value of this cluster).
335  * @param observation The observation for that the distance is returned
336  * @return Resulting square distance
337  */
338  inline TSquareDistance sqrDistance(const Observation& observation) const;
339 
340  /**
341  * Calculates the maximal square distance between the mean observation value of this cluster and all observations which belong to this cluster.
342  * @param observationIndex Optional resulting index of the observation with maximal distance to the mean observation value, if defined
343  * @return Maximal square distance
344  */
345  TSquareDistance maximalSqrDistance(DataIndex* observationIndex = nullptr) const;
346 
347  /**
348  * Calculates the average square distance between the mean observation value of this cluster and all observations which belong to this cluster.
349  * @return Average square distance
350  */
351  TSquareDistance averageSqrDistance() const;
352 
353  /**
354  * Move operator moving a cluster object to this object.
355  * @param cluster The cluster object to be moved
356  * @return Reference to this object
357  */
358  inline Cluster& operator=(Cluster&& cluster) noexcept;
359 
360  /**
361  * Returns whether the left cluster has less elements than the right cluster.
362  * @param cluster The right cluster to compare
363  * @return True, if so
364  */
365  inline bool operator<(const Cluster& cluster) const;
366 
367  protected:
368 
369  /**
370  * Creates a new cluster object by a given mean value.
371  * @param owner The owner of this cluster object
372  * @param mean The mean observation value that will define this cluster
373  */
375 
376  /**
377  * Creates a new cluster object by a given mean value and several indices of observations that belong to this cluster.
378  * @param owner The owner of this cluster object
379  * @param mean The mean observation value that will define this cluster
380  * @param dataIndices The data-indices of the observations that belong to this cluster
381  */
383 
384  /**
385  * Creates a new cluster object by a given mean value and several indices of observations that belong to this cluster.
386  * @param owner The owner of this cluster object
387  * @param mean The mean observation value that will define this cluster
388  * @param dataIndices The data-indices of the observations that belong to this cluster; beware: the indices will be moved to this cluster object
389  */
391 
392  /**
393  * Returns the data-indices of the observations that belong to this cluster.
394  * @return The cluster's observation indices
395  */
397 
398  /**
399  * Updates the mean observation value of this cluster by application of the stored indices of all observations that belong to this cluster.
400  */
401  void updateMean();
402 
403  protected:
404 
405  /// The owner of this cluster.
407 
408  /// The mean observation value of this cluster.
410 
411  /// The data indices of all observation that belong to this cluster.
413  };
414 
415  /**
416  * Definition of a vector holding cluster objects.
417  */
418  typedef std::vector<Cluster> Clusters;
419 
420  public:
421 
422  /**
423  * Creates an empty k-means object.
424  */
426 
427  /**
428  * Move constructor.
429  * @param clustering The clustering object to be moved
430  */
431  inline ClusteringKMeans(ClusteringKMeans&& clustering) noexcept;
432 
433  /**
434  * Creates a new k-means object by a given data object.
435  * @param data The data object to be used to determine the clusters.
436  * @see determineClusters().
437  */
438  inline explicit ClusteringKMeans(const Data &data);
439 
440  /**
441  * Creates a new k-means object by a given data object.
442  * @param data The data object that will be moved and used to determine the clusters.
443  * @see determineClusters().
444  */
445  inline explicit ClusteringKMeans(Data &&data);
446 
447  /**
448  * Returns the clusters of this k-means clustering object.
449  * @return The determined k-means clusters
450  */
451  inline const Clusters& clusters() const;
452 
453  /**
454  * Sorts the clusters regarding their number of elements.
455  */
456  void sortClusters();
457 
458  /**
459  * Calculates the maximal square distance between the mean observation value of each clusters and all observations belonging to the cluster.
460  * @return Maximal square distance for all clusters
461  */
462  TSquareDistance maximalSqrDistance() const;
463 
464  /**
465  * Determines the clusters for this object, ensure that this object has been initialized with a valid set of observations.
466  * @param numberClusters The number of clusters that will be created, with range [1, numberObservations())
467  * @param strategy The initialization strategy for the first clusters
468  * @param iterations The number of optimization iterations that are applied after the initial clusters have been determined, with range [1, infinity)
469  * @param worker Optional worker object to distribute the computation
470  * @see clusters().
471  */
472  void determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy = IS_LARGEST_DISTANCE, const size_t iterations = 5, Worker* worker = nullptr);
473 
474  /**
475  * Determines the clusters for this object, ensure that this object has been initialized with a valid set of observations.
476  * This function adds new clusters within several iterations until the defined maximalSqrDistance is larger than the distance within all clusters or until the defined maximal number of clusters is reached.<br>
477  * @param maximalSqrDistance The maximal square distance in the final clusters between the clusters' mean observation values and the observations in the clusters
478  * @param maximalClusters The maximal number of clusters that will be created (even if maximalSqrDistance is not reached), with range [0, infinity), define 0 to ignore this parameter
479  * @param iterations The number of optimization iterations that are applied after each time a new cluster is added [1, infinity)
480  * @param worker Optional worker object to distribute the computation
481  */
482  void determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters = 0, const size_t iterations = 5, Worker* worker = nullptr);
483 
484  /**
485  * Adds a new clusters for this object.
486  * @param iterations The number of optimization iterations that are applied after the new cluster has been added, with range [1, infinity)
487  * @param sqrDistance The minimal square distance between the cluster's mean and an observation of this cluster so that this cluster is divided into two clusters
488  * @param worker Optional worker object to distribute the computation
489  * @return True, if a new cluster have been added, False if no further cluster could be added or if the provided distance was too large
490  */
491  bool addCluster(const size_t iterations = 5, TSquareDistance sqrDistance = TSquareDistance(0), Worker* worker = nullptr);
492 
493  /**
494  * Removes one cluster from this object.
495  * The cluster with smallest maximal distance of all observations to the mean observation value of the clusters is removed.
496  * @param iterations The number of optimization iterations that are applied after the cluster has been removed, with range [1, infinity)
497  * @param worker Optional worker object to distribute the computation
498  */
499  void removeCluster(const size_t iterations = 5, Worker* worker = nullptr);
500 
501  /**
502  * Finds a best matching cluster for a given independent observation.
503  * However, the observation is not added to this cluster, it's simply a lookup for the best matching cluster.
504  * @param observation The observation for that the best matching cluster is determined
505  * @return The index of the best matching cluster, -1 if no cluster could be found
506  * @see clusters().
507  */
508  size_t findCluster(const Observation& observation);
509 
510  /**
511  * Explicitly applies one further optimization iteration for an existing set of clusters.
512  * Do not call this function before initial clusters have been found.
513  * @see clusters(), determineCluster().
514  */
516 
517  /**
518  * Explicitly applies one further optimization iteration for an existing set of clusters.
519  * Do not call this function before initial clusters have been found.
520  * @param worker The worker object to distribute the computation
521  * @see clusters(), determineCluster().
522  */
524 
525  /**
526  * Clears all determined clusters but registered the data information is untouched.
527  */
528  void clear();
529 
530  /**
531  * Returns whether this object holds a valid set of observations.
532  * @return True, if so
533  */
534  inline bool isValid() const;
535 
536  /**
537  * Returns whether this object holds a valid set of observations.
538  * @return True, if so
539  */
540  explicit inline operator bool() const;
541 
542  /**
543  * Move operator.
544  * @param clustering The clustering object to be moved
545  * @return Reference to this object
546  */
548 
549  protected:
550 
551  /**
552  * Determines the initial clusters for this object with the IS_LARGEST_DISTANCE strategy.
553  * First the smallest observation object is selected as first cluster,<br>
554  * all following clusters are determined by observations that have the largest distance to the already existing clusters.<br>
555  * @param numberClusters The number of initial clusters that will be created.
556  */
557  void determineInitialClustersLargestDistance(const size_t numberClusters);
558 
559  /**
560  * Determines the initial clusters for this object with the IS_RANDOM strategy.
561  * All clusters are created randomly.br>
562  * @param numberClusters The number of initial clusters that will be created.
563  */
564  void determineInitialClustersRandom(const size_t numberClusters);
565 
566  /**
567  * Explicitly applies one further optimization iteration for an existing set of clusters.
568  * This functions operates on a subset of all observations.<br>
569  * @param lock Optional lock object if this function is executed on multiple threads in parallel
570  * @param firstObservation The first observation that will be handled
571  * @param numberObservations The number of observations that will be handled
572  * @see clusters(), determineCluster().
573  */
574  void applyOptimizationIterationSubset(Lock* lock, const unsigned int firstObservation, const unsigned int numberObservations);
575 
576  /**
577  * Determines the smallest observation (euclidean distance to origin) from a set of observations.
578  * @param data The observation data in which the smallest observation is determined, must be valid
579  * @return The index of the smallest observation
580  */
581  static inline DataIndex smallestObservation(const Data& data);
582 
583  /**
584  * Returns the square distance between an observation and the origin.
585  * @param observation The observation for that the square distance is determined
586  * @return Resulting square distance
587  */
588  static inline TSquareDistance sqrDistance(const Observation& observation);
589 
590  protected:
591 
592  /// The data that stores the observations of this clustering object, either with index-access or pointer-access.
594 
595  /// The current clusters of this object.
597 };
598 
599 template <>
600 template <typename T, size_t tDimension>
601 Clustering<true>::Data<T, tDimension>::Data(const Observation* observations, const size_t numberObservations, const bool copyObservations) :
602  copyObservations_(copyObservations ? numberObservations : 0),
603  observations_(nullptr),
604  numberObservations_(0)
605 {
606  ocean_assert(observations && numberObservations > 0);
607 
608  if (copyObservations)
609  {
610  ocean_assert(copyObservations_.size() == numberObservations);
611  memcpy(copyObservations_.data(), observations, sizeof(Observation) * numberObservations);
612 
615  }
616  else
617  {
618  observations_ = observations;
620  }
621 }
622 
623 template <>
624 template <typename T, size_t tDimension>
626  copyObservations_(std::move(data.copyObservations_)),
627  observations_(data.observations_),
628  numberObservations_(data.numberObservations_)
629 {
630  data.observations_ = nullptr;
631  data.numberObservations_ = 0;
632 }
633 
634 template <>
635 template <typename T, size_t tDimension>
636 inline const typename Clustering<true>::Data<T, tDimension>::Observation& Clustering<true>::Data<T, tDimension>::observation(const DataIndex& dataIndex) const
637 {
638  ocean_assert(isValidDataIndex(dataIndex));
639  return observations_[dataIndex];
640 }
641 
642 template <>
643 template <typename T, size_t tDimension>
645 {
646  return numberObservations_;
647 }
648 
649 template <>
650 template <typename T, size_t tDimension>
652 {
653  return dataIndex < numberObservations_;
654 }
655 
656 template <>
657 template <typename T, size_t tDimension>
658 inline const typename Clustering<true>::Data<T, tDimension>::Observation& Clustering<true>::Data<T, tDimension>::operator[](const DataIndex& dataIndex) const
659 {
660  ocean_assert(isValidDataIndex(dataIndex));
661  return observations_[dataIndex];
662 }
663 
664 template <>
665 template <typename T, size_t tDimension>
667 {
668  if (this != &data)
669  {
670  copyObservations_ = std::move(data.copyObservations_);
671  observations_ = data.observations_;
672  numberObservations_ = data.numberObservations_;
673 
674  data.observations_ = nullptr;
675  data.numberObservations_ = 0;
676  }
677 
678  return *this;
679 }
680 
681 template <>
682 template <typename T, size_t tDimension>
683 inline Clustering<true>::Data<T, tDimension>::operator bool() const
684 {
685  return observations_ != nullptr;
686 }
687 
688 template <>
689 template <typename T, size_t tDimension>
690 Clustering<false>::Data<T, tDimension>::Data(const Observation** observationPointers, const size_t numberObservations, const bool copyPointers) :
691  copyObservationPointers_(copyPointers ? numberObservations : 0),
692  observationPointers_(nullptr),
693  numberObservations_(0)
694 {
695  ocean_assert(observationPointers && numberObservations > 0);
696 
697  if (copyPointers)
698  {
699  ocean_assert(copyObservationPointers_.size() == numberObservations);
700  memcpy(copyObservationPointers_.data(), observationPointers, sizeof(Observation*) * numberObservations);
701 
704  }
705  else
706  {
707  observationPointers_ = observationPointers;
709  }
710 }
711 
712 template <>
713 template <typename T, size_t tDimension>
715  copyObservationPointers_(std::move(data.copyObservationPointers_)),
716  observationPointers_(data.observationPointers_),
717  numberObservations_(data.numberObservations_)
718 {
719  data.observationPointers_ = nullptr;
720  data.numberObservations_ = 0;
721 }
722 
723 template <>
724 template <typename T, size_t tDimension>
726 {
727  ocean_assert(isValidDataIndex(dataIndex));
728  return *(observationPointers_[dataIndex]);
729 }
730 
731 template <>
732 template <typename T, size_t tDimension>
734 {
735  return numberObservations_;
736 }
737 
738 template <>
739 template <typename T, size_t tDimension>
741 {
742  return dataIndex < numberObservations_;
743 }
744 
745 template <>
746 template <typename T, size_t tDimension>
747 inline const typename Clustering<false>::Data<T, tDimension>::Observation& Clustering<false>::Data<T, tDimension>::operator[](const DataIndex& dataIndex) const
748 {
749  ocean_assert(isValidDataIndex(dataIndex));
750  return *(observationPointers_[dataIndex]);
751 }
752 
753 template <>
754 template <typename T, size_t tDimension>
756 {
757  if (this != &data)
758  {
759  copyObservationPointers_ = std::move(data.copyObservationPointers_);
760  observationPointers_ = data.observationPointers_;
761  numberObservations_ = data.numberObservations_;
762 
763  data.observationPointers_ = nullptr;
764  data.numberObservations_ = 0;
765  }
766 
767  return *this;
768 }
769 
770 template <>
771 template <typename T, size_t tDimension>
772 inline Clustering<false>::Data<T, tDimension>::operator bool() const
773 {
774  return observationPointers_ != nullptr;
775 }
776 
777 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
779  owner_(cluster.owner_),
780  mean_(std::move(cluster.mean_)),
781  dataIndices_(std::move(cluster.dataIndices_))
782 {
783  cluster.owner_ = nullptr;
784 }
785 
786 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
788  owner_(&owner),
789  mean_(mean)
790 {
791  // nothing to do here
792 }
793 
794 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
796  owner_(&owner),
797  mean_(mean),
798  dataIndices_(dataIndices)
799 {
800  static_assert(tDimension != 0, "Invalid observation dimension!");
801 }
802 
803 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
805  owner_(&owner),
806  mean_(mean),
807  dataIndices_(dataIndices)
808 {
809  static_assert(tDimension != 0, "Invalid observation dimension!");
810 }
811 
812 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
814 {
815  TSquareDistance result = TSquareDistance(0);
816 
817  for (size_t n = 0; n < tDimension; ++n)
818  {
819  result += (mean_[n] - observation[n]) * (mean_[n] - observation[n]);
820  }
821 
822  return result;
823 }
824 
825 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
827 {
828  ocean_assert(owner_);
829 
830  if (dataIndices_.empty())
831  {
832  return TSquareDistance(0);
833  }
834 
835  const Data& data = owner_->data_;
836  TSquareDistance result = TSquareDistance(0);
837 
838  for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
839  {
840  const TSquareDistance localDistance = sqrDistance(data[*i]);
841 
842  if (localDistance > result)
843  {
844  result = localDistance;
845 
846  if (observationIndex)
847  {
848  *observationIndex = *i;
849  }
850  }
851  }
852 
853  return result;
854 }
855 
856 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
858 {
859  ocean_assert(owner_);
860 
861  if (dataIndices_.empty())
862  {
863  return TSquareDistance(0);
864  }
865 
866  const Data& data = owner_->data_;
867  TSquareDistance result = TSquareDistance(0);
868 
869  for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
870  {
871  result += sqrDistance(data[*i]);
872  }
873 
874  ocean_assert(dataIndices_.size() != 0);
875  return result / TSquareDistance(dataIndices_.size());
876 }
877 
878 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
880 {
881  return mean_;
882 }
883 
884 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
886 {
887  return dataIndices_;
888 }
889 
890 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
892 {
893  return dataIndices_;
894 }
895 
896 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
898 {
899  if (this != &cluster)
900  {
901  mean_ = std::move(cluster.mean_);
902  dataIndices_ = std::move(cluster.dataIndices_);
903  owner_ = cluster.owner_;
904 
905  cluster.owner_ = nullptr;
906  }
907 
908  return *this;
909 }
910 
911 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
913 {
914  return dataIndices_.size() < cluster.dataIndices_.size();
915 }
916 
917 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
919 {
920  if (dataIndices_.empty())
921  {
922  for (size_t d = 0; d < tDimension; ++d)
923  mean_[d] = T();
924 
925  return;
926  }
927 
928  StaticBuffer<TSum, tDimension> sumObservation(tDimension, TSum());
929 
930  ocean_assert(owner_);
931  const Data& data = owner_->data_;
932 
933  for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
934  {
935  ocean_assert(*i < data.numberObservations());
936 
937  for (size_t d = 0; d < tDimension; ++d)
938  {
939  sumObservation[d] += data[*i][d];
940  }
941  }
942 
943  const TSum count = TSum(dataIndices_.size());
944 
945  for (size_t d = 0; d < tDimension; ++d)
946  {
947  mean_[d] = T(sumObservation[d] / count);
948  }
949 }
950 
951 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
953 {
954  static_assert(tDimension != 0, "Invalid observation dimension!");
955 }
956 
957 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
959  data_(std::move(clustering.data_)),
960  clusters_(std::move(clustering.clusters_))
961 {
962  static_assert(tDimension != 0, "Invalid observation dimension!");
963 }
964 
965 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
967  data_(data)
968 {
969  static_assert(tDimension != 0, "Invalid observation dimension!");
970 
971  ocean_assert(data_ && "The data element is invalid!");
972 }
973 
974 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
976  data_(std::move(data))
977 {
978  static_assert(tDimension != 0, "Invalid observation dimension!");
979 
980  ocean_assert(data_ && "The data element is invalid!");
981 }
982 
983 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
984 void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy, const size_t iterations, Worker* worker)
985 {
986  ocean_assert(clusters_.empty());
987 
988  if (strategy == IS_LARGEST_DISTANCE)
989  {
990  determineInitialClustersLargestDistance(numberClusters);
991  }
992  else
993  {
994  ocean_assert(strategy == IS_RANDOM);
995  determineInitialClustersRandom(numberClusters);
996  }
997 
998  ocean_assert(iterations >= 1);
999  applyOptimizationIteration(worker);
1000 
1001  for (size_t i = 0; i < iterations; i++)
1002  {
1003  applyOptimizationIteration(worker);
1004  }
1005 }
1006 
1007 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1008 void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters, size_t iterations, Worker* worker)
1009 {
1010  ocean_assert(data_);
1011  ocean_assert(clusters_.empty());
1012 
1013  // find the smallest observation (euclidean distance to the origin)
1014  const size_t firstDataIndex = smallestObservation(data_);
1015  ocean_assert(firstDataIndex != size_t(-1));
1016 
1017  clusters_.push_back(Cluster(*this, data_[firstDataIndex], std::move(createIndices<size_t>(data_.numberObservations(), 0))));
1018 
1019  while (maximalClusters == 0 || clusters_.size() < maximalClusters)
1020  {
1021  if (!addCluster(iterations, maximalSqrDistance, worker))
1022  {
1023  break;
1024  }
1025  }
1026 }
1027 
1028 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1030 {
1031  size_t maximalIndex = size_t(-1);
1032  TSquareDistance maximalDistance = TSquareDistance(0);
1033 
1034  for (size_t c = 0; c < clusters_.size(); ++c)
1035  {
1036  size_t localIndex = size_t(-1);
1037  const TSquareDistance localDistance = clusters_[c].maximalSqrDistance(&localIndex);
1038 
1039  if (localDistance > maximalDistance)
1040  {
1041  maximalDistance = localDistance;
1042  maximalIndex = localIndex;
1043  }
1044  }
1045 
1046  if (maximalIndex != size_t(-1) && maximalDistance >= sqrDistance)
1047  {
1048  clusters_.push_back(Cluster(*this, data_[maximalIndex]));
1049 
1050  ocean_assert(iterations >= 1);
1051  applyOptimizationIteration(worker);
1052 
1053  for (size_t n = 1; n < iterations; ++n)
1054  {
1055  applyOptimizationIteration(worker);
1056  }
1057 
1058  return true;
1059  }
1060 
1061  return false;
1062 }
1063 
1064 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1066 {
1067  ocean_assert(!clusters_.empty());
1068 
1069  if (clusters_.size() <= 1)
1070  {
1071  clusters_.clear();
1072  }
1073  else
1074  {
1075  size_t minimalCluster = 0;
1076  TSquareDistance minimalDistance = NumericT<TSquareDistance>::maxValue();
1077 
1078  for (size_t c = 0; c < clusters_.size(); ++c)
1079  {
1080  const TSquareDistance localDistance = clusters_[c].maximalSqrDistance();
1081 
1082  if (localDistance < minimalDistance)
1083  {
1084  minimalDistance = localDistance;
1085  minimalCluster = c;
1086  }
1087  }
1088 
1089  Clusters tmpClusters(std::move(clusters_));
1090  ocean_assert(clusters_.empty());
1091 
1092  ocean_assert(tmpClusters.size() >= 1);
1093  clusters_.reserve(tmpClusters.size() - 1);
1094 
1095  for (size_t c = 0; c < tmpClusters.size(); ++c)
1096  {
1097  if (c != minimalCluster)
1098  {
1099  clusters_.push_back(std::move(tmpClusters[c]));
1100  }
1101  }
1102 
1103  ocean_assert(!clusters_.empty());
1104 
1105  ocean_assert(iterations >= 1);
1106  applyOptimizationIteration(worker);
1107 
1108  for (size_t n = 1; n < iterations; ++n)
1109  {
1110  applyOptimizationIteration(worker);
1111  }
1112  }
1113 }
1114 
1115 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1117 {
1118  TSquareDistance minimalDistance = NumericT<TSquareDistance>::maxValue();
1119  size_t minimalIndex = size_t(-1);
1120 
1121  for (size_t n = 0; n < clusters_.size(); ++n)
1122  {
1123  const TSquareDistance localDistance = clusters_[n].sqrDistance(observation);
1124 
1125  if (localDistance < minimalDistance)
1126  {
1127  minimalDistance = localDistance;
1128  minimalIndex = n;
1129  }
1130  }
1131 
1132  return minimalIndex;
1133 }
1134 
1135 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1137 {
1138  return clusters_;
1139 }
1140 
1141 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1143 {
1144  std::sort(clusters_.rbegin(), clusters_.rend());
1145 }
1146 
1147 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1149 {
1150  TSquareDistance maximalDistance = TSquareDistance(0);
1151 
1152  for (typename Clusters::const_iterator i = clusters_.begin(); i != clusters_.end(); ++i)
1153  {
1154  maximalDistance = max(maximalDistance, i->maximalSqrDistance());
1155  }
1156 
1157  return maximalDistance;
1158 }
1159 
1160 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1162 {
1163  ocean_assert(data_.numberObservations() != 0);
1164  ocean_assert(clusters_.empty());
1165 
1166  // find the smallest observation (euclidean distance to the origin)
1167  const DataIndex firstDataIndex = smallestObservation(data_);
1168  ocean_assert(firstDataIndex != DataIndex(-1));
1169 
1170  clusters_.push_back(Cluster(*this, data_[firstDataIndex]));
1171 
1172  while (clusters_.size() < numberClusters)
1173  {
1174  TSquareDistance largestDistance = TSquareDistance(0);
1175  size_t largestIndex = size_t(-1);
1176 
1177  for (size_t o = 0; o < data_.numberObservations(); ++o)
1178  {
1179  const Observation& observation = data_[o];
1180 
1181  TSquareDistance localDistance = NumericT<TSquareDistance>::maxValue();
1182 
1183  for (size_t c = 0; c < clusters_.size(); ++c)
1184  {
1185  localDistance = min(localDistance, clusters_[c].sqrDistance(observation));
1186  }
1187 
1188  if (localDistance > largestDistance)
1189  {
1190  largestDistance = localDistance;
1191  largestIndex = o;
1192  }
1193  }
1194 
1195  // check whether no observation is left
1196  if (largestIndex == size_t(-1))
1197  {
1198  break;
1199  }
1200 
1201  ocean_assert(largestDistance != TSquareDistance(0));
1202 
1203 #ifdef OCEAN_DEBUG
1204  for (size_t c = 0; c < clusters_.size(); ++c)
1205  {
1206  ocean_assert(clusters_[c].mean() != data_[largestIndex]);
1207  }
1208 #endif
1209 
1210  clusters_.push_back(Cluster(*this, data_[largestIndex]));
1211  }
1212 
1213  ocean_assert(!clusters_.empty());
1214 
1215  for (size_t c = 0; c < clusters_.size(); ++c)
1216  {
1217  ocean_assert(clusters_[c].dataIndices().empty());
1218  clusters_[c].dataIndices().reserve(data_.numberObservations() * 2 / clusters_.size());
1219  }
1220 }
1221 
1222 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1224 {
1225  ocean_assert(data_.numberObservations() != 0);
1226  ocean_assert(clusters_.empty());
1227 
1228  const bool random64 = data_.numberObservations() > NumericT<unsigned int>::maxValue();
1229 
1230  // find the smallest observation (euclidean distance to the origin)
1231  const size_t firstDataIndex = smallestObservation(data_);
1232  ocean_assert(firstDataIndex != size_t(-1));
1233 
1234  clusters_.push_back(Cluster(*this, data_[firstDataIndex]));
1235 
1236  size_t iterations = 0;
1237  while (clusters_.size() < numberClusters && iterations++ < numberClusters * 100)
1238  {
1239  TSquareDistance largestDistance = 0u;
1240  size_t largestIndex = size_t(-1);
1241 
1242  for (size_t n = 0; n < max<size_t>(1, data_.numberObservations() / 128); ++n)
1243  {
1244  const size_t index = random64 ? RandomI::random64() % data_.numberObservations() : RandomI::random32() % (unsigned int)data_.numberObservations();
1245  const Observation& candidate = data_[index];
1246 
1247  TSquareDistance smallestDistance = NumericT<TSquareDistance>::maxValue();
1248 
1249  for (size_t c = 0; c < clusters_.size(); ++c)
1250  {
1251  smallestDistance = min(smallestDistance, clusters_[c].sqrDistance(candidate));
1252  }
1253 
1254  if (smallestDistance > largestDistance)
1255  {
1256  largestDistance = smallestDistance;
1257  largestIndex = index;
1258  }
1259  }
1260 
1261  // check whether no observation is left
1262  if (largestIndex == size_t(-1))
1263  {
1264  break;
1265  }
1266 
1267  ocean_assert(largestDistance != TSquareDistance(0));
1268 
1269 #ifdef OCEAN_DEBUG
1270  for (size_t c = 0; c < clusters_.size(); ++c)
1271  {
1272  ocean_assert(clusters_[c].mean() != data_[largestIndex]);
1273  }
1274 #endif
1275 
1276  clusters_.push_back(Cluster(*this, data_[largestIndex]));
1277  }
1278 
1279  ocean_assert(!clusters_.empty());
1280 
1281  for (size_t c = 0; c < clusters_.size(); ++c)
1282  {
1283  ocean_assert(clusters_[c].dataIndices().empty());
1284  clusters_[c].dataIndices().reserve(data_.numberObservations() * 2 / clusters_.size());
1285  }
1286 }
1287 
1288 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1290 {
1291  ocean_assert(!clusters_.empty());
1292 
1293  // remove the old indices, we determine the new distribution
1294  for (size_t c = 0; c < clusters_.size(); ++c)
1295  {
1296  clusters_[c].dataIndices().clear();
1297  }
1298 
1299  // assign each observation to the best fitting cluster
1300  for (size_t o = 0; o < data_.numberObservations(); ++o)
1301  {
1302  const Observation& observation = data_[o];
1303 
1304  TSquareDistance bestDistance = NumericT<TSquareDistance>::maxValue();
1305  size_t bestCluster = size_t(-1);
1306 
1307  for (size_t c = 0; c < clusters_.size(); ++c)
1308  {
1309  const TSquareDistance localDistance = clusters_[c].sqrDistance(observation);
1310 
1311  if (localDistance < bestDistance)
1312  {
1313  bestDistance = localDistance;
1314  bestCluster = c;
1315  }
1316  }
1317 
1318  ocean_assert(bestCluster != size_t(-1));
1319 
1320  clusters_[bestCluster].dataIndices().push_back(o);
1321  }
1322 
1323  // update the mean values for each cluster
1324  for (size_t c = 0; c < clusters_.size(); ++c)
1325  {
1326  clusters_[c].updateMean();
1327  }
1328 }
1329 
1330 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1332 {
1333  if (worker == nullptr)
1334  {
1335  applyOptimizationIteration();
1336  }
1337  else
1338  {
1339  ocean_assert(!clusters_.empty());
1340 
1341  // remove the old indices, we determine the new distribution
1342  for (size_t c = 0; c < clusters_.size(); ++c)
1343  {
1344  clusters_[c].dataIndices().clear();
1345  }
1346 
1347  Lock lock;
1348  worker->executeFunction(Worker::Function::create(*this, &ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::applyOptimizationIterationSubset, &lock, 0u, 0u), 0u, (unsigned int)data_.numberObservations(), 1u, 2u);
1349 
1350  // update the mean values for each cluster
1351  for (size_t c = 0; c < clusters_.size(); ++c)
1352  {
1353  clusters_[c].updateMean();
1354  }
1355  }
1356 }
1357 
1358 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1359 void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::applyOptimizationIterationSubset(Lock* lock, const unsigned int firstObservation, const unsigned int numberObservations)
1360 {
1361  ocean_assert(!clusters_.empty());
1362 
1363  std::vector<DataIndices> localClusters(clusters_.size());
1364 
1365  // assign each observation to the best fitting cluster
1366  for (size_t o = firstObservation; o < firstObservation + numberObservations; ++o)
1367  {
1368  const Observation& observation = data_[o];
1369 
1370  TSquareDistance bestDistance = NumericT<TSquareDistance>::maxValue();
1371  size_t bestCluster = size_t(-1);
1372 
1373  for (size_t c = 0; c < clusters_.size(); ++c)
1374  {
1375  const TSquareDistance localDistance = clusters_[c].sqrDistance(observation);
1376 
1377  if (localDistance < bestDistance)
1378  {
1379  bestDistance = localDistance;
1380  bestCluster = c;
1381  }
1382  }
1383 
1384  ocean_assert(bestCluster != size_t(-1));
1385 
1386  localClusters[bestCluster].push_back(o);
1387  }
1388 
1389  const OptionalScopedLock scopedLock(lock);
1390 
1391  for (size_t c = 0; c < localClusters.size(); ++c)
1392  {
1393  clusters_[c].dataIndices().insert(clusters_[c].dataIndices().end(), localClusters[c].begin(), localClusters[c].end());
1394  }
1395 }
1396 
1397 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1399 {
1400  clusters_.clear();
1401 }
1402 
1403 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1405 {
1406  return data_;
1407 }
1408 
1409 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1411 {
1412  return data_;
1413 }
1414 
1415 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1417 {
1418  if (this != &clustering)
1419  {
1420  data_ = std::move(clustering.data_);
1421  clusters_ = std::move(clustering.clusters_);
1422  }
1423 
1424  return *this;
1425 }
1426 
1427 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1429 {
1430  ocean_assert(data);
1431 
1432  TSquareDistance smallestDistance = NumericT<TSquareDistance>::maxValue();
1433  DataIndex smallestIndex = size_t(-1);
1434 
1435  for (size_t o = 0; o < data.numberObservations(); ++o)
1436  {
1437  const TSquareDistance localDistance = sqrDistance(data[o]);
1438 
1439  if (localDistance < smallestDistance)
1440  {
1441  smallestDistance = localDistance;
1442  smallestIndex = o;
1443  }
1444  }
1445 
1446  return smallestIndex;
1447 }
1448 
1449 template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1451 {
1452  TSquareDistance result = TSquareDistance(0);
1453 
1454  for (size_t n = 0; n < tDimension; ++n)
1455  {
1456  result += observation[n] * observation[n];
1457  }
1458 
1459  return result;
1460 }
1461 
1462 
1463 }
1464 
1465 #endif // META_OCEAN_MATH_CLUSTERING_K_MEANS_H
This class implements the abstract data object which will be specialized for both data modes toggled ...
Definition: ClusteringKMeans.h:45
size_t DataIndex
Definition of an index that addresses one specific observation element in the data object that stores...
Definition: ClusteringKMeans.h:70
Data< T, tDimension > & operator=(Data< T, tDimension > &&data) noexcept
Move operator.
size_t numberObservations() const
Returns the number of observations that are stored by this data object.
size_t numberObservations_
The number of observation elements of this data object.
Definition: ClusteringKMeans.h:151
const Observation & operator[](const DataIndex &dataIndex) const
Returns one specific observation of this data object specified by the data-index of this observation.
std::vector< DataIndex > DataIndices
Definition of a vector holding indices to the data object.
Definition: ClusteringKMeans.h:75
Data(const Observation *observations, const size_t numberObservations, const bool copyObservations=false)
Creates a new data object by observations lying in a joined memory block as array.
const Observation ** observationPointers_
The observation pointers of this data object.
Definition: ClusteringKMeans.h:253
std::vector< const Observation * > copyObservationPointers_
The optional observation pointers that are stored as copy.
Definition: ClusteringKMeans.h:250
const Observation * observations_
The observation objects of this data object.
Definition: ClusteringKMeans.h:148
const Observation & observation(const DataIndex &dataIndex) const
Returns one specific observation of this data object specified by the data-index of this observation.
Data(Data< T, tDimension > &&data) noexcept
Move constructor for an data object.
std::vector< Observation > copyObservations_
The optional observations that are stored as copy.
Definition: ClusteringKMeans.h:145
bool isValidDataIndex(const DataIndex &dataIndex) const
Returns whether a given data-index is valid and has a corresponding observation stored in this data o...
Data()=default
Creates a new empty data object.
Data(const Observation **observationPointers, const size_t numberObservations, const bool copyPointers=false)
Creates a new data object by observations lying at individual memory positions.
StaticBuffer< T, tDimension > Observation
Definition of an observation object.
Definition: ClusteringKMeans.h:65
This class implements the base class for all classes providing clustering algorithms.
Definition: ClusteringKMeans.h:35
This class implements one cluster that holds the mean values of all observations belonging to this cl...
Definition: ClusteringKMeans.h:308
TSquareDistance maximalSqrDistance(DataIndex *observationIndex=nullptr) const
Calculates the maximal square distance between the mean observation value of this cluster and all obs...
Definition: ClusteringKMeans.h:826
TSquareDistance sqrDistance(const Observation &observation) const
Returns the square distance between a given observation and this cluster (the mean observation value ...
Definition: ClusteringKMeans.h:813
DataIndices dataIndices_
The data indices of all observation that belong to this cluster.
Definition: ClusteringKMeans.h:412
Observation mean_
The mean observation value of this cluster.
Definition: ClusteringKMeans.h:409
const ClusteringKMeans< T, tDimension, TSum, TSquareDistance, tUseIndices > * owner_
The owner of this cluster.
Definition: ClusteringKMeans.h:406
Cluster & operator=(Cluster &&cluster) noexcept
Move operator moving a cluster object to this object.
Definition: ClusteringKMeans.h:897
Cluster(Cluster &&cluster) noexcept
Move constructor for another cluster object.
Definition: ClusteringKMeans.h:778
const DataIndices & dataIndices() const
Returns the indices of the observations that belong to this cluster.
Definition: ClusteringKMeans.h:885
const Observation & mean() const
Returns the mean observation value of this cluster.
Definition: ClusteringKMeans.h:879
bool operator<(const Cluster &cluster) const
Returns whether the left cluster has less elements than the right cluster.
Definition: ClusteringKMeans.h:912
void updateMean()
Updates the mean observation value of this cluster by application of the stored indices of all observ...
Definition: ClusteringKMeans.h:918
TSquareDistance averageSqrDistance() const
Calculates the average square distance between the mean observation value of this cluster and all obs...
Definition: ClusteringKMeans.h:857
This class implements a k-means clustering algorithm.
Definition: ClusteringKMeans.h:270
void clear()
Clears all determined clusters but registered the data information is untouched.
Definition: ClusteringKMeans.h:1398
void applyOptimizationIteration(Worker *worker)
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition: ClusteringKMeans.h:1331
ClusteringKMeans(ClusteringKMeans &&clustering) noexcept
Move constructor.
Definition: ClusteringKMeans.h:958
std::vector< Cluster > Clusters
Definition of a vector holding cluster objects.
Definition: ClusteringKMeans.h:418
const Clusters & clusters() const
Returns the clusters of this k-means clustering object.
Definition: ClusteringKMeans.h:1136
Data data_
The data that stores the observations of this clustering object, either with index-access or pointer-...
Definition: ClusteringKMeans.h:593
void applyOptimizationIterationSubset(Lock *lock, const unsigned int firstObservation, const unsigned int numberObservations)
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition: ClusteringKMeans.h:1359
ClusteringKMeans< T, tDimension, TSum, TSquareDistance, tUseIndices > & operator=(ClusteringKMeans &&clustering)
Move operator.
Definition: ClusteringKMeans.h:1416
ClusteringKMeans()
Creates an empty k-means object.
Definition: ClusteringKMeans.h:952
Data::DataIndices DataIndices
(Re-)Definition of a vector holding (size_t) indices.
Definition: ClusteringKMeans.h:297
ClusteringKMeans(Data &&data)
Creates a new k-means object by a given data object.
Definition: ClusteringKMeans.h:975
Data::DataIndex DataIndex
(Re-)Definition of an index that addresses one specific observation element in the data object that s...
Definition: ClusteringKMeans.h:292
static DataIndex smallestObservation(const Data &data)
Determines the smallest observation (euclidean distance to origin) from a set of observations.
Definition: ClusteringKMeans.h:1428
static TSquareDistance sqrDistance(const Observation &observation)
Returns the square distance between an observation and the origin.
Definition: ClusteringKMeans.h:1450
void determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters=0, const size_t iterations=5, Worker *worker=nullptr)
Determines the clusters for this object, ensure that this object has been initialized with a valid se...
Definition: ClusteringKMeans.h:1008
void determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy=IS_LARGEST_DISTANCE, const size_t iterations=5, Worker *worker=nullptr)
Determines the clusters for this object, ensure that this object has been initialized with a valid se...
Definition: ClusteringKMeans.h:984
Data::Observation Observation
(Re-)Definition of an observation object.
Definition: ClusteringKMeans.h:302
bool addCluster(const size_t iterations=5, TSquareDistance sqrDistance=TSquareDistance(0), Worker *worker=nullptr)
Adds a new clusters for this object.
Definition: ClusteringKMeans.h:1029
bool isValid() const
Returns whether this object holds a valid set of observations.
Definition: ClusteringKMeans.h:1404
void determineInitialClustersRandom(const size_t numberClusters)
Determines the initial clusters for this object with the IS_RANDOM strategy.
Definition: ClusteringKMeans.h:1223
void removeCluster(const size_t iterations=5, Worker *worker=nullptr)
Removes one cluster from this object.
Definition: ClusteringKMeans.h:1065
Clusters clusters_
The current clusters of this object.
Definition: ClusteringKMeans.h:596
InitializationStrategy
Definition of individual initialization strategies.
Definition: ClusteringKMeans.h:277
@ IS_LARGEST_DISTANCE
The first cluster is determined by selection of the (euclidean) smallest observation,...
Definition: ClusteringKMeans.h:279
@ IS_RANDOM
All clusters are selected randomly.
Definition: ClusteringKMeans.h:281
Clustering< tUseIndices >::template Data< T, tDimension > Data
(Re-)Definition of a data object providing the data which will be clustered.
Definition: ClusteringKMeans.h:287
size_t findCluster(const Observation &observation)
Finds a best matching cluster for a given independent observation.
Definition: ClusteringKMeans.h:1116
void determineInitialClustersLargestDistance(const size_t numberClusters)
Determines the initial clusters for this object with the IS_LARGEST_DISTANCE strategy.
Definition: ClusteringKMeans.h:1161
ClusteringKMeans(const Data &data)
Creates a new k-means object by a given data object.
Definition: ClusteringKMeans.h:966
void sortClusters()
Sorts the clusters regarding their number of elements.
Definition: ClusteringKMeans.h:1142
TSquareDistance maximalSqrDistance() const
Calculates the maximal square distance between the mean observation value of each clusters and all ob...
Definition: ClusteringKMeans.h:1148
void applyOptimizationIteration()
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition: ClusteringKMeans.h:1289
This class implements a recursive lock object.
Definition: Lock.h:31
This class provides basic numeric functionalities.
Definition: Numeric.h:57
This class implements an optional recursive scoped lock object locking the lock object only if it's d...
Definition: Lock.h:325
This class implements a static buffer that has a fixed capacity.
Definition: StaticBuffer.h:24
This class implements a worker able to distribute function calls over different threads.
Definition: Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition: base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition: Accessor.h:15