Ocean
Loading...
Searching...
No Matches
ClusteringKMeans.h
Go to the documentation of this file.
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8#ifndef META_OCEAN_MATH_CLUSTERING_K_MEANS_H
9#define META_OCEAN_MATH_CLUSTERING_K_MEANS_H
10
11#include "ocean/math/Math.h"
12
13#include "ocean/base/RandomI.h"
16#include "ocean/base/Worker.h"
17
18#include <climits>
19
20namespace Ocean
21{
22
23/**
24 * This class implements the base class for all classes providing clustering algorithms.
25 * The data (the observations) that will be distributed into individual clusters can be provided with two different modes.<br>
26 * The first mode clusters the data elements by their indices, thus the data elements must be provided as joined data block (an array of elements).<br>
27 * Use the first mode by setting tUseIndices to 'True'.<br>
28 * The second mode clusters the data elements by their pointers, thus the data elements may lie at arbitrary positions in the memory.<br>
29 * Use the second mode by setting tUseIndices to 'False'.
30 * @tparam tUseIndices True,
31 * @ingroup math
32 */
33template <bool tUseIndices>
35{
36 public:
37
38 /**
39 * This class implements the abstract data object which will be specialized for both data modes toggled by tUseIndices.
40 * @tparam T The data type of each element of an observation
41 * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
42 */
43 template <typename T, size_t tDimension>
44 class Data
45 {
46 // nothing to do here
47 };
48};
49
50/**
51 * Specialization for Clustering<true>::Data<T, tDimension>.
52 * This data class implements the first data mode of the clustering class identifying observations by their indices.<br>
53 * @tparam T The data type of each element of an observation
54 * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
55 */
56template <>
57template <typename T, size_t tDimension>
58class Clustering<true>::Data
59{
60 public:
61
62 /**
63 * Definition of an observation object.
64 */
66
67 /**
68 * Definition of an index that addresses one specific observation element in the data object that stores all observations.
69 */
70 typedef size_t DataIndex;
71
72 /**
73 * Definition of a vector holding indices to the data object.
74 */
75 typedef std::vector<DataIndex> DataIndices;
76
77 public:
78
79 /**
80 * Creates a new empty data object.
81 */
82 Data() = default;
83
84 /**
85 * Creates a new data object by observations lying in a joined memory block as array.
86 * Due to performance issues: The given observations can be copied or used directly without any memory copy.<br>
87 * Beware: If no copy of the observations is created, the given observations must exist as long as this data object (or the corresponding clustering object) exists.
88 * @param observations The first observation in the given joined memory block
89 * @param numberObservations The number of observations that are provided, with range [1, infinity)
90 * @param copyObservations True, to copy the given observations; False, to simply use the given observations as reference
91 */
92 Data(const Observation* observations, const size_t numberObservations, const bool copyObservations = false);
93
94 /**
95 * Move constructor for an data object.
96 * @param data The data object to be moved
97 */
98 inline Data(Data<T, tDimension>&& data) noexcept;
99
100 /**
101 * Returns one specific observation of this data object specified by the data-index of this observation.
102 * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
103 * @return The specified observation
104 * @see isValidDataIndex().
105 */
106 inline const Observation& observation(const DataIndex& dataIndex) const;
107
108 /**
109 * Returns the number of observations that are stored by this data object.
110 * @return Number of observations
111 */
112 inline size_t numberObservations() const;
113
114 /**
115 * Returns whether a given data-index is valid and has a corresponding observation stored in this data object.
116 * @param dataIndex The data-index that will be checked
117 * @return True, if so
118 */
119 inline bool isValidDataIndex(const DataIndex& dataIndex) const;
120
121 /**
122 * Returns one specific observation of this data object specified by the data-index of this observation.
123 * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
124 * @return The specified observation
125 * @see isValidDataIndex(), observation()..
126 */
127 inline const Observation& operator[](const DataIndex& dataIndex) const;
128
129 /**
130 * Move operator.
131 * @param data The data object that will be moved to this object.
132 * @return Reference to this object.
133 */
135
136 /**
137 * Returns whether this data object holds at least one observation.
138 * @return True, if so
139 */
140 explicit inline operator bool() const;
141
142 protected:
143
144 /// The optional observations that are stored as copy.
145 std::vector<Observation> copyObservations_;
146
147 /// The observation objects of this data object.
148 const Observation* observations_ = nullptr;
149
150 /// The number of observation elements of this data object.
151 size_t numberObservations_ = 0;
152};
153
154/**
155 * Specialization for Clustering<true>::Data<T, tDimension>.
156 * This data class implements the second data mode of the clustering class identifying observations by their pointers.<br>
157 * @tparam T The data type of each element of an observation
158 * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
159 */
160template <>
161template <typename T, size_t tDimension>
162class Clustering<false>::Data
163{
164 public:
165
166 /**
167 * Definition of an observation object.
168 */
170
171 /**
172 * Definition of an index that addresses one specific observation element in the data object that stores all observations.
173 */
174 typedef size_t DataIndex;
175
176 /**
177 * Definition of a vector holding indices to the data object.
178 */
179 typedef std::vector<DataIndex> DataIndices;
180
181 public:
182
183 /**
184 * Creates a new empty data object.
185 */
186 Data() = default;
187
188 /**
189 * Creates a new data object by observations lying at individual memory positions.
190 * Due to performance issues: The given observation !pointers! can be copied or used directly without any copy.<br>
191 * Beware: If no copy of the observation pointers is created, the given observation pointers must exist as long as this data object (or the corresponding clustering object) exists.<br>
192 * Beware: In any case, the observations (not the pointers) must exist as long as this data object (or the corresponding clustering object) exists.<br>
193 * @param observationPointers The first pointer of the observations
194 * @param numberObservations The number of observations/pointers that are provided, with range [1, infinity)
195 * @param copyPointers True, to copy the given observation pointers (not the observations); False, to simply use the given observation pointers as reference
196 */
197 Data(const Observation** observationPointers, const size_t numberObservations, const bool copyPointers = false);
198
199 /**
200 * Move constructor for an data object.
201 * @param data The data object to be moved
202 */
203 inline Data(Data<T, tDimension>&& data) noexcept;
204
205 /**
206 * Returns one specific observation of this data object specified by the data-index of this observation.
207 * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
208 * @return The specified observation
209 * @see isValidDataIndex().
210 */
211 inline const Observation& observation(const DataIndex& dataIndex) const;
212
213 /**
214 * Returns the number of observations that are stored by this data object.
215 * @return Number of observations
216 */
217 inline size_t numberObservations() const;
218
219 /**
220 * Returns whether a given data-index is valid and has a corresponding observation stored in this data object.
221 * @param dataIndex The data-index that will be checked
222 * @return True, if so
223 */
224 inline bool isValidDataIndex(const DataIndex& dataIndex) const;
225
226 /**
227 * Returns one specific observation of this data object specified by the data-index of this observation.
228 * @param dataIndex The data-index of the observation that will be returned, ensure that the given data-index is valid
229 * @return The specified observation
230 * @see isValidDataIndex(), observation()..
231 */
232 inline const Observation& operator[](const DataIndex& dataIndex) const;
233
234 /**
235 * Move operator.
236 * @param data The data object that will be moved to this object.
237 * @return Reference to this object.
238 */
240
241 /**
242 * Returns whether this data object holds at least one observation.
243 * @return True, if so
244 */
245 explicit inline operator bool() const;
246
247 protected:
248
249 /// The optional observation pointers that are stored as copy.
250 std::vector<const Observation*> copyObservationPointers_;
251
252 /// The observation pointers of this data object.
253 const Observation** observationPointers_ = nullptr;
254
255 /// The number of observation elements of this data object.
256 size_t numberObservations_;
257};
258
259/**
260 * This class implements a k-means clustering algorithm.
261 * Beware: Due to performance issues, this class does not copy the given observation values, this expects that the given observation values exist as long as the KMean object exists.<br>
262 * @tparam T The data type of each element of an observation
263 * @tparam tDimension The dimension of each observation (the number of elements in each observation), with range [1, infinity)
264 * @tparam TSum The data type of the intermediate sum values, that is necessary to determine e.g. the mean parameters
265 * @tparam TSquareDistance The data type of the square distance value, might be different from T
266 * @ingroup math
267 */
268template <typename T, size_t tDimension, typename TSum = T, typename TSquareDistance = T, bool tUseIndices = true>
269class ClusteringKMeans : public Clustering<tUseIndices>
270{
271 public:
272
273 /**
274 * Definition of individual initialization strategies.
275 */
277 {
278 /// The first cluster is determined by selection of the (euclidean) smallest observation, the remaining clusters are defined by observations with largest distance to the already existing clusters.
280 /// All clusters are selected randomly.
282 };
283
284 /**
285 * (Re-)Definition of a data object providing the data which will be clustered.
286 */
288
289 /**
290 * (Re-)Definition of an index that addresses one specific observation element in the data object that stores all observations.
291 */
292 typedef typename Data::DataIndex DataIndex;
293
294 /**
295 * (Re-)Definition of a vector holding (size_t) indices.
296 */
297 typedef typename Data::DataIndices DataIndices;
298
299 /**
300 * (Re-)Definition of an observation object.
301 */
302 typedef typename Data::Observation Observation;
303
304 /**
305 * This class implements one cluster that holds the mean values of all observations belonging to this cluster and the indices of all observations belonging to this cluster.
306 */
308 {
309 // Two friend class declaration, necessary as the constructors of this class are all protected.
310 friend class ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>;
311 friend class std::allocator<Cluster>;
312
313 public:
314
315 /**
316 * Move constructor for another cluster object.
317 * @param cluster The cluster object to be moved
318 */
319 inline Cluster(Cluster&& cluster) noexcept;
320
321 /**
322 * Returns the mean observation value of this cluster.
323 * @return The cluster's mean observation value
324 */
325 inline const Observation& mean() const;
326
327 /**
328 * Returns the indices of the observations that belong to this cluster.
329 * @return The cluster's observation indices
330 */
331 inline const DataIndices& dataIndices() const;
332
333 /**
334 * Returns the square distance between a given observation and this cluster (the mean observation value of this cluster).
335 * @param observation The observation for that the distance is returned
336 * @return Resulting square distance
337 */
338 inline TSquareDistance sqrDistance(const Observation& observation) const;
339
340 /**
341 * Calculates the maximal square distance between the mean observation value of this cluster and all observations which belong to this cluster.
342 * @param observationIndex Optional resulting index of the observation with maximal distance to the mean observation value, if defined
343 * @return Maximal square distance
344 */
345 TSquareDistance maximalSqrDistance(DataIndex* observationIndex = nullptr) const;
346
347 /**
348 * Calculates the average square distance between the mean observation value of this cluster and all observations which belong to this cluster.
349 * @return Average square distance
350 */
351 TSquareDistance averageSqrDistance() const;
352
353 /**
354 * Move operator moving a cluster object to this object.
355 * @param cluster The cluster object to be moved
356 * @return Reference to this object
357 */
358 inline Cluster& operator=(Cluster&& cluster) noexcept;
359
360 /**
361 * Returns whether the left cluster has less elements than the right cluster.
362 * @param cluster The right cluster to compare
363 * @return True, if so
364 */
365 inline bool operator<(const Cluster& cluster) const;
366
367 protected:
368
369 /**
370 * Creates a new cluster object by a given mean value.
371 * @param owner The owner of this cluster object
372 * @param mean The mean observation value that will define this cluster
373 */
375
376 /**
377 * Creates a new cluster object by a given mean value and several indices of observations that belong to this cluster.
378 * @param owner The owner of this cluster object
379 * @param mean The mean observation value that will define this cluster
380 * @param dataIndices The data-indices of the observations that belong to this cluster
381 */
383
384 /**
385 * Creates a new cluster object by a given mean value and several indices of observations that belong to this cluster.
386 * @param owner The owner of this cluster object
387 * @param mean The mean observation value that will define this cluster
388 * @param dataIndices The data-indices of the observations that belong to this cluster; beware: the indices will be moved to this cluster object
389 */
391
392 /**
393 * Returns the data-indices of the observations that belong to this cluster.
394 * @return The cluster's observation indices
395 */
397
398 /**
399 * Updates the mean observation value of this cluster by application of the stored indices of all observations that belong to this cluster.
400 */
402
403 protected:
404
405 /// The owner of this cluster.
407
408 /// The mean observation value of this cluster.
410
411 /// The data indices of all observation that belong to this cluster.
413 };
414
415 /**
416 * Definition of a vector holding cluster objects.
417 */
418 typedef std::vector<Cluster> Clusters;
419
420 public:
421
422 /**
423 * Creates an empty k-means object.
424 */
426
427 /**
428 * Move constructor.
429 * @param clustering The clustering object to be moved
430 */
431 inline ClusteringKMeans(ClusteringKMeans&& clustering) noexcept;
432
433 /**
434 * Creates a new k-means object by a given data object.
435 * @param data The data object to be used to determine the clusters.
436 * @see determineClusters().
437 */
438 inline explicit ClusteringKMeans(const Data &data);
439
440 /**
441 * Creates a new k-means object by a given data object.
442 * @param data The data object that will be moved and used to determine the clusters.
443 * @see determineClusters().
444 */
445 inline explicit ClusteringKMeans(Data &&data);
446
447 /**
448 * Returns the clusters of this k-means clustering object.
449 * @return The determined k-means clusters
450 */
451 inline const Clusters& clusters() const;
452
453 /**
454 * Sorts the clusters regarding their number of elements.
455 */
457
458 /**
459 * Calculates the maximal square distance between the mean observation value of each clusters and all observations belonging to the cluster.
460 * @return Maximal square distance for all clusters
461 */
462 TSquareDistance maximalSqrDistance() const;
463
464 /**
465 * Determines the clusters for this object, ensure that this object has been initialized with a valid set of observations.
466 * @param numberClusters The number of clusters that will be created, with range [1, numberObservations())
467 * @param strategy The initialization strategy for the first clusters
468 * @param iterations The number of optimization iterations that are applied after the initial clusters have been determined, with range [1, infinity)
469 * @param worker Optional worker object to distribute the computation
470 * @see clusters().
471 */
472 void determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy = IS_LARGEST_DISTANCE, const size_t iterations = 5, Worker* worker = nullptr);
473
474 /**
475 * Determines the clusters for this object, ensure that this object has been initialized with a valid set of observations.
476 * This function adds new clusters within several iterations until the defined maximalSqrDistance is larger than the distance within all clusters or until the defined maximal number of clusters is reached.<br>
477 * @param maximalSqrDistance The maximal square distance in the final clusters between the clusters' mean observation values and the observations in the clusters
478 * @param maximalClusters The maximal number of clusters that will be created (even if maximalSqrDistance is not reached), with range [0, infinity), define 0 to ignore this parameter
479 * @param iterations The number of optimization iterations that are applied after each time a new cluster is added [1, infinity)
480 * @param worker Optional worker object to distribute the computation
481 */
482 void determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters = 0, const size_t iterations = 5, Worker* worker = nullptr);
483
484 /**
485 * Adds a new clusters for this object.
486 * @param iterations The number of optimization iterations that are applied after the new cluster has been added, with range [1, infinity)
487 * @param sqrDistance The minimal square distance between the cluster's mean and an observation of this cluster so that this cluster is divided into two clusters
488 * @param worker Optional worker object to distribute the computation
489 * @return True, if a new cluster have been added, False if no further cluster could be added or if the provided distance was too large
490 */
491 bool addCluster(const size_t iterations = 5, TSquareDistance sqrDistance = TSquareDistance(0), Worker* worker = nullptr);
492
493 /**
494 * Removes one cluster from this object.
495 * The cluster with smallest maximal distance of all observations to the mean observation value of the clusters is removed.
496 * @param iterations The number of optimization iterations that are applied after the cluster has been removed, with range [1, infinity)
497 * @param worker Optional worker object to distribute the computation
498 */
499 void removeCluster(const size_t iterations = 5, Worker* worker = nullptr);
500
501 /**
502 * Finds a best matching cluster for a given independent observation.
503 * However, the observation is not added to this cluster, it's simply a lookup for the best matching cluster.
504 * @param observation The observation for that the best matching cluster is determined
505 * @return The index of the best matching cluster, -1 if no cluster could be found
506 * @see clusters().
507 */
508 size_t findCluster(const Observation& observation);
509
510 /**
511 * Explicitly applies one further optimization iteration for an existing set of clusters.
512 * Do not call this function before initial clusters have been found.
513 * @see clusters(), determineCluster().
514 */
516
517 /**
518 * Explicitly applies one further optimization iteration for an existing set of clusters.
519 * Do not call this function before initial clusters have been found.
520 * @param worker The worker object to distribute the computation
521 * @see clusters(), determineCluster().
522 */
524
525 /**
526 * Clears all determined clusters but registered the data information is untouched.
527 */
528 void clear();
529
530 /**
531 * Returns whether this object holds a valid set of observations.
532 * @return True, if so
533 */
534 inline bool isValid() const;
535
536 /**
537 * Returns whether this object holds a valid set of observations.
538 * @return True, if so
539 */
540 explicit inline operator bool() const;
541
542 /**
543 * Move operator.
544 * @param clustering The clustering object to be moved
545 * @return Reference to this object
546 */
548
549 protected:
550
551 /**
552 * Determines the initial clusters for this object with the IS_LARGEST_DISTANCE strategy.
553 * First the smallest observation object is selected as first cluster,<br>
554 * all following clusters are determined by observations that have the largest distance to the already existing clusters.<br>
555 * @param numberClusters The number of initial clusters that will be created.
556 */
557 void determineInitialClustersLargestDistance(const size_t numberClusters);
558
559 /**
560 * Determines the initial clusters for this object with the IS_RANDOM strategy.
561 * All clusters are created randomly.br>
562 * @param numberClusters The number of initial clusters that will be created.
563 */
564 void determineInitialClustersRandom(const size_t numberClusters);
565
566 /**
567 * Explicitly applies one further optimization iteration for an existing set of clusters.
568 * This functions operates on a subset of all observations.<br>
569 * @param lock Optional lock object if this function is executed on multiple threads in parallel
570 * @param firstObservation The first observation that will be handled
571 * @param numberObservations The number of observations that will be handled
572 * @see clusters(), determineCluster().
573 */
574 void applyOptimizationIterationSubset(Lock* lock, const unsigned int firstObservation, const unsigned int numberObservations);
575
576 /**
577 * Determines the smallest observation (euclidean distance to origin) from a set of observations.
578 * @param data The observation data in which the smallest observation is determined, must be valid
579 * @return The index of the smallest observation
580 */
581 static inline DataIndex smallestObservation(const Data& data);
582
583 /**
584 * Returns the square distance between an observation and the origin.
585 * @param observation The observation for that the square distance is determined
586 * @return Resulting square distance
587 */
588 static inline TSquareDistance sqrDistance(const Observation& observation);
589
590 protected:
591
592 /// The data that stores the observations of this clustering object, either with index-access or pointer-access.
594
595 /// The current clusters of this object.
597};
598
599template <>
600template <typename T, size_t tDimension>
601Clustering<true>::Data<T, tDimension>::Data(const Observation* observations, const size_t numberObservations, const bool copyObservations) :
602 copyObservations_(copyObservations ? numberObservations : 0),
603 observations_(nullptr),
604 numberObservations_(0)
605{
606 ocean_assert(observations && numberObservations > 0);
607
608 if (copyObservations)
609 {
610 ocean_assert(copyObservations_.size() == numberObservations);
611 memcpy(copyObservations_.data(), observations, sizeof(Observation) * numberObservations);
612
615 }
616 else
617 {
618 observations_ = observations;
620 }
621}
622
623template <>
624template <typename T, size_t tDimension>
626 copyObservations_(std::move(data.copyObservations_)),
627 observations_(data.observations_),
628 numberObservations_(data.numberObservations_)
629{
630 data.observations_ = nullptr;
631 data.numberObservations_ = 0;
632}
633
634template <>
635template <typename T, size_t tDimension>
636inline const typename Clustering<true>::Data<T, tDimension>::Observation& Clustering<true>::Data<T, tDimension>::observation(const DataIndex& dataIndex) const
637{
638 ocean_assert(isValidDataIndex(dataIndex));
639 return observations_[dataIndex];
640}
641
642template <>
643template <typename T, size_t tDimension>
645{
646 return numberObservations_;
647}
648
649template <>
650template <typename T, size_t tDimension>
652{
653 return dataIndex < numberObservations_;
654}
655
656template <>
657template <typename T, size_t tDimension>
658inline const typename Clustering<true>::Data<T, tDimension>::Observation& Clustering<true>::Data<T, tDimension>::operator[](const DataIndex& dataIndex) const
659{
660 ocean_assert(isValidDataIndex(dataIndex));
661 return observations_[dataIndex];
662}
663
664template <>
665template <typename T, size_t tDimension>
667{
668 if (this != &data)
669 {
670 copyObservations_ = std::move(data.copyObservations_);
671 observations_ = data.observations_;
672 numberObservations_ = data.numberObservations_;
673
674 data.observations_ = nullptr;
675 data.numberObservations_ = 0;
676 }
677
678 return *this;
679}
680
681template <>
682template <typename T, size_t tDimension>
683inline Clustering<true>::Data<T, tDimension>::operator bool() const
684{
685 return observations_ != nullptr;
686}
687
688template <>
689template <typename T, size_t tDimension>
690Clustering<false>::Data<T, tDimension>::Data(const Observation** observationPointers, const size_t numberObservations, const bool copyPointers) :
691 copyObservationPointers_(copyPointers ? numberObservations : 0),
692 observationPointers_(nullptr),
693 numberObservations_(0)
694{
695 ocean_assert(observationPointers && numberObservations > 0);
696
697 if (copyPointers)
698 {
699 ocean_assert(copyObservationPointers_.size() == numberObservations);
700 memcpy(copyObservationPointers_.data(), observationPointers, sizeof(Observation*) * numberObservations);
701
704 }
705 else
706 {
707 observationPointers_ = observationPointers;
709 }
710}
711
712template <>
713template <typename T, size_t tDimension>
715 copyObservationPointers_(std::move(data.copyObservationPointers_)),
716 observationPointers_(data.observationPointers_),
717 numberObservations_(data.numberObservations_)
718{
719 data.observationPointers_ = nullptr;
720 data.numberObservations_ = 0;
721}
722
723template <>
724template <typename T, size_t tDimension>
725inline const typename Clustering<false>::Data<T, tDimension>::Observation& Clustering<false>::Data<T, tDimension>::observation(const DataIndex& dataIndex) const
726{
727 ocean_assert(isValidDataIndex(dataIndex));
728 return *(observationPointers_[dataIndex]);
729}
730
731template <>
732template <typename T, size_t tDimension>
734{
735 return numberObservations_;
736}
737
738template <>
739template <typename T, size_t tDimension>
741{
742 return dataIndex < numberObservations_;
743}
744
745template <>
746template <typename T, size_t tDimension>
747inline const typename Clustering<false>::Data<T, tDimension>::Observation& Clustering<false>::Data<T, tDimension>::operator[](const DataIndex& dataIndex) const
748{
749 ocean_assert(isValidDataIndex(dataIndex));
750 return *(observationPointers_[dataIndex]);
751}
752
753template <>
754template <typename T, size_t tDimension>
756{
757 if (this != &data)
758 {
759 copyObservationPointers_ = std::move(data.copyObservationPointers_);
760 observationPointers_ = data.observationPointers_;
761 numberObservations_ = data.numberObservations_;
762
763 data.observationPointers_ = nullptr;
764 data.numberObservations_ = 0;
765 }
766
767 return *this;
768}
769
770template <>
771template <typename T, size_t tDimension>
772inline Clustering<false>::Data<T, tDimension>::operator bool() const
773{
774 return observationPointers_ != nullptr;
775}
776
777template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
779 owner_(cluster.owner_),
780 mean_(std::move(cluster.mean_)),
781 dataIndices_(std::move(cluster.dataIndices_))
782{
783 cluster.owner_ = nullptr;
784}
785
786template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
793
794template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
796 owner_(&owner),
797 mean_(mean),
798 dataIndices_(dataIndices)
799{
800 static_assert(tDimension != 0, "Invalid observation dimension!");
801}
802
803template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
805 owner_(&owner),
806 mean_(mean),
807 dataIndices_(dataIndices)
808{
809 static_assert(tDimension != 0, "Invalid observation dimension!");
810}
811
812template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
814{
815 TSquareDistance result = TSquareDistance(0);
816
817 for (size_t n = 0; n < tDimension; ++n)
818 {
819 result += (mean_[n] - observation[n]) * (mean_[n] - observation[n]);
820 }
821
822 return result;
823}
824
825template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
827{
828 ocean_assert(owner_);
829
830 if (dataIndices_.empty())
831 {
832 return TSquareDistance(0);
833 }
834
835 const Data& data = owner_->data_;
836 TSquareDistance result = TSquareDistance(0);
837
838 for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
839 {
840 const TSquareDistance localDistance = sqrDistance(data[*i]);
841
842 if (localDistance > result)
843 {
844 result = localDistance;
845
846 if (observationIndex)
847 {
848 *observationIndex = *i;
849 }
850 }
851 }
852
853 return result;
854}
855
856template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
858{
859 ocean_assert(owner_);
860
861 if (dataIndices_.empty())
862 {
863 return TSquareDistance(0);
864 }
865
866 const Data& data = owner_->data_;
867 TSquareDistance result = TSquareDistance(0);
868
869 for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
870 {
871 result += sqrDistance(data[*i]);
872 }
873
874 ocean_assert(dataIndices_.size() != 0);
875 return result / TSquareDistance(dataIndices_.size());
876}
877
878template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
883
884template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
889
890template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
895
896template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
898{
899 if (this != &cluster)
900 {
901 mean_ = std::move(cluster.mean_);
902 dataIndices_ = std::move(cluster.dataIndices_);
903 owner_ = cluster.owner_;
904
905 cluster.owner_ = nullptr;
906 }
907
908 return *this;
909}
910
911template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
913{
914 return dataIndices_.size() < cluster.dataIndices_.size();
915}
916
917template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
919{
920 if (dataIndices_.empty())
921 {
922 for (size_t d = 0; d < tDimension; ++d)
923 mean_[d] = T();
924
925 return;
926 }
927
928 StaticBuffer<TSum, tDimension> sumObservation(tDimension, TSum());
929
930 ocean_assert(owner_);
931 const Data& data = owner_->data_;
932
933 for (typename DataIndices::const_iterator i = dataIndices_.begin(); i != dataIndices_.end(); ++i)
934 {
935 ocean_assert(*i < data.numberObservations());
936
937 for (size_t d = 0; d < tDimension; ++d)
938 {
939 sumObservation[d] += data[*i][d];
940 }
941 }
942
943 const TSum count = TSum(dataIndices_.size());
944
945 for (size_t d = 0; d < tDimension; ++d)
946 {
947 mean_[d] = T(sumObservation[d] / count);
948 }
949}
950
951template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
953{
954 static_assert(tDimension != 0, "Invalid observation dimension!");
955}
956
957template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
959 data_(std::move(clustering.data_)),
960 clusters_(std::move(clustering.clusters_))
961{
962 static_assert(tDimension != 0, "Invalid observation dimension!");
963}
964
965template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
967 data_(data)
968{
969 static_assert(tDimension != 0, "Invalid observation dimension!");
970
971 ocean_assert(data_ && "The data element is invalid!");
972}
973
974template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
976 data_(std::move(data))
977{
978 static_assert(tDimension != 0, "Invalid observation dimension!");
979
980 ocean_assert(data_ && "The data element is invalid!");
981}
982
983template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
984void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy, const size_t iterations, Worker* worker)
985{
986 ocean_assert(clusters_.empty());
987
988 if (strategy == IS_LARGEST_DISTANCE)
989 {
990 determineInitialClustersLargestDistance(numberClusters);
991 }
992 else
993 {
994 ocean_assert(strategy == IS_RANDOM);
995 determineInitialClustersRandom(numberClusters);
996 }
997
998 ocean_assert(iterations >= 1);
999 applyOptimizationIteration(worker);
1000
1001 for (size_t i = 0; i < iterations; i++)
1002 {
1003 applyOptimizationIteration(worker);
1004 }
1005}
1006
1007template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1008void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters, size_t iterations, Worker* worker)
1009{
1010 ocean_assert(data_);
1011 ocean_assert(clusters_.empty());
1012
1013 // find the smallest observation (euclidean distance to the origin)
1014 const size_t firstDataIndex = smallestObservation(data_);
1015 ocean_assert(firstDataIndex != size_t(-1));
1016
1017 clusters_.push_back(Cluster(*this, data_[firstDataIndex], std::move(createIndices<size_t>(data_.numberObservations(), 0))));
1018
1019 while (maximalClusters == 0 || clusters_.size() < maximalClusters)
1020 {
1021 if (!addCluster(iterations, maximalSqrDistance, worker))
1022 {
1023 break;
1024 }
1025 }
1026}
1027
1028template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1030{
1031 size_t maximalIndex = size_t(-1);
1032 TSquareDistance maximalDistance = TSquareDistance(0);
1033
1034 for (size_t c = 0; c < clusters_.size(); ++c)
1035 {
1036 size_t localIndex = size_t(-1);
1037 const TSquareDistance localDistance = clusters_[c].maximalSqrDistance(&localIndex);
1038
1039 if (localDistance > maximalDistance)
1040 {
1041 maximalDistance = localDistance;
1042 maximalIndex = localIndex;
1043 }
1044 }
1045
1046 if (maximalIndex != size_t(-1) && maximalDistance >= sqrDistance)
1047 {
1048 clusters_.push_back(Cluster(*this, data_[maximalIndex]));
1049
1050 ocean_assert(iterations >= 1);
1051 applyOptimizationIteration(worker);
1052
1053 for (size_t n = 1; n < iterations; ++n)
1054 {
1055 applyOptimizationIteration(worker);
1056 }
1057
1058 return true;
1059 }
1060
1061 return false;
1062}
1063
1064template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1066{
1067 ocean_assert(!clusters_.empty());
1068
1069 if (clusters_.size() <= 1)
1070 {
1071 clusters_.clear();
1072 }
1073 else
1074 {
1075 size_t minimalCluster = 0;
1076 TSquareDistance minimalDistance = NumericT<TSquareDistance>::maxValue();
1077
1078 for (size_t c = 0; c < clusters_.size(); ++c)
1079 {
1080 const TSquareDistance localDistance = clusters_[c].maximalSqrDistance();
1081
1082 if (localDistance < minimalDistance)
1083 {
1084 minimalDistance = localDistance;
1085 minimalCluster = c;
1086 }
1087 }
1088
1089 Clusters tmpClusters(std::move(clusters_));
1090 ocean_assert(clusters_.empty());
1091
1092 ocean_assert(tmpClusters.size() >= 1);
1093 clusters_.reserve(tmpClusters.size() - 1);
1094
1095 for (size_t c = 0; c < tmpClusters.size(); ++c)
1096 {
1097 if (c != minimalCluster)
1098 {
1099 clusters_.push_back(std::move(tmpClusters[c]));
1100 }
1101 }
1102
1103 ocean_assert(!clusters_.empty());
1104
1105 ocean_assert(iterations >= 1);
1106 applyOptimizationIteration(worker);
1107
1108 for (size_t n = 1; n < iterations; ++n)
1109 {
1110 applyOptimizationIteration(worker);
1111 }
1112 }
1113}
1114
1115template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1117{
1118 TSquareDistance minimalDistance = NumericT<TSquareDistance>::maxValue();
1119 size_t minimalIndex = size_t(-1);
1120
1121 for (size_t n = 0; n < clusters_.size(); ++n)
1122 {
1123 const TSquareDistance localDistance = clusters_[n].sqrDistance(observation);
1124
1125 if (localDistance < minimalDistance)
1126 {
1127 minimalDistance = localDistance;
1128 minimalIndex = n;
1129 }
1130 }
1131
1132 return minimalIndex;
1133}
1134
1135template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1140
1141template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1143{
1144 std::sort(clusters_.rbegin(), clusters_.rend());
1145}
1146
1147template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1149{
1150 TSquareDistance maximalDistance = TSquareDistance(0);
1151
1152 for (typename Clusters::const_iterator i = clusters_.begin(); i != clusters_.end(); ++i)
1153 {
1154 maximalDistance = max(maximalDistance, i->maximalSqrDistance());
1155 }
1156
1157 return maximalDistance;
1158}
1159
1160template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1162{
1163 ocean_assert(data_.numberObservations() != 0);
1164 ocean_assert(clusters_.empty());
1165
1166 // find the smallest observation (euclidean distance to the origin)
1167 const DataIndex firstDataIndex = smallestObservation(data_);
1168 ocean_assert(firstDataIndex != DataIndex(-1));
1169
1170 clusters_.push_back(Cluster(*this, data_[firstDataIndex]));
1171
1172 while (clusters_.size() < numberClusters)
1173 {
1174 TSquareDistance largestDistance = TSquareDistance(0);
1175 size_t largestIndex = size_t(-1);
1176
1177 for (size_t o = 0; o < data_.numberObservations(); ++o)
1178 {
1179 const Observation& observation = data_[o];
1180
1181 TSquareDistance localDistance = NumericT<TSquareDistance>::maxValue();
1182
1183 for (size_t c = 0; c < clusters_.size(); ++c)
1184 {
1185 localDistance = min(localDistance, clusters_[c].sqrDistance(observation));
1186 }
1187
1188 if (localDistance > largestDistance)
1189 {
1190 largestDistance = localDistance;
1191 largestIndex = o;
1192 }
1193 }
1194
1195 // check whether no observation is left
1196 if (largestIndex == size_t(-1))
1197 {
1198 break;
1199 }
1200
1201 ocean_assert(largestDistance != TSquareDistance(0));
1202
1203#ifdef OCEAN_DEBUG
1204 for (size_t c = 0; c < clusters_.size(); ++c)
1205 {
1206 ocean_assert(clusters_[c].mean() != data_[largestIndex]);
1207 }
1208#endif
1209
1210 clusters_.push_back(Cluster(*this, data_[largestIndex]));
1211 }
1212
1213 ocean_assert(!clusters_.empty());
1214
1215 for (size_t c = 0; c < clusters_.size(); ++c)
1216 {
1217 ocean_assert(clusters_[c].dataIndices().empty());
1218 clusters_[c].dataIndices().reserve(data_.numberObservations() * 2 / clusters_.size());
1219 }
1220}
1221
1222template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1224{
1225 ocean_assert(data_.numberObservations() != 0);
1226 ocean_assert(clusters_.empty());
1227
1228 const bool random64 = data_.numberObservations() > NumericT<unsigned int>::maxValue();
1229
1230 // find the smallest observation (euclidean distance to the origin)
1231 const size_t firstDataIndex = smallestObservation(data_);
1232 ocean_assert(firstDataIndex != size_t(-1));
1233
1234 clusters_.push_back(Cluster(*this, data_[firstDataIndex]));
1235
1236 size_t iterations = 0;
1237 while (clusters_.size() < numberClusters && iterations++ < numberClusters * 100)
1238 {
1239 TSquareDistance largestDistance = 0u;
1240 size_t largestIndex = size_t(-1);
1241
1242 for (size_t n = 0; n < max<size_t>(1, data_.numberObservations() / 128); ++n)
1243 {
1244 const size_t index = random64 ? RandomI::random64() % data_.numberObservations() : RandomI::random32() % (unsigned int)data_.numberObservations();
1245 const Observation& candidate = data_[index];
1246
1247 TSquareDistance smallestDistance = NumericT<TSquareDistance>::maxValue();
1248
1249 for (size_t c = 0; c < clusters_.size(); ++c)
1250 {
1251 smallestDistance = min(smallestDistance, clusters_[c].sqrDistance(candidate));
1252 }
1253
1254 if (smallestDistance > largestDistance)
1255 {
1256 largestDistance = smallestDistance;
1257 largestIndex = index;
1258 }
1259 }
1260
1261 // check whether no observation is left
1262 if (largestIndex == size_t(-1))
1263 {
1264 break;
1265 }
1266
1267 ocean_assert(largestDistance != TSquareDistance(0));
1268
1269#ifdef OCEAN_DEBUG
1270 for (size_t c = 0; c < clusters_.size(); ++c)
1271 {
1272 ocean_assert(clusters_[c].mean() != data_[largestIndex]);
1273 }
1274#endif
1275
1276 clusters_.push_back(Cluster(*this, data_[largestIndex]));
1277 }
1278
1279 ocean_assert(!clusters_.empty());
1280
1281 for (size_t c = 0; c < clusters_.size(); ++c)
1282 {
1283 ocean_assert(clusters_[c].dataIndices().empty());
1284 clusters_[c].dataIndices().reserve(data_.numberObservations() * 2 / clusters_.size());
1285 }
1286}
1287
1288template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1290{
1291 ocean_assert(!clusters_.empty());
1292
1293 // remove the old indices, we determine the new distribution
1294 for (size_t c = 0; c < clusters_.size(); ++c)
1295 {
1296 clusters_[c].dataIndices().clear();
1297 }
1298
1299 // assign each observation to the best fitting cluster
1300 for (size_t o = 0; o < data_.numberObservations(); ++o)
1301 {
1302 const Observation& observation = data_[o];
1303
1304 TSquareDistance bestDistance = NumericT<TSquareDistance>::maxValue();
1305 size_t bestCluster = size_t(-1);
1306
1307 for (size_t c = 0; c < clusters_.size(); ++c)
1308 {
1309 const TSquareDistance localDistance = clusters_[c].sqrDistance(observation);
1310
1311 if (localDistance < bestDistance)
1312 {
1313 bestDistance = localDistance;
1314 bestCluster = c;
1315 }
1316 }
1317
1318 ocean_assert(bestCluster != size_t(-1));
1319
1320 clusters_[bestCluster].dataIndices().push_back(o);
1321 }
1322
1323 // update the mean values for each cluster
1324 for (size_t c = 0; c < clusters_.size(); ++c)
1325 {
1326 clusters_[c].updateMean();
1327 }
1328}
1329
1330template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1332{
1333 if (worker == nullptr)
1334 {
1335 applyOptimizationIteration();
1336 }
1337 else
1338 {
1339 ocean_assert(!clusters_.empty());
1340
1341 // remove the old indices, we determine the new distribution
1342 for (size_t c = 0; c < clusters_.size(); ++c)
1343 {
1344 clusters_[c].dataIndices().clear();
1345 }
1346
1347 Lock lock;
1348 worker->executeFunction(Worker::Function::create(*this, &ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::applyOptimizationIterationSubset, &lock, 0u, 0u), 0u, (unsigned int)data_.numberObservations(), 1u, 2u);
1349
1350 // update the mean values for each cluster
1351 for (size_t c = 0; c < clusters_.size(); ++c)
1352 {
1353 clusters_[c].updateMean();
1354 }
1355 }
1356}
1357
1358template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1359void ClusteringKMeans<T, tDimension, TSum, TSquareDistance, tUseIndices>::applyOptimizationIterationSubset(Lock* lock, const unsigned int firstObservation, const unsigned int numberObservations)
1360{
1361 ocean_assert(!clusters_.empty());
1362
1363 std::vector<DataIndices> localClusters(clusters_.size());
1364
1365 // assign each observation to the best fitting cluster
1366 for (size_t o = firstObservation; o < firstObservation + numberObservations; ++o)
1367 {
1368 const Observation& observation = data_[o];
1369
1370 TSquareDistance bestDistance = NumericT<TSquareDistance>::maxValue();
1371 size_t bestCluster = size_t(-1);
1372
1373 for (size_t c = 0; c < clusters_.size(); ++c)
1374 {
1375 const TSquareDistance localDistance = clusters_[c].sqrDistance(observation);
1376
1377 if (localDistance < bestDistance)
1378 {
1379 bestDistance = localDistance;
1380 bestCluster = c;
1381 }
1382 }
1383
1384 ocean_assert(bestCluster != size_t(-1));
1385
1386 localClusters[bestCluster].push_back(o);
1387 }
1388
1389 const OptionalScopedLock scopedLock(lock);
1390
1391 for (size_t c = 0; c < localClusters.size(); ++c)
1392 {
1393 clusters_[c].dataIndices().insert(clusters_[c].dataIndices().end(), localClusters[c].begin(), localClusters[c].end());
1394 }
1395}
1396
1397template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1402
1403template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1408
1409template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1414
1415template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1417{
1418 if (this != &clustering)
1419 {
1420 data_ = std::move(clustering.data_);
1421 clusters_ = std::move(clustering.clusters_);
1422 }
1423
1424 return *this;
1425}
1426
1427template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1429{
1430 ocean_assert(data);
1431
1432 TSquareDistance smallestDistance = NumericT<TSquareDistance>::maxValue();
1433 DataIndex smallestIndex = size_t(-1);
1434
1435 for (size_t o = 0; o < data.numberObservations(); ++o)
1436 {
1437 const TSquareDistance localDistance = sqrDistance(data[o]);
1438
1439 if (localDistance < smallestDistance)
1440 {
1441 smallestDistance = localDistance;
1442 smallestIndex = o;
1443 }
1444 }
1445
1446 return smallestIndex;
1447}
1448
1449template <typename T, size_t tDimension, typename TSum, typename TSquareDistance, bool tUseIndices>
1451{
1452 TSquareDistance result = TSquareDistance(0);
1453
1454 for (size_t n = 0; n < tDimension; ++n)
1455 {
1456 result += observation[n] * observation[n];
1457 }
1458
1459 return result;
1460}
1461
1462
1463}
1464
1465#endif // META_OCEAN_MATH_CLUSTERING_K_MEANS_H
static Caller< void > create(CT &object, typename MemberFunctionPointerMaker< CT, void, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass, NullClass >::Type function)
Creates a new caller container for a member function with no function parameter.
Definition Caller.h:3023
This class implements the abstract data object which will be specialized for both data modes toggled ...
Definition ClusteringKMeans.h:45
size_t DataIndex
Definition of an index that addresses one specific observation element in the data object that stores...
Definition ClusteringKMeans.h:70
size_t numberObservations() const
Returns the number of observations that are stored by this data object.
size_t numberObservations_
The number of observation elements of this data object.
Definition ClusteringKMeans.h:151
const Observation & observation(const DataIndex &dataIndex) const
Returns one specific observation of this data object specified by the data-index of this observation.
const Observation & operator[](const DataIndex &dataIndex) const
Returns one specific observation of this data object specified by the data-index of this observation.
std::vector< DataIndex > DataIndices
Definition of a vector holding indices to the data object.
Definition ClusteringKMeans.h:75
Data< T, tDimension > & operator=(Data< T, tDimension > &&data) noexcept
Move operator.
Data(const Observation *observations, const size_t numberObservations, const bool copyObservations=false)
Creates a new data object by observations lying in a joined memory block as array.
const Observation ** observationPointers_
The observation pointers of this data object.
Definition ClusteringKMeans.h:253
std::vector< const Observation * > copyObservationPointers_
The optional observation pointers that are stored as copy.
Definition ClusteringKMeans.h:250
const Observation * observations_
The observation objects of this data object.
Definition ClusteringKMeans.h:148
Data(Data< T, tDimension > &&data) noexcept
Move constructor for an data object.
std::vector< Observation > copyObservations_
The optional observations that are stored as copy.
Definition ClusteringKMeans.h:145
bool isValidDataIndex(const DataIndex &dataIndex) const
Returns whether a given data-index is valid and has a corresponding observation stored in this data o...
Data()=default
Creates a new empty data object.
Data(const Observation **observationPointers, const size_t numberObservations, const bool copyPointers=false)
Creates a new data object by observations lying at individual memory positions.
StaticBuffer< T, tDimension > Observation
Definition of an observation object.
Definition ClusteringKMeans.h:65
This class implements the base class for all classes providing clustering algorithms.
Definition ClusteringKMeans.h:35
This class implements one cluster that holds the mean values of all observations belonging to this cl...
Definition ClusteringKMeans.h:308
TSquareDistance maximalSqrDistance(DataIndex *observationIndex=nullptr) const
Calculates the maximal square distance between the mean observation value of this cluster and all obs...
Definition ClusteringKMeans.h:826
TSquareDistance sqrDistance(const Observation &observation) const
Returns the square distance between a given observation and this cluster (the mean observation value ...
Definition ClusteringKMeans.h:813
DataIndices dataIndices_
The data indices of all observation that belong to this cluster.
Definition ClusteringKMeans.h:412
Observation mean_
The mean observation value of this cluster.
Definition ClusteringKMeans.h:409
const ClusteringKMeans< T, tDimension, TSum, TSquareDistance, tUseIndices > * owner_
The owner of this cluster.
Definition ClusteringKMeans.h:406
Cluster & operator=(Cluster &&cluster) noexcept
Move operator moving a cluster object to this object.
Definition ClusteringKMeans.h:897
Cluster(Cluster &&cluster) noexcept
Move constructor for another cluster object.
Definition ClusteringKMeans.h:778
const DataIndices & dataIndices() const
Returns the indices of the observations that belong to this cluster.
Definition ClusteringKMeans.h:885
const Observation & mean() const
Returns the mean observation value of this cluster.
Definition ClusteringKMeans.h:879
bool operator<(const Cluster &cluster) const
Returns whether the left cluster has less elements than the right cluster.
Definition ClusteringKMeans.h:912
void updateMean()
Updates the mean observation value of this cluster by application of the stored indices of all observ...
Definition ClusteringKMeans.h:918
TSquareDistance averageSqrDistance() const
Calculates the average square distance between the mean observation value of this cluster and all obs...
Definition ClusteringKMeans.h:857
This class implements a k-means clustering algorithm.
Definition ClusteringKMeans.h:270
void clear()
Clears all determined clusters but registered the data information is untouched.
Definition ClusteringKMeans.h:1398
void applyOptimizationIteration(Worker *worker)
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition ClusteringKMeans.h:1331
ClusteringKMeans(ClusteringKMeans &&clustering) noexcept
Move constructor.
Definition ClusteringKMeans.h:958
std::vector< Cluster > Clusters
Definition of a vector holding cluster objects.
Definition ClusteringKMeans.h:418
const Clusters & clusters() const
Returns the clusters of this k-means clustering object.
Definition ClusteringKMeans.h:1136
Data data_
The data that stores the observations of this clustering object, either with index-access or pointer-...
Definition ClusteringKMeans.h:593
void applyOptimizationIterationSubset(Lock *lock, const unsigned int firstObservation, const unsigned int numberObservations)
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition ClusteringKMeans.h:1359
ClusteringKMeans< T, tDimension, TSum, TSquareDistance, tUseIndices > & operator=(ClusteringKMeans &&clustering)
Move operator.
Definition ClusteringKMeans.h:1416
ClusteringKMeans()
Creates an empty k-means object.
Definition ClusteringKMeans.h:952
Data::DataIndices DataIndices
(Re-)Definition of a vector holding (size_t) indices.
Definition ClusteringKMeans.h:297
ClusteringKMeans(Data &&data)
Creates a new k-means object by a given data object.
Definition ClusteringKMeans.h:975
Data::DataIndex DataIndex
(Re-)Definition of an index that addresses one specific observation element in the data object that s...
Definition ClusteringKMeans.h:292
static DataIndex smallestObservation(const Data &data)
Determines the smallest observation (euclidean distance to origin) from a set of observations.
Definition ClusteringKMeans.h:1428
static TSquareDistance sqrDistance(const Observation &observation)
Returns the square distance between an observation and the origin.
Definition ClusteringKMeans.h:1450
void determineClustersByDistance(const TSquareDistance maximalSqrDistance, size_t maximalClusters=0, const size_t iterations=5, Worker *worker=nullptr)
Determines the clusters for this object, ensure that this object has been initialized with a valid se...
Definition ClusteringKMeans.h:1008
void determineClustersByNumber(const size_t numberClusters, const InitializationStrategy strategy=IS_LARGEST_DISTANCE, const size_t iterations=5, Worker *worker=nullptr)
Determines the clusters for this object, ensure that this object has been initialized with a valid se...
Definition ClusteringKMeans.h:984
Data::Observation Observation
(Re-)Definition of an observation object.
Definition ClusteringKMeans.h:302
bool addCluster(const size_t iterations=5, TSquareDistance sqrDistance=TSquareDistance(0), Worker *worker=nullptr)
Adds a new clusters for this object.
Definition ClusteringKMeans.h:1029
bool isValid() const
Returns whether this object holds a valid set of observations.
Definition ClusteringKMeans.h:1404
void determineInitialClustersRandom(const size_t numberClusters)
Determines the initial clusters for this object with the IS_RANDOM strategy.
Definition ClusteringKMeans.h:1223
void removeCluster(const size_t iterations=5, Worker *worker=nullptr)
Removes one cluster from this object.
Definition ClusteringKMeans.h:1065
Clusters clusters_
The current clusters of this object.
Definition ClusteringKMeans.h:596
InitializationStrategy
Definition of individual initialization strategies.
Definition ClusteringKMeans.h:277
@ IS_LARGEST_DISTANCE
The first cluster is determined by selection of the (euclidean) smallest observation,...
Definition ClusteringKMeans.h:279
@ IS_RANDOM
All clusters are selected randomly.
Definition ClusteringKMeans.h:281
Clustering< tUseIndices >::template Data< T, tDimension > Data
(Re-)Definition of a data object providing the data which will be clustered.
Definition ClusteringKMeans.h:287
size_t findCluster(const Observation &observation)
Finds a best matching cluster for a given independent observation.
Definition ClusteringKMeans.h:1116
void determineInitialClustersLargestDistance(const size_t numberClusters)
Determines the initial clusters for this object with the IS_LARGEST_DISTANCE strategy.
Definition ClusteringKMeans.h:1161
ClusteringKMeans(const Data &data)
Creates a new k-means object by a given data object.
Definition ClusteringKMeans.h:966
void sortClusters()
Sorts the clusters regarding their number of elements.
Definition ClusteringKMeans.h:1142
TSquareDistance maximalSqrDistance() const
Calculates the maximal square distance between the mean observation value of each clusters and all ob...
Definition ClusteringKMeans.h:1148
void applyOptimizationIteration()
Explicitly applies one further optimization iteration for an existing set of clusters.
Definition ClusteringKMeans.h:1289
This class implements a recursive lock object.
Definition Lock.h:31
static constexpr T maxValue()
Returns the max scalar value.
Definition Numeric.h:3244
This class implements an optional recursive scoped lock object locking the lock object only if it's d...
Definition Lock.h:325
static uint32_t random32()
Returns one random integer number with range [0x00000000, 0xFFFFFFFF].
static uint64_t random64()
Returns one random integer number with range [0x00000000 00000000, 0xFFFFFFFF FFFFFFFF].
This class implements a static buffer that has a fixed capacity.
Definition StaticBuffer.h:24
const T * data() const
Returns the buffer data pointer.
Definition StaticBuffer.h:240
This class implements a worker able to distribute function calls over different threads.
Definition Worker.h:33
bool executeFunction(const Function &function, const unsigned int first, const unsigned int size, const unsigned int firstIndex=(unsigned int)(-1), const unsigned int sizeIndex=(unsigned int)(-1), const unsigned int minimalIterations=1u, const unsigned int threadIndex=(unsigned int)(-1))
Executes a callback function separable by two function parameters.
unsigned int sqrDistance(const char first, const char second)
Returns the square distance between two values.
Definition base/Utilities.h:1089
The namespace covering the entire Ocean framework.
Definition Accessor.h:15