CluE  1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
kumarsabharwalsen.h
Go to the documentation of this file.
1 #ifndef KUMARSABHARWALSEN_H
2 #define KUMARSABHARWALSEN_H
3 
4 #include <algorithm>
5 #include <map>
6 
7 #include "../base/algorithm.h"
8 #include "../base/inputsetter.h"
9 #include "../base/measuresetter.h"
10 #include "../datastructure/doublesolution.h"
11 #include "../base/proxygenerator.h"
12 #include "../base/dissimilaritymeasure.h"
13 #include "../sampling/uniformsampling.h"
14 #include "../misc/fixedsizesubsetiterator.h"
15 #include "../measure/pointsetdistance.h"
16 
17 #include "../exception/invalidargumentexception.h"
18 #include "../exception/invalidruntimeconfigurationexception.h"
19 
20 namespace CluE
21 {
22 
31 template<typename T> class KumarSabharwalSen : public Algorithm, public InputSetter<T>, public MeasureSetter<T>
32 {
33 public:
39  KumarSabharwalSen(DissimilarityMeasure<T> const *measure = 0, ProxyGenerator<T> const *centroidGenerator = 0, unsigned int numberOfClusters = 2, std::vector<T*> const* input = 0);
40 
43  virtual ~KumarSabharwalSen();
44 
50  virtual DoubleSolution<T>* compute();
51 
55  double getLastCost();
56 
57  virtual void setInput(std::vector<T*> const*);
58 
59  virtual void setMeasure(DissimilarityMeasure<T> const *measure);
60 
66 
72  void setSetSizes(unsigned int superSetSize, unsigned int subsetSize);
73 
79 
80 private:
84  struct Tupel
85  {
87  ds(ds),
88  cost(cost)
89  {
90  }
91 
92  Tupel() :
93  ds(0),
94  cost(std::numeric_limits<double>::max())
95  {
96  }
98  double cost;
99  };
100 
101  Tupel irredKMeans(std::set<T*> Q, size_t m, size_t k, std::map<size_t, T> C, double Sum);
102 
104  std::set<T*> input;
106  unsigned int superSetSize;
107  unsigned int subsetSize;
108  unsigned int numberOfClusters;
109  double lastCost;
110 };
111 
112 template<typename T> KumarSabharwalSen<T>::KumarSabharwalSen(DissimilarityMeasure<T> const *measure, ProxyGenerator<T> const *centGen, unsigned int numberOfClusters, std::vector<T*> const* data) :
113  measure(measure==0 ? 0 : measure->clone()),
114  centroidGenerator(centGen==0 ? 0 : centGen->clone()),
115  superSetSize(0),
116  subsetSize(0),
117  numberOfClusters(numberOfClusters),
118  lastCost(0)
119 {
120  if(data != 0)
121  input = std::set<T*>(data->begin(), data->end());
122 }
123 
125  measure(kss.measure == 0 ? 0 : kss.measure->clone()),
126  input(kss.data),
127  centroidGenerator(kss.centroidGenerator==0 ? 0 : kss.centroidGenerator->clone()),
128  superSetSize(kss.superSetSize),
129  subsetSize(kss.subsetSize),
130  numberOfClusters(kss.numberOfClusters),
131  lastCost(0)
132 {
133 }
134 
136 {
137  if(this->measure != 0)
138  delete this->measure;
139  if(this->centroidGenerator != 0)
140  delete this->centroidGenerator;
141 
142  Algorithm::operator= (kss);
143 
144  this->measure = kss.measure == 0 ? 0 : kss.measure->clone();
145  this->input = kss.input;
146  this->centroidGenerator = kss.centroidGenerator == 0 ? 0 : kss.centroidGenerator->clone();
147  this->superSetSize = kss.superSetSize;
148  this->subsetSize = kss.subsetSize;
149  this->numberOfClusters = kss.numberOfClusters;
150  this->lastCost = kss.lastCost;
151 
152  return *this;
153 }
154 
156 {
157  if(this->measure != 0)
158  delete this->measure;
159  if(this->centroidGenerator != 0)
160  delete this->centroidGenerator;
161 }
162 
164 {
165  if(numberOfClusters == 0)
166  throw InvalidRuntimeConfigurationException(0, "Number of desired clusters is 0.");
167  if(numberOfClusters > input.size())
168  throw InvalidRuntimeConfigurationException(1, "Input size must be larger than number of desired clusters.");
169  if(subsetSize == 0 || superSetSize == 0)
170  throw InvalidRuntimeConfigurationException(2, "Set sizes are uninitialized.");
171 
172  std::map<size_t, T> empty;
173  Tupel result;
174  for(size_t i = 1; i <= numberOfClusters; i++)
175  {
176  Tupel candidate(irredKMeans(input, i, i, empty, 0));
177  if(candidate.cost < result.cost)
178  result = candidate;
179  }
180  lastCost = result.cost;
181  return result.ds;
182 }
183 
184 template<typename T> typename KumarSabharwalSen<T>::Tupel KumarSabharwalSen<T>::irredKMeans(std::set<T*> Q, size_t m, size_t k, std::map<size_t, T> C, double Sum)
185 {
186  //C stores cluster centers by their index (used to access PartitionSolution's partitions)
187 
188  //STEP 1
189  if(m == 0)
190  {
191  //Prepare result
192  DoubleSolution<T> *step1ds = new DoubleSolution<T>();
193  step1ds->partitions.push_back(std::vector<std::vector<T*> >());
194  step1ds->proxysets.push_back(std::vector<T>());
195 
196  typename std::map<size_t, T>::iterator endC = C.end();
197  for(typename std::map<size_t, T>::iterator itC = C.begin(); itC != endC; itC++)
198  {
199  step1ds->partitions[0].push_back(std::vector<T*>());
200  step1ds->proxysets[0].push_back((*itC).second);
201  }
202 
203  //Assign points
204  typename std::set<T*>::iterator endQ = Q.end();
205  for(typename std::set<T*>::iterator itQ = Q.begin(); itQ != endQ; itQ++)
206  {
207  double minDist = measure->dissimilarity(**itQ, C.begin()->second);
208  size_t minIndex = 0;
209  for(typename std::map<size_t, T>::iterator itC = C.begin()++; itC != endC; itC++)
210  {
211  double tmpDist = measure->dissimilarity(**itQ, itC->second);
212  if(tmpDist < minDist)
213  {
214  minDist = tmpDist;
215  minIndex = itC->first;
216  }
217  }
218  Sum += minDist;
219  step1ds->partitions[0][minIndex].push_back(*itQ);
220  }
221 
222  Tupel step1result(step1ds, Sum);
223  return step1result;
224  }
225 
226  //STEP 2
227  size_t actualSuperSetSize = superSetSize;
228  if(actualSuperSetSize > Q.size())
229  actualSuperSetSize = Q.size();
230 
231  std::vector<T*> qVector(Q.begin(), Q.end());
232  UniformSampling<T> supersetSampler(&qVector, superSetSize);
233  DiscreteProxySolution<T>* supersetSolution = supersetSampler.compute();
234 
235  size_t actualSubsetSize = subsetSize;
236  if(actualSubsetSize > supersetSolution->proxysets[0].size())
237  actualSubsetSize = supersetSolution->proxysets[0].size();
238  FixedSizeSubsetIterator<T> subsetIterator(supersetSolution->proxysets[0], actualSubsetSize);
239 
240  Tupel step2result;
241  while(true)
242  {
243  std::vector<T*> currentSubset = subsetIterator.vector();
244  T centroid = centroidGenerator->generate(currentSubset);
245  std::map<size_t, T> cCentroid(C);
246  cCentroid.insert(std::pair<size_t, T>(C.size(), centroid));
247  Tupel candidate(irredKMeans(Q, m-1, k, cCentroid, Sum));
248  if(candidate.cost < step2result.cost)
249  {
250  delete step2result.ds;
251  step2result = candidate;
252  }
253  else
254  delete candidate.ds;
255 
256  if(subsetIterator.hasMore())
257  subsetIterator.next();
258  else
259  break;
260  }
261 
262  //STEP 3
263  if(C.size() > 0 && Q.size() > 1)
264  {
265  //Found centers yet? Then STEP 3.
266 
267  //Compute U
268  std::set<T*> setC;
269  for(typename std::map<size_t, T>::iterator it = C.begin(); it != C.end(); it++)
270  setC.insert(&(it->second));
271  PointSetDistance<T> dts(*measure, setC);
272  std::vector<T*> U(Q.begin(), Q.end());
273  typename std::vector<T*>::iterator U_half = U.begin() + U.size()/2;
274  std::nth_element(U.begin(), U_half, U.end(), dts);
275  U.erase(U_half++, U.end());
276 
277  //Assign points
278  std::set<T*> setU(U.begin(), U.end());
279  Tupel assignedU(irredKMeans(setU, 0, C.size(), C, Sum)); //use irredKMeans to assign points to centers
280 
281  //Compute irredKMeans(Q-U, ...)
282  for(typename std::vector<T*>::iterator it = U.begin(); it != U.end(); it++)
283  Q.erase(*it);
284  Tupel step3result(irredKMeans(Q, m, k, C, Sum));
285 
286  //Merge
287  step3result.cost += assignedU.cost;
288  for(size_t cluster = 0; cluster < assignedU.ds->partitions[0].size(); cluster++)
289  {
290  for(size_t point = 0; point < assignedU.ds->partitions[0][cluster].size(); point++)
291  step3result.ds->partitions[0][cluster].push_back(assignedU.ds->partitions[0][cluster][point]);
292  step3result.ds->proxysets[0].push_back(assignedU.ds->proxysets[0][cluster]);
293  }
294  delete assignedU.ds;
295 
296  //Return minimum result
297  if(step2result.cost < step3result.cost)
298  {
299  delete step3result.ds;
300  return step2result;
301  }
302  else
303  {
304  delete step2result.ds;
305  return step3result;
306  }
307  }
308  else
309  {
310  //No centers so far? No STEP 3.
311  return step2result;
312  }
313 }
314 
315 template<typename T> double KumarSabharwalSen<T>::getLastCost()
316 {
317  return lastCost;
318 }
319 
320 template<typename T> void KumarSabharwalSen<T>::setInput(std::vector<T*> const* data)
321 {
322  if(data != 0)
323  input = std::set<T*>(data->begin(), data->end());
324 }
325 
326 template<typename T> void KumarSabharwalSen<T>::setMeasure(DissimilarityMeasure<T> const *measure)
327 {
328  if(this->measure == 0)
329  delete this->measure;
330 
331  this->measure = measure==0 ? 0 : measure->clone();
332 }
333 
334 template<typename T> void KumarSabharwalSen<T>::setCentroidGenerator(ProxyGenerator<T> const *centroidGenerator)
335 {
336  if(this->centroidGenerator == 0)
337  delete this->centroidGenerator;
338 
339  this->centroidGenerator = centroidGenerator==0 ? 0 : centroidGenerator->clone();
340 }
341 
342 template<typename T> void KumarSabharwalSen<T>::setSetSizes(unsigned int superSetSize, unsigned int subsetSize)
343 {
344  if(subsetSize >= superSetSize)
345  throw InvalidArgumentException(0, "subsetSize has to be smaller than superSetSize.", "subsetSize");
346 
347  this->superSetSize = superSetSize;
348  this->subsetSize = subsetSize;
349 }
350 
351 
353 {
354  return dynamic_cast<KumarSabharwalSen<T>*>(s);
355 }
356 
357 }
358 
359 #endif
Data structure for partitions and proxies.
virtual bool hasMore() const
Returns if the Graycode sequence is incomplete yet or not.
Iterates over all fixed-size subsets of a given superset.
static KumarSabharwalSen< T > * toKumarSabharwalSen(Algorithm *s)
does a dynamic cast of the given Algorithm to FarthestFirstTraversal
Calculates the (minimum) distance between a point and a set of points.
virtual void setInput(std::vector< T * > const *)
Data structure for discrete proxies.
double getLastCost()
Returns the cost of the last computation.
void setSetSizes(unsigned int superSetSize, unsigned int subsetSize)
Set the superset and subset sizes used for guessing clusters.
virtual void setMeasure(DissimilarityMeasure< T > const *measure)
Used to return a DoubleSolution and its cost.
virtual DoubleSolution< T > * compute()
virtual ProxyGenerator< T > * clone() const =0
std::vector< std::vector< T * > > proxysets
Uniform sampling.
Abstract base class for mechanisms that compute a proxy or representative object for a given set of o...
virtual DissimilarityMeasure< T > * clone() const =0
Tupel(DoubleSolution< T > *ds, double cost)
KumarSabharwalSen(DissimilarityMeasure< T > const *measure=0, ProxyGenerator< T > const *centroidGenerator=0, unsigned int numberOfClusters=2, std::vector< T * > const *input=0)
Constructor for general usage.
virtual std::vector< T * > vector() const
Returns the current subset as a vector.
Abstract base class for algorithms.
Definition: algorithm.h:17
Interface to propagate the ability to set a DissimilarityMeasure.
Definition: measuresetter.h:13
KumarSabharwalSen< T > & operator=(const KumarSabharwalSen< T > &)
Indicates invalid values of arguments.
ProxyGenerator< T > * centroidGenerator
DissimilarityMeasure< T > * measure
KumarSabharwalSen algorithm.
std::vector< std::vector< T > > proxysets
Tupel irredKMeans(std::set< T * > Q, size_t m, size_t k, std::map< size_t, T > C, double Sum)
Abstract base class for dissimilarity measurement.
virtual void next()
Generates the next subset.
void setCentroidGenerator(ProxyGenerator< T > const *centroidGenerator)
Set the centroid generator used for cluster center approximation.
Indicates that a computation entered an invalid configuration state.
std::vector< std::vector< std::vector< T * > > > partitions
Interface to propagate the ability to set input data.
Definition: inputsetter.h:13