CluE  1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
frahlingsohler.h
Go to the documentation of this file.
1 #ifndef FRAHLINGSOHLER_H
2 #define FRAHLINGSOHLER_H
3 
4 #include <memory>
5 #include <unordered_map>
6 
7 #include "../clustering/cfentry.h"
8 #include "../base/streamingalgorithm.h"
9 #include "../base/weightmodifier.h"
10 #include "../datastructure/proxysolution.h"
11 #include "../datastreams/distinctelementsprimitive.h"
12 #include "../hashing/hashfunction.h"
13 #include "../hashing/carterwegman.h"
14 
15 namespace CluE
16 {
17 
26 template<typename VectorType, typename Hash, typename size_space>
27 class FrahlingSohler : public StreamingAlgorithm<VectorType>
28 {
29 public:
33  struct GridInstance
34  {
38  struct Grid
39  {
41  cellsPerRow(cellsPerRow),
42  numberOfCells(numberOfCells),
43  distinctCells(0,0,0),
44  bucketHash(bucketHash),
45  sampleHash(sampleHash),
46  buckets()
47  {
48  }
49 
50  unsigned long long cellsPerRow;
51  unsigned long long numberOfCells;
55  std::unordered_map<size_t, CFEntry<VectorType>> buckets;
56  };
57 
58  unsigned long opt;
59  std::vector<Grid> subgrids;
60  };
61 
62  FrahlingSohler(DiscreteBoundedSpace<VectorType, size_space> * space, WeightModifier<VectorType> * weightModifier, std::function<Hash()> createHashfunction, double eps, double rho, unsigned int numOfCenters);
63 
64  void insert(VectorType const & element);
65  void remove(VectorType const & element);
66 
67  virtual FrahlingSohler<VectorType, Hash, size_space>& operator<<(VectorType const & element);
68 
70 
71 private:
72  void shiftGrid(std::vector<size_space>& cell);
73  void toParentCell(std::vector<size_space>& cell);
74  std::vector<std::vector<size_space>> getChildCells(std::vector<size_space> cell);
75  unsigned long long getCellIndex(typename GridInstance::Grid const & grid, std::vector<size_space>& cell, size_t layer);
76  bool evaluateGrid(std::vector<VectorType>& coreset, size_t gridInstance, size_t gridIndex, std::vector<size_space> cell);
77  bool isHeavy(size_t gridInstance, size_t gridIndex, size_t index);
78 
79  std::unique_ptr<DiscreteBoundedSpace<VectorType, size_space>> space;
80  std::unique_ptr<WeightModifier<VectorType>> weightModifier;
81  std::vector<GridInstance> grids;
82  VectorHash<size_t> vechash;
83  double eps;
84  double rho;
85  unsigned int numOfCenters; // = k
86  double alpha;
87  double delta;
88  size_t dimension;
89  unsigned long long nFitting; // = smallest 2^i >= n
91 };
92 
93 template<typename VectorType, typename Hash, typename size_space>
94 FrahlingSohler<VectorType, Hash, size_space>::FrahlingSohler(DiscreteBoundedSpace<VectorType, size_space> * space, WeightModifier<VectorType> * weightModifier, std::function<Hash()> createHashfunction, double eps, double rho, unsigned int numOfCenters) :
95  space(space->clone()),
96  weightModifier(weightModifier->clone()),
97  grids(),
98  vechash(),
99  eps(eps),
100  rho(rho),
101  numOfCenters(numOfCenters),
102  alpha(6 * std::pow(eps, -2) * std::log(std::pow(rho, -1)) + 1),
103  delta(1000), //TODO delta
104  dimension(space->dimension()),
105  nFitting(std::pow(std::ceil(std::log2(space->n())), 2))
106 {
107  // Number of Opt candidates
108  size_t jBound = std::ceil((space->n()+1) * std::log2(space->dimension()));
109  // Number of counter-hash-function-buckets
110  size_t numOfBuckets = 3 * std::pow(16 * alpha / delta + numOfCenters * std::pow(2, dimension), 2);
111  // Number of nested grid layers
112  numOfNestedLayers = std::ceil(std::log2(space->n()))+1;
113 
114  // Prepare Opt candidate grids
115  grids.resize(jBound);
116  for(size_t j = 0; j < jBound; ++j)
117  {
118  // Prepare nested grids of an Opt candidate grid
119  GridInstance& grid = grids[j];
120  grid.opt = std::pow(2, j);
121  grid.subgrids.reserve(numOfNestedLayers);
122  for(size_t g = 0; g < numOfNestedLayers; ++g)
123  {
124  unsigned long long sampleBuckets = delta * std::pow(2, double(j) - double(g)) / alpha;
125  if(sampleBuckets == 0) // Construct at least one bucket
126  throw "TODO delta to small";
127  unsigned long long cellsPerRow = std::ceil(nFitting / std::pow(2, g+1));
128  unsigned long long numberOfCells = std::ceil(std::pow(nFitting / std::pow(2, g+1), dimension));
129  CarterWegman<unsigned long long, size_t> bucketHash(numberOfCells, numOfBuckets);
130  CarterWegman<unsigned long long, unsigned long long> sampleHash(std::pow(nFitting, dimension), sampleBuckets);
131  grid.subgrids.push_back(typename GridInstance::Grid(cellsPerRow, numberOfCells, bucketHash, sampleHash));
132  }
133  }
134 }
135 
136 template<typename VectorType, typename Hash, typename size_space>
138 {
139  //TODO Insertion / deletion streaming
140 }
141 
142 template<typename VectorType, typename Hash, typename size_space>
144 {
145  // Get cell coordinates and consecutive index
146  std::vector<size_space> cell(space->getCoordinates(element));
147  unsigned long long originalIndex = getCellIndex(grids[0].subgrids[0], cell, 0);
148  shiftGrid(cell); // Apply random shift
149  // Iterate over all grid instances
150  for(size_t i = 0; i < grids.size(); ++i)
151  {
152  GridInstance & instance = grids[i];
153  // Iterate over all nested grids
154  for(size_t g = 0; g < instance.subgrids.size(); ++g)
155  {
156  typename GridInstance::Grid & grid = instance.subgrids[g];
157  if(grid.sampleHash(originalIndex) == 0) // Take element into account?
158  {
159  unsigned long long index = getCellIndex(grid, cell, g);
160  grid.distinctCells << index;
161  size_t bucket = grid.bucketHash(index);
162  if(grid.buckets.count(index) == 0)
163  {
164  // Construct bucket if empty
165  VectorType origin = space->origin();
166  grid.buckets.insert(std::pair<unsigned long long, CFEntry<VectorType>>(index, CFEntry<VectorType>(0, origin, origin*origin)));
167  }
168  // Update the bucket's clustering feature
169  CFEntry<VectorType> cfelement(1, element, element*element);
170  grid.buckets.at(index) += cfelement;
171  // Get parent cell
172  }
173  }
174  }
175 }
176 
177 template<typename VectorType, typename Hash, typename size_space>
179 {
180  // Get cell coordinates and consecutive index
181  std::vector<size_space> cell(space->getCoordinates(element));
182  unsigned long long originalIndex = getCellIndex(grids[0].subgrids[0], cell, 0);
183  shiftGrid(cell); // Apply random shift
184  // Iterate over all grid instances
185  for(size_t i = 0; i < grids.size(); ++i)
186  {
187  GridInstance & instance = grids[i];
188  // Iterate over all nested grids
189  for(size_t g = 0; g < instance.subgrids.size(); ++g)
190  {
191  typename GridInstance::Grid & grid = instance.subgrids[g];
192  if(grid.sampleHash(originalIndex) == 0) // Take element into account?
193  {
194  unsigned long long index = getCellIndex(grid, cell, g);
195  size_t bucket = grid.bucketHash(index);
196  // Update the bucket's clustering feature
197  CFEntry<VectorType> cfelement(1, element, element*element);
198  grid.buckets.at(index) -= cfelement;
199  if(grid.buckets.at(index).number == 0) // Destruct bucket if empty
200  grid.buckets.erase(index);
201  // Get parent cell
202  }
203  }
204  }
205 }
206 
207 template<typename VectorType, typename Hash, typename size_space>
209 {
210  std::vector<size_space> startCell(dimension, 0);
211  for(size_t i = 0; i < grids.size(); ++i)
212  {
213  std::vector<VectorType> coreset;
214  bool success = evaluateGrid(coreset, i, numOfNestedLayers-1, startCell);
215  if(success)
216  {
218  solution->proxysets.push_back(coreset);
219  return solution;
220  }
221  }
222 }
223 
224 template<typename VectorType, typename Hash, typename size_space>
225 bool FrahlingSohler<VectorType, Hash, size_space>::evaluateGrid(std::vector<VectorType>& coreset, size_t gridInstance, size_t gridIndex, std::vector<size_space> cell)
226 {
227  bool success = true;
228  double threshold = 16 * alpha / delta + numOfCenters * std::pow(2, dimension);
229  unsigned long long distElements = grids[gridInstance].subgrids[gridIndex].distinctCells.numberOfDistinctElements();
230  // Check for hash function collisions
231  if(distElements <= threshold)
232  {
233  bool mark = true; // "Marked cell" indicator
234  if(gridIndex < grids[gridInstance].subgrids.size())
235  {
236  // Mark a cell iff it is heavy but contains no heavy subcells
237  std::vector<std::vector<size_space>> children(getChildCells(cell));
238  for(size_t i = 0; i < children.size(); ++i)
239  {
240  unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex-1], children[i], gridIndex);
241  mark &= !isHeavy(gridInstance, gridIndex-1, index);
242  }
243  if(!mark)
244  {
245  // Decide whether to add or move a representative point
246  unsigned long long toMove = 0;
247  for(size_t i = 0; i < children.size(); ++i)
248  {
249  unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex-1], children[i], gridIndex);
250  if(isHeavy(gridInstance, gridIndex+1, index)) // Add
251  evaluateGrid(coreset, gridInstance, gridIndex-1, children[i]);
252  else // Move
253  toMove += grids[gridInstance].subgrids[gridIndex-1].buckets.at(index).number;
254  }
255  VectorType& last = coreset[coreset.size()-1];
256  weightModifier->setWeight(last, weightModifier->getWeight(last) + toMove);
257  }
258  }
259  if(mark)
260  {
261  unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex], cell, gridIndex);
262  CFEntry<VectorType>& cf = grids[gridInstance].subgrids[gridIndex].buckets.at(index);
263  VectorType point((1.0 / cf.number) * cf.LS);
264  weightModifier->setWeight(point, cf.number);
265  coreset.push_back(point);
266  }
267  }
268  else
269  {
270  success = false;
271  }
272  return success;
273 }
274 
275 template<typename VectorType, typename Hash, typename size_space>
276 bool FrahlingSohler<VectorType, Hash, size_space>::isHeavy(size_t gridInstance, size_t gridIndex, size_t index)
277 {
278  if(grids[gridInstance].subgrids[gridIndex].buckets.count(index) > 0)
279  return grids[gridInstance].subgrids[gridIndex].buckets.at(index).number >=
280  delta * grids[gridInstance].opt / std::pow(2, gridIndex+1);
281  else
282  return false;
283 }
284 
285 template<typename VectorType, typename Hash, typename size_space>
287 {
288  //TODO Random shift
289 }
290 
291 template<typename VectorType, typename Hash, typename size_space>
293 {
294  for(size_t i = 0; i < cell.size(); ++i)
295  cell[i] /= 2;
296 }
297 
298 template<typename VectorType, typename Hash, typename size_space>
299 std::vector<std::vector<size_space>> FrahlingSohler<VectorType, Hash, size_space>::getChildCells(std::vector<size_space> cell)
300 {
301  std::vector<std::vector<size_space>> cells;
302  for(size_t i = 0; i < cell.size(); ++i)
303  cell[i] *= 2;
304  for(size_t i = 0; i < std::pow(2, dimension); ++i)
305  {
306  size_t inc = i;
307  std::vector<size_space> childCell(cell);
308  for(size_t j = 0; j < childCell.size(); ++j)
309  childCell[j] += (inc & 1);
310  cells.push_back(childCell);
311  inc >>= 1;
312  }
313  return cells;
314 }
315 
316 template<typename VectorType, typename Hash, typename size_space>
317 unsigned long long FrahlingSohler<VectorType, Hash, size_space>::getCellIndex(typename GridInstance::Grid const & grid, std::vector<size_space>& cell, size_t layer)
318 {
319  unsigned long long index = 0;
320  unsigned long long power = 1;
321  for(size_t d = 0; d < dimension; ++d)
322  {
323  size_space coord = cell[d];
324  for(size_t l = 0; l < layer; ++l)
325  coord /= 2;
326  index += coord * power;
327  power *= grid.cellsPerRow;
328  }
329  return index;
330 }
331 
332 }
333 #endif
std::unique_ptr< WeightModifier< VectorType > > weightModifier
void insert(VectorType const &element)
Grid(unsigned long long cellsPerRow, unsigned long long numberOfCells, CarterWegman< unsigned long long, size_t > bucketHash, CarterWegman< unsigned long long, unsigned long long > sampleHash)
std::vector< std::vector< T > > proxysets
Definition: proxysolution.h:37
unsigned long long nFitting
std::unordered_map< size_t, CFEntry< VectorType > > buckets
VectorHash< size_t > vechash
Coreset algorithm for k-median clustering.
void shiftGrid(std::vector< size_space > &cell)
void toParentCell(std::vector< size_space > &cell)
CarterWegman< unsigned long long, unsigned long long > sampleHash
virtual size_space n() const =0
Number of discrete coordinates per dimension.
DistinctElementsPrimitive< unsigned long long, unsigned long long > distinctCells
void remove(VectorType const &element)
bool isHeavy(size_t gridInstance, size_t gridIndex, size_t index)
std::vector< GridInstance > grids
T LS
Linear sum.
Definition: cfentry.h:32
size_t number
Number of points contained in the feature.
Definition: cfentry.h:27
virtual ProxySolution< VectorType > * compute()
Runs the algorithm and returns the computed solution.
virtual void setWeight(T &, double)=0
Abstract base class to modify the weight of weighted objects.
Interface to extend a template type to provide discrete (bounded) space {0, ..., n-1}^d features...
std::vector< std::vector< size_space > > getChildCells(std::vector< size_space > cell)
virtual double getWeight(T &)=0
virtual size_t dimension() const =0
Space dimension.
Data structure for proxies.
Definition: proxysolution.h:19
unsigned int numOfCenters
bool evaluateGrid(std::vector< VectorType > &coreset, size_t gridInstance, size_t gridIndex, std::vector< size_space > cell)
virtual FrahlingSohler< VectorType, Hash, size_space > & operator<<(VectorType const &element)
Streaming operator.
unsigned long long getCellIndex(typename GridInstance::Grid const &grid, std::vector< size_space > &cell, size_t layer)
FrahlingSohler(DiscreteBoundedSpace< VectorType, size_space > *space, WeightModifier< VectorType > *weightModifier, std::function< Hash()> createHashfunction, double eps, double rho, unsigned int numOfCenters)
Abstract base class for streaming algorithms.
Clustering feature tree entry.
Definition: cfentry.h:22
virtual std::vector< size_space > getCoordinates(V const &vector) const =0
Returns the coordinates of the given vector.
CarterWegman< unsigned long long, size_t > bucketHash
virtual VectorType origin() const =0
Returns the space's origin.
std::unique_ptr< DiscreteBoundedSpace< VectorType, size_space > > space