1 #ifndef FRAHLINGSOHLER_H
2 #define FRAHLINGSOHLER_H
5 #include <unordered_map>
7 #include "../clustering/cfentry.h"
8 #include "../base/streamingalgorithm.h"
9 #include "../base/weightmodifier.h"
10 #include "../datastructure/proxysolution.h"
11 #include "../datastreams/distinctelementsprimitive.h"
12 #include "../hashing/hashfunction.h"
13 #include "../hashing/carterwegman.h"
26 template<
typename VectorType,
typename Hash,
typename size_space>
41 cellsPerRow(cellsPerRow),
42 numberOfCells(numberOfCells),
44 bucketHash(bucketHash),
45 sampleHash(sampleHash),
55 std::unordered_map<size_t, CFEntry<VectorType>>
buckets;
64 void insert(VectorType
const & element);
65 void remove(VectorType
const & element);
72 void shiftGrid(std::vector<size_space>& cell);
74 std::vector<std::vector<size_space>>
getChildCells(std::vector<size_space> cell);
76 bool evaluateGrid(std::vector<VectorType>& coreset,
size_t gridInstance,
size_t gridIndex, std::vector<size_space> cell);
77 bool isHeavy(
size_t gridInstance,
size_t gridIndex,
size_t index);
79 std::unique_ptr<DiscreteBoundedSpace<VectorType, size_space>>
space;
81 std::vector<GridInstance>
grids;
93 template<
typename VectorType,
typename Hash,
typename size_space>
95 space(space->clone()),
96 weightModifier(weightModifier->clone()),
101 numOfCenters(numOfCenters),
102 alpha(6 * std::pow(eps, -2) * std::log(std::pow(rho, -1)) + 1),
104 dimension(space->dimension()),
105 nFitting(std::pow(std::ceil(std::log2(space->n())), 2))
108 size_t jBound = std::ceil((space->
n()+1) * std::log2(space->
dimension()));
110 size_t numOfBuckets = 3 * std::pow(16 *
alpha /
delta + numOfCenters * std::pow(2,
dimension), 2);
115 grids.resize(jBound);
116 for(
size_t j = 0; j < jBound; ++j)
120 grid.
opt = std::pow(2, j);
124 unsigned long long sampleBuckets =
delta * std::pow(2,
double(j) -
double(g)) /
alpha;
125 if(sampleBuckets == 0)
126 throw "TODO delta to small";
127 unsigned long long cellsPerRow = std::ceil(
nFitting / std::pow(2, g+1));
128 unsigned long long numberOfCells = std::ceil(std::pow(
nFitting / std::pow(2, g+1),
dimension));
136 template<
typename VectorType,
typename Hash,
typename size_space>
142 template<
typename VectorType,
typename Hash,
typename size_space>
147 unsigned long long originalIndex = getCellIndex(grids[0].subgrids[0], cell, 0);
150 for(
size_t i = 0; i < grids.size(); ++i)
154 for(
size_t g = 0; g < instance.
subgrids.size(); ++g)
159 unsigned long long index = getCellIndex(grid, cell, g);
162 if(grid.
buckets.count(index) == 0)
165 VectorType origin = space->
origin();
170 grid.
buckets.at(index) += cfelement;
177 template<
typename VectorType,
typename Hash,
typename size_space>
182 unsigned long long originalIndex = getCellIndex(grids[0].subgrids[0], cell, 0);
185 for(
size_t i = 0; i < grids.size(); ++i)
189 for(
size_t g = 0; g < instance.
subgrids.size(); ++g)
194 unsigned long long index = getCellIndex(grid, cell, g);
198 grid.
buckets.at(index) -= cfelement;
199 if(grid.
buckets.at(index).number == 0)
207 template<
typename VectorType,
typename Hash,
typename size_space>
210 std::vector<size_space> startCell(dimension, 0);
211 for(
size_t i = 0; i < grids.size(); ++i)
213 std::vector<VectorType> coreset;
214 bool success = evaluateGrid(coreset, i, numOfNestedLayers-1, startCell);
224 template<
typename VectorType,
typename Hash,
typename size_space>
228 double threshold = 16 * alpha / delta + numOfCenters * std::pow(2, dimension);
229 unsigned long long distElements = grids[gridInstance].subgrids[gridIndex].distinctCells.numberOfDistinctElements();
231 if(distElements <= threshold)
234 if(gridIndex < grids[gridInstance].subgrids.size())
237 std::vector<std::vector<size_space>> children(getChildCells(cell));
238 for(
size_t i = 0; i < children.size(); ++i)
240 unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex-1], children[i], gridIndex);
241 mark &= !isHeavy(gridInstance, gridIndex-1, index);
246 unsigned long long toMove = 0;
247 for(
size_t i = 0; i < children.size(); ++i)
249 unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex-1], children[i], gridIndex);
250 if(isHeavy(gridInstance, gridIndex+1, index))
251 evaluateGrid(coreset, gridInstance, gridIndex-1, children[i]);
253 toMove += grids[gridInstance].subgrids[gridIndex-1].buckets.at(index).number;
255 VectorType& last = coreset[coreset.size()-1];
261 unsigned long long index = getCellIndex(grids[gridInstance].subgrids[gridIndex], cell, gridIndex);
263 VectorType point((1.0 / cf.
number) * cf.
LS);
265 coreset.push_back(point);
275 template<
typename VectorType,
typename Hash,
typename size_space>
278 if(grids[gridInstance].subgrids[gridIndex].buckets.count(index) > 0)
279 return grids[gridInstance].subgrids[gridIndex].buckets.at(index).number >=
280 delta * grids[gridInstance].opt / std::pow(2, gridIndex+1);
285 template<
typename VectorType,
typename Hash,
typename size_space>
291 template<
typename VectorType,
typename Hash,
typename size_space>
294 for(
size_t i = 0; i < cell.size(); ++i)
298 template<
typename VectorType,
typename Hash,
typename size_space>
301 std::vector<std::vector<size_space>> cells;
302 for(
size_t i = 0; i < cell.size(); ++i)
304 for(
size_t i = 0; i < std::pow(2, dimension); ++i)
307 std::vector<size_space> childCell(cell);
308 for(
size_t j = 0; j < childCell.size(); ++j)
309 childCell[j] += (inc & 1);
310 cells.push_back(childCell);
316 template<
typename VectorType,
typename Hash,
typename size_space>
319 unsigned long long index = 0;
320 unsigned long long power = 1;
321 for(
size_t d = 0; d < dimension; ++d)
323 size_space coord = cell[d];
324 for(
size_t l = 0; l < layer; ++l)
326 index += coord * power;
std::unique_ptr< WeightModifier< VectorType > > weightModifier
void insert(VectorType const &element)
Grid(unsigned long long cellsPerRow, unsigned long long numberOfCells, CarterWegman< unsigned long long, size_t > bucketHash, CarterWegman< unsigned long long, unsigned long long > sampleHash)
std::vector< std::vector< T > > proxysets
unsigned long long nFitting
std::unordered_map< size_t, CFEntry< VectorType > > buckets
VectorHash< size_t > vechash
Coreset algorithm for k-median clustering.
void shiftGrid(std::vector< size_space > &cell)
void toParentCell(std::vector< size_space > &cell)
CarterWegman< unsigned long long, unsigned long long > sampleHash
virtual size_space n() const =0
Number of discrete coordinates per dimension.
unsigned long long cellsPerRow
DistinctElementsPrimitive< unsigned long long, unsigned long long > distinctCells
void remove(VectorType const &element)
bool isHeavy(size_t gridInstance, size_t gridIndex, size_t index)
std::vector< GridInstance > grids
size_t number
Number of points contained in the feature.
unsigned long long numberOfCells
virtual ProxySolution< VectorType > * compute()
Runs the algorithm and returns the computed solution.
virtual void setWeight(T &, double)=0
Abstract base class to modify the weight of weighted objects.
Interface to extend a template type to provide discrete (bounded) space {0, ..., n-1}^d features...
std::vector< std::vector< size_space > > getChildCells(std::vector< size_space > cell)
std::vector< Grid > subgrids
virtual double getWeight(T &)=0
virtual size_t dimension() const =0
Space dimension.
Data structure for proxies.
unsigned int numOfCenters
bool evaluateGrid(std::vector< VectorType > &coreset, size_t gridInstance, size_t gridIndex, std::vector< size_space > cell)
virtual FrahlingSohler< VectorType, Hash, size_space > & operator<<(VectorType const &element)
Streaming operator.
unsigned long long getCellIndex(typename GridInstance::Grid const &grid, std::vector< size_space > &cell, size_t layer)
FrahlingSohler(DiscreteBoundedSpace< VectorType, size_space > *space, WeightModifier< VectorType > *weightModifier, std::function< Hash()> createHashfunction, double eps, double rho, unsigned int numOfCenters)
Abstract base class for streaming algorithms.
Clustering feature tree entry.
virtual std::vector< size_space > getCoordinates(V const &vector) const =0
Returns the coordinates of the given vector.
CarterWegman< unsigned long long, size_t > bucketHash
virtual VectorType origin() const =0
Returns the space's origin.
std::unique_ptr< DiscreteBoundedSpace< VectorType, size_space > > space