1 /***********************************************************************
2 * Software License Agreement (BSD License)
4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 *************************************************************************/
31 #ifndef AUTOTUNEDINDEX_H_
32 #define AUTOTUNEDINDEX_H_
34 #include "constants.h"
36 #include "ground_truth.h"
37 #include "index_testing.h"
42 class AutotunedIndex : public NNIndex
45 IndexParams* bestParams;
46 SearchParams bestSearchParams;
48 Matrix<float>* sampledDataset;
49 Matrix<float>* testDataset;
50 Matrix<int>* gt_matches;
55 * The dataset used by this index
57 const Matrix<float> dataset;
62 const AutotunedIndexParams& params;
65 * Number of features in the dataset.
70 * Length of each feature.
77 AutotunedIndex(const Matrix<float>& inputData, const AutotunedIndexParams& params_ = AutotunedIndexParams() ) :
78 dataset(inputData), params(params_)
81 veclen_ = dataset.cols;
87 virtual ~AutotunedIndex()
94 Method responsible with building the index.
96 virtual void buildIndex()
98 bestParams = estimateBuildParams();
99 bestIndex = bestParams->createIndex(dataset);
100 bestIndex->buildIndex();
101 speedup = estimateSearchParams(bestSearchParams);
105 Saves the index to a stream
107 virtual void saveIndex(FILE* stream)
109 bestIndex->saveIndex(stream);
113 Loads the index from a stream
115 virtual void loadIndex(FILE* stream)
117 bestIndex->loadIndex(stream);
121 Method that searches for nearest-neighbors
123 virtual void findNeighbors(ResultSet& result, const float* vec, const SearchParams& /*searchParams*/)
125 bestIndex->findNeighbors(result, vec, bestSearchParams);
129 Number of features in this index.
131 virtual int size() const
133 return bestIndex->size();
137 The length of each vector in this index.
139 virtual int veclen() const
141 return bestIndex->veclen();
145 The amount of memory (in bytes) this index uses.
147 virtual int usedMemory() const
149 return bestIndex->usedMemory();
155 virtual flann_algorithm_t getType() const
157 return bestIndex->getType();
161 Estimates the search parameters required in order to get a certain precision.
162 If testset is not given it uses cross-validation.
164 // virtual Params estimateSearchParams(float precision, Dataset<float>* testset = NULL)
173 float searchTimeCost;
180 typedef pair<CostData, KDTreeIndexParams> KDTreeCostData;
181 typedef pair<CostData, KMeansIndexParams> KMeansCostData;
184 void evaluate_kmeans(CostData& cost, const KMeansIndexParams& kmeans_params)
190 logger.info("KMeansTree using params: max_iterations=%d, branching=%d\n", kmeans_params.iterations, kmeans_params.branching);
191 KMeansIndex kmeans(*sampledDataset, kmeans_params);
192 // measure index build time
196 float buildTime = (float)t.value;
198 // measure search time
199 float searchTime = test_index_precision(kmeans, *sampledDataset, *testDataset, *gt_matches, params.target_precision, checks, nn);;
201 float datasetMemory = (float)(sampledDataset->rows*sampledDataset->cols*sizeof(float));
202 cost.memoryCost = (kmeans.usedMemory()+datasetMemory)/datasetMemory;
203 cost.searchTimeCost = searchTime;
204 cost.buildTimeCost = buildTime;
205 cost.timeCost = (buildTime*params.build_weight+searchTime);
206 logger.info("KMeansTree buildTime=%g, searchTime=%g, timeCost=%g, buildTimeFactor=%g\n",buildTime, searchTime, cost.timeCost, params.build_weight);
210 void evaluate_kdtree(CostData& cost, const KDTreeIndexParams& kdtree_params)
216 logger.info("KDTree using params: trees=%d\n",kdtree_params.trees);
217 KDTreeIndex kdtree(*sampledDataset, kdtree_params);
222 float buildTime = (float)t.value;
224 //measure search time
225 float searchTime = test_index_precision(kdtree, *sampledDataset, *testDataset, *gt_matches, params.target_precision, checks, nn);
227 float datasetMemory = (float)(sampledDataset->rows*sampledDataset->cols*sizeof(float));
228 cost.memoryCost = (kdtree.usedMemory()+datasetMemory)/datasetMemory;
229 cost.searchTimeCost = searchTime;
230 cost.buildTimeCost = buildTime;
231 cost.timeCost = (buildTime*params.build_weight+searchTime);
232 logger.info("KDTree buildTime=%g, searchTime=%g, timeCost=%g\n",buildTime, searchTime, cost.timeCost);
236 // struct KMeansSimpleDownhillFunctor {
238 // Autotune& autotuner;
239 // KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
241 // float operator()(int* params) {
243 // float maxFloat = numeric_limits<float>::max();
245 // if (params[0]<2) return maxFloat;
246 // if (params[1]<0) return maxFloat;
249 // c.params["algorithm"] = KMEANS;
250 // c.params["centers-init"] = CENTERS_RANDOM;
251 // c.params["branching"] = params[0];
252 // c.params["max-iterations"] = params[1];
254 // autotuner.evaluate_kmeans(c);
256 // return c.timeCost;
261 // struct KDTreeSimpleDownhillFunctor {
263 // Autotune& autotuner;
264 // KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
266 // float operator()(int* params) {
267 // float maxFloat = numeric_limits<float>::max();
269 // if (params[0]<1) return maxFloat;
272 // c.params["algorithm"] = KDTREE;
273 // c.params["trees"] = params[0];
275 // autotuner.evaluate_kdtree(c);
277 // return c.timeCost;
284 KMeansCostData optimizeKMeans()
286 logger.info("KMEANS, Step 1: Exploring parameter space\n");
288 // explore kmeans parameters space using combinations of the parameters below
289 int maxIterations[] = { 1, 5, 10, 15 };
290 int branchingFactors[] = { 16, 32, 64, 128, 256 };
292 int kmeansParamSpaceSize = ARRAY_LEN(maxIterations)*ARRAY_LEN(branchingFactors);
294 vector<KMeansCostData> kmeansCosts(kmeansParamSpaceSize);
296 // CostData* kmeansCosts = new CostData[kmeansParamSpaceSize];
298 // evaluate kmeans for all parameter combinations
300 for (size_t i=0; i<ARRAY_LEN(maxIterations); ++i) {
301 for (size_t j=0; j<ARRAY_LEN(branchingFactors); ++j) {
303 kmeansCosts[cnt].second.centers_init = CENTERS_RANDOM;
304 kmeansCosts[cnt].second.branching = branchingFactors[j];
305 kmeansCosts[cnt].second.iterations = maxIterations[j];
307 evaluate_kmeans(kmeansCosts[cnt].first, kmeansCosts[cnt].second);
310 // order by time cost
311 while (k>0 && kmeansCosts[k].first.timeCost < kmeansCosts[k-1].first.timeCost) {
312 swap(kmeansCosts[k],kmeansCosts[k-1]);
319 // logger.info("KMEANS, Step 2: simplex-downhill optimization\n");
322 // // choose initial simplex points as the best parameters so far
323 // int kmeansNMPoints[n*(n+1)];
324 // float kmeansVals[n+1];
325 // for (int i=0;i<n+1;++i) {
326 // kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
327 // kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
328 // kmeansVals[i] = kmeansCosts[i].timeCost;
330 // KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
331 // // run optimization
332 // optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
334 // for (int i=0;i<n+1;++i) {
335 // kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
336 // kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
337 // kmeansCosts[i].timeCost = kmeansVals[i];
340 float optTimeCost = kmeansCosts[0].first.timeCost;
341 // recompute total costs factoring in the memory costs
342 for (int i=0;i<kmeansParamSpaceSize;++i) {
343 kmeansCosts[i].first.totalCost = (kmeansCosts[i].first.timeCost/optTimeCost + params.memory_weight * kmeansCosts[i].first.memoryCost);
346 while (k>0 && kmeansCosts[k].first.totalCost < kmeansCosts[k-1].first.totalCost) {
347 swap(kmeansCosts[k],kmeansCosts[k-1]);
351 // display the costs obtained
352 for (int i=0;i<kmeansParamSpaceSize;++i) {
353 logger.info("KMeans, branching=%d, iterations=%d, time_cost=%g[%g] (build=%g, search=%g), memory_cost=%g, cost=%g\n",
354 kmeansCosts[i].second.branching, kmeansCosts[i].second.iterations,
355 kmeansCosts[i].first.timeCost,kmeansCosts[i].first.timeCost/optTimeCost,
356 kmeansCosts[i].first.buildTimeCost, kmeansCosts[i].first.searchTimeCost,
357 kmeansCosts[i].first.memoryCost,kmeansCosts[i].first.totalCost);
360 return kmeansCosts[0];
364 KDTreeCostData optimizeKDTree()
367 logger.info("KD-TREE, Step 1: Exploring parameter space\n");
369 // explore kd-tree parameters space using the parameters below
370 int testTrees[] = { 1, 4, 8, 16, 32 };
372 size_t kdtreeParamSpaceSize = ARRAY_LEN(testTrees);
373 vector<KDTreeCostData> kdtreeCosts(kdtreeParamSpaceSize);
375 // evaluate kdtree for all parameter combinations
377 for (size_t i=0; i<ARRAY_LEN(testTrees); ++i) {
378 kdtreeCosts[cnt].second.trees = testTrees[i];
380 evaluate_kdtree(kdtreeCosts[cnt].first, kdtreeCosts[cnt].second);
383 // order by time cost
384 while (k>0 && kdtreeCosts[k].first.timeCost < kdtreeCosts[k-1].first.timeCost) {
385 swap(kdtreeCosts[k],kdtreeCosts[k-1]);
391 // logger.info("KD-TREE, Step 2: simplex-downhill optimization\n");
394 // // choose initial simplex points as the best parameters so far
395 // int kdtreeNMPoints[n*(n+1)];
396 // float kdtreeVals[n+1];
397 // for (int i=0;i<n+1;++i) {
398 // kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
399 // kdtreeVals[i] = kdtreeCosts[i].timeCost;
401 // KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
402 // // run optimization
403 // optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
405 // for (int i=0;i<n+1;++i) {
406 // kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
407 // kdtreeCosts[i].timeCost = kdtreeVals[i];
410 float optTimeCost = kdtreeCosts[0].first.timeCost;
411 // recompute costs for kd-tree factoring in memory cost
412 for (size_t i=0;i<kdtreeParamSpaceSize;++i) {
413 kdtreeCosts[i].first.totalCost = (kdtreeCosts[i].first.timeCost/optTimeCost + params.memory_weight * kdtreeCosts[i].first.memoryCost);
416 while (k>0 && kdtreeCosts[k].first.totalCost < kdtreeCosts[k-1].first.totalCost) {
417 swap(kdtreeCosts[k],kdtreeCosts[k-1]);
421 // display costs obtained
422 for (size_t i=0;i<kdtreeParamSpaceSize;++i) {
423 logger.info("kd-tree, trees=%d, time_cost=%g[%g] (build=%g, search=%g), memory_cost=%g, cost=%g\n",
424 kdtreeCosts[i].second.trees,kdtreeCosts[i].first.timeCost,kdtreeCosts[i].first.timeCost/optTimeCost,
425 kdtreeCosts[i].first.buildTimeCost, kdtreeCosts[i].first.searchTimeCost,
426 kdtreeCosts[i].first.memoryCost,kdtreeCosts[i].first.totalCost);
429 return kdtreeCosts[0];
433 Chooses the best nearest-neighbor algorithm and estimates the optimal
434 parameters to use when building the index (for a given precision).
435 Returns a dictionary with the optimal parameters.
437 IndexParams* estimateBuildParams()
439 int sampleSize = int(params.sample_fraction*dataset.rows);
440 int testSampleSize = min(sampleSize/10, 1000);
442 logger.info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d\n",dataset.rows, sampleSize, testSampleSize);
444 // For a very small dataset, it makes no sense to build any fancy index, just
446 if (testSampleSize<10) {
447 logger.info("Choosing linear, dataset too small\n");
448 return new LinearIndexParams();
451 // We use a fraction of the original dataset to speedup the autotune algorithm
452 sampledDataset = dataset.sample(sampleSize);
453 // We use a cross-validation approach, first we sample a testset from the dataset
454 testDataset = sampledDataset->sample(testSampleSize,true);
456 // We compute the ground truth using linear search
457 logger.info("Computing ground truth... \n");
458 gt_matches = new Matrix<int>(testDataset->rows, 1);
461 compute_ground_truth(*sampledDataset, *testDataset, *gt_matches, 0);
463 float bestCost = (float)t.value;
464 IndexParams* bestParams = new LinearIndexParams();
466 // Start parameter autotune process
467 logger.info("Autotuning parameters...\n");
470 KMeansCostData kmeansCost = optimizeKMeans();
472 if (kmeansCost.first.totalCost<bestCost) {
473 bestParams = new KMeansIndexParams(kmeansCost.second);
474 bestCost = kmeansCost.first.totalCost;
477 KDTreeCostData kdtreeCost = optimizeKDTree();
479 if (kdtreeCost.first.totalCost<bestCost) {
480 bestParams = new KDTreeIndexParams(kdtreeCost.second);
481 bestCost = kdtreeCost.first.totalCost;
484 // free the memory used by the datasets we sampled
485 delete sampledDataset;
495 Estimates the search time parameters needed to get the desired precision.
496 Precondition: the index is built
497 Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search.
499 float estimateSearchParams(SearchParams& searchParams)
502 const long SAMPLE_COUNT = 1000;
504 assert(bestIndex!=NULL); // must have a valid index
508 int samples = min(dataset.rows/10, SAMPLE_COUNT);
510 Matrix<float>* testDataset = dataset.sample(samples);
512 logger.info("Computing ground truth\n");
514 // we need to compute the ground truth first
515 Matrix<int> gt_matches(testDataset->rows,1);
518 compute_ground_truth(dataset, *testDataset, gt_matches,1);
520 float linear = (float)t.value;
523 logger.info("Estimating number of checks\n");
527 if (bestIndex->getType() == KMEANS) {
529 logger.info("KMeans algorithm, estimating cluster border factor\n");
530 KMeansIndex* kmeans = (KMeansIndex*)bestIndex;
531 float bestSearchTime = -1;
532 float best_cb_index = -1;
533 int best_checks = -1;
534 for (cb_index = 0;cb_index<1.1f; cb_index+=0.2f) {
535 kmeans->set_cb_index(cb_index);
536 searchTime = test_index_precision(*kmeans, dataset, *testDataset, gt_matches, params.target_precision, checks, nn, 1);
537 if (searchTime<bestSearchTime || bestSearchTime == -1) {
538 bestSearchTime = searchTime;
539 best_cb_index = cb_index;
540 best_checks = checks;
543 searchTime = bestSearchTime;
544 cb_index = best_cb_index;
545 checks = best_checks;
547 kmeans->set_cb_index(best_cb_index);
548 logger.info("Optimum cb_index: %g\n",cb_index);
549 ((KMeansIndexParams*)bestParams)->cb_index = cb_index;
552 searchTime = test_index_precision(*bestIndex, dataset, *testDataset, gt_matches, params.target_precision, checks, nn, 1);
555 logger.info("Required number of checks: %d \n",checks);;
556 searchParams.checks = checks;
559 speedup = linear/searchTime;
571 #endif /* AUTOTUNEDINDEX_H_ */