Update to 2.0.0 tree from current Fremantle build
[opencv] / 3rdparty / flann / algorithms / autotuned_index.h
1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30
31 #ifndef AUTOTUNEDINDEX_H_
32 #define AUTOTUNEDINDEX_H_
33
34 #include "constants.h"
35 #include "nn_index.h"
36 #include "ground_truth.h"
37 #include "index_testing.h"
38
39 namespace flann
40 {
41
42 class AutotunedIndex : public NNIndex
43 {
44         NNIndex* bestIndex;
45         IndexParams* bestParams;
46         SearchParams bestSearchParams;
47
48     Matrix<float>* sampledDataset;
49     Matrix<float>* testDataset;
50     Matrix<int>* gt_matches;
51
52     float speedup;
53
54         /**
55          * The dataset used by this index
56          */
57     const Matrix<float> dataset;
58
59     /**
60      * Index parameters
61      */
62     const AutotunedIndexParams& params;
63
64     /**
65     * Number of features in the dataset.
66     */
67     int size_;
68
69     /**
70     * Length of each feature.
71     */
72     int veclen_;
73
74
75 public:
76
77     AutotunedIndex(const Matrix<float>& inputData, const AutotunedIndexParams& params_ = AutotunedIndexParams() ) :
78         dataset(inputData), params(params_)
79         {
80         size_ = dataset.rows;
81         veclen_ = dataset.cols;
82
83         bestIndex = NULL;
84
85         }
86
87     virtual ~AutotunedIndex()
88     {
89         delete bestIndex;
90         delete bestParams;
91     };
92
93         /**
94                 Method responsible with building the index.
95         */
96         virtual void buildIndex()
97         {
98                 bestParams = estimateBuildParams();
99                 bestIndex = bestParams->createIndex(dataset);
100                 bestIndex->buildIndex();
101                 speedup = estimateSearchParams(bestSearchParams);
102         }
103
104     /**
105         Saves the index to a stream
106     */
107     virtual void saveIndex(FILE* stream)
108     {
109         bestIndex->saveIndex(stream);
110     }
111
112     /**
113         Loads the index from a stream
114     */
115     virtual void loadIndex(FILE* stream)
116     {
117         bestIndex->loadIndex(stream);
118     }
119
120         /**
121                 Method that searches for nearest-neighbors
122         */
123         virtual void findNeighbors(ResultSet& result, const float* vec, const SearchParams& /*searchParams*/)
124         {
125                 bestIndex->findNeighbors(result, vec, bestSearchParams);
126         }
127
128         /**
129                 Number of features in this index.
130         */
131         virtual int size() const
132         {
133                 return bestIndex->size();
134         }
135
136         /**
137                 The length of each vector in this index.
138         */
139         virtual int veclen() const
140         {
141                 return bestIndex->veclen();
142         }
143
144         /**
145          The amount of memory (in bytes) this index uses.
146         */
147         virtual int usedMemory() const
148         {
149                 return bestIndex->usedMemory();
150         }
151
152     /**
153     * Algorithm name
154     */
155     virtual flann_algorithm_t getType() const
156     {
157         return bestIndex->getType();
158     }
159
160     /**
161       Estimates the search parameters required in order to get a certain precision.
162       If testset is not given it uses cross-validation.
163     */
164 //    virtual Params estimateSearchParams(float precision, Dataset<float>* testset = NULL)
165 //    {
166 //        Params params;
167 //
168 //        return params;
169 //    }
170 private:
171
172     struct CostData {
173         float searchTimeCost;
174         float buildTimeCost;
175         float timeCost;
176         float memoryCost;
177         float totalCost;
178     };
179
180     typedef pair<CostData, KDTreeIndexParams> KDTreeCostData;
181     typedef pair<CostData, KMeansIndexParams> KMeansCostData;
182
183
184     void evaluate_kmeans(CostData& cost, const KMeansIndexParams& kmeans_params)
185     {
186         StartStopTimer t;
187         int checks;
188         const int nn = 1;
189
190         logger.info("KMeansTree using params: max_iterations=%d, branching=%d\n", kmeans_params.iterations, kmeans_params.branching);
191         KMeansIndex kmeans(*sampledDataset, kmeans_params);
192         // measure index build time
193         t.start();
194         kmeans.buildIndex();
195         t.stop();
196         float buildTime = (float)t.value;
197
198         // measure search time
199         float searchTime = test_index_precision(kmeans, *sampledDataset, *testDataset, *gt_matches, params.target_precision, checks, nn);;
200
201         float datasetMemory = (float)(sampledDataset->rows*sampledDataset->cols*sizeof(float));
202         cost.memoryCost = (kmeans.usedMemory()+datasetMemory)/datasetMemory;
203         cost.searchTimeCost = searchTime;
204         cost.buildTimeCost = buildTime;
205         cost.timeCost = (buildTime*params.build_weight+searchTime);
206         logger.info("KMeansTree buildTime=%g, searchTime=%g, timeCost=%g, buildTimeFactor=%g\n",buildTime, searchTime, cost.timeCost, params.build_weight);
207     }
208
209
210      void evaluate_kdtree(CostData& cost, const KDTreeIndexParams& kdtree_params)
211     {
212         StartStopTimer t;
213         int checks;
214         const int nn = 1;
215
216         logger.info("KDTree using params: trees=%d\n",kdtree_params.trees);
217         KDTreeIndex kdtree(*sampledDataset, kdtree_params);
218
219         t.start();
220         kdtree.buildIndex();
221         t.stop();
222         float buildTime = (float)t.value;
223
224         //measure search time
225         float searchTime = test_index_precision(kdtree, *sampledDataset, *testDataset, *gt_matches, params.target_precision, checks, nn);
226
227         float datasetMemory = (float)(sampledDataset->rows*sampledDataset->cols*sizeof(float));
228         cost.memoryCost = (kdtree.usedMemory()+datasetMemory)/datasetMemory;
229         cost.searchTimeCost = searchTime;
230         cost.buildTimeCost = buildTime;
231         cost.timeCost = (buildTime*params.build_weight+searchTime);
232         logger.info("KDTree buildTime=%g, searchTime=%g, timeCost=%g\n",buildTime, searchTime, cost.timeCost);
233     }
234
235
236 //    struct KMeansSimpleDownhillFunctor {
237 //
238 //        Autotune& autotuner;
239 //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
240 //
241 //        float operator()(int* params) {
242 //
243 //            float maxFloat = numeric_limits<float>::max();
244 //
245 //            if (params[0]<2) return maxFloat;
246 //            if (params[1]<0) return maxFloat;
247 //
248 //            CostData c;
249 //            c.params["algorithm"] = KMEANS;
250 //            c.params["centers-init"] = CENTERS_RANDOM;
251 //            c.params["branching"] = params[0];
252 //            c.params["max-iterations"] = params[1];
253 //
254 //            autotuner.evaluate_kmeans(c);
255 //
256 //            return c.timeCost;
257 //
258 //        }
259 //    };
260 //
261 //    struct KDTreeSimpleDownhillFunctor {
262 //
263 //        Autotune& autotuner;
264 //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {};
265 //
266 //        float operator()(int* params) {
267 //            float maxFloat = numeric_limits<float>::max();
268 //
269 //            if (params[0]<1) return maxFloat;
270 //
271 //            CostData c;
272 //            c.params["algorithm"] = KDTREE;
273 //            c.params["trees"] = params[0];
274 //
275 //            autotuner.evaluate_kdtree(c);
276 //
277 //            return c.timeCost;
278 //
279 //        }
280 //    };
281
282
283
284     KMeansCostData optimizeKMeans()
285     {
286         logger.info("KMEANS, Step 1: Exploring parameter space\n");
287
288         // explore kmeans parameters space using combinations of the parameters below
289         int maxIterations[] = { 1, 5, 10, 15 };
290         int branchingFactors[] = { 16, 32, 64, 128, 256 };
291
292         int kmeansParamSpaceSize = ARRAY_LEN(maxIterations)*ARRAY_LEN(branchingFactors);
293
294         vector<KMeansCostData> kmeansCosts(kmeansParamSpaceSize);
295
296 //        CostData* kmeansCosts = new CostData[kmeansParamSpaceSize];
297
298         // evaluate kmeans for all parameter combinations
299         int cnt = 0;
300         for (size_t i=0; i<ARRAY_LEN(maxIterations); ++i) {
301             for (size_t j=0; j<ARRAY_LEN(branchingFactors); ++j) {
302
303                 kmeansCosts[cnt].second.centers_init = CENTERS_RANDOM;
304                 kmeansCosts[cnt].second.branching = branchingFactors[j];
305                 kmeansCosts[cnt].second.iterations = maxIterations[j];
306
307                 evaluate_kmeans(kmeansCosts[cnt].first, kmeansCosts[cnt].second);
308
309                 int k = cnt;
310                 // order by time cost
311                 while (k>0 && kmeansCosts[k].first.timeCost < kmeansCosts[k-1].first.timeCost) {
312                     swap(kmeansCosts[k],kmeansCosts[k-1]);
313                     --k;
314                 }
315                 ++cnt;
316             }
317         }
318
319 //         logger.info("KMEANS, Step 2: simplex-downhill optimization\n");
320 //
321 //         const int n = 2;
322 //         // choose initial simplex points as the best parameters so far
323 //         int kmeansNMPoints[n*(n+1)];
324 //         float kmeansVals[n+1];
325 //         for (int i=0;i<n+1;++i) {
326 //             kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
327 //             kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
328 //             kmeansVals[i] = kmeansCosts[i].timeCost;
329 //         }
330 //         KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
331 //         // run optimization
332 //         optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
333 //         // store results
334 //         for (int i=0;i<n+1;++i) {
335 //             kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
336 //             kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
337 //             kmeansCosts[i].timeCost = kmeansVals[i];
338 //         }
339
340         float optTimeCost = kmeansCosts[0].first.timeCost;
341         // recompute total costs factoring in the memory costs
342         for (int i=0;i<kmeansParamSpaceSize;++i) {
343             kmeansCosts[i].first.totalCost = (kmeansCosts[i].first.timeCost/optTimeCost + params.memory_weight * kmeansCosts[i].first.memoryCost);
344
345             int k = i;
346             while (k>0 && kmeansCosts[k].first.totalCost < kmeansCosts[k-1].first.totalCost) {
347                 swap(kmeansCosts[k],kmeansCosts[k-1]);
348                 k--;
349             }
350         }
351         // display the costs obtained
352         for (int i=0;i<kmeansParamSpaceSize;++i) {
353             logger.info("KMeans, branching=%d, iterations=%d, time_cost=%g[%g] (build=%g, search=%g), memory_cost=%g, cost=%g\n",
354                 kmeansCosts[i].second.branching, kmeansCosts[i].second.iterations,
355             kmeansCosts[i].first.timeCost,kmeansCosts[i].first.timeCost/optTimeCost,
356             kmeansCosts[i].first.buildTimeCost, kmeansCosts[i].first.searchTimeCost,
357             kmeansCosts[i].first.memoryCost,kmeansCosts[i].first.totalCost);
358         }
359
360         return kmeansCosts[0];
361     }
362
363
364     KDTreeCostData optimizeKDTree()
365     {
366
367         logger.info("KD-TREE, Step 1: Exploring parameter space\n");
368
369         // explore kd-tree parameters space using the parameters below
370         int testTrees[] = { 1, 4, 8, 16, 32 };
371
372         size_t kdtreeParamSpaceSize = ARRAY_LEN(testTrees);
373         vector<KDTreeCostData> kdtreeCosts(kdtreeParamSpaceSize);
374
375         // evaluate kdtree for all parameter combinations
376         int cnt = 0;
377         for (size_t i=0; i<ARRAY_LEN(testTrees); ++i) {
378                 kdtreeCosts[cnt].second.trees = testTrees[i];
379
380             evaluate_kdtree(kdtreeCosts[cnt].first, kdtreeCosts[cnt].second);
381
382             int k = cnt;
383             // order by time cost
384             while (k>0 && kdtreeCosts[k].first.timeCost < kdtreeCosts[k-1].first.timeCost) {
385                 swap(kdtreeCosts[k],kdtreeCosts[k-1]);
386                 --k;
387             }
388             ++cnt;
389         }
390
391 //         logger.info("KD-TREE, Step 2: simplex-downhill optimization\n");
392 //
393 //         const int n = 1;
394 //         // choose initial simplex points as the best parameters so far
395 //         int kdtreeNMPoints[n*(n+1)];
396 //         float kdtreeVals[n+1];
397 //         for (int i=0;i<n+1;++i) {
398 //             kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
399 //             kdtreeVals[i] = kdtreeCosts[i].timeCost;
400 //         }
401 //         KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
402 //         // run optimization
403 //         optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
404 //         // store results
405 //         for (int i=0;i<n+1;++i) {
406 //             kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
407 //             kdtreeCosts[i].timeCost = kdtreeVals[i];
408 //         }
409
410         float optTimeCost = kdtreeCosts[0].first.timeCost;
411         // recompute costs for kd-tree factoring in memory cost
412         for (size_t i=0;i<kdtreeParamSpaceSize;++i) {
413             kdtreeCosts[i].first.totalCost = (kdtreeCosts[i].first.timeCost/optTimeCost + params.memory_weight * kdtreeCosts[i].first.memoryCost);
414
415             int k = i;
416             while (k>0 && kdtreeCosts[k].first.totalCost < kdtreeCosts[k-1].first.totalCost) {
417                 swap(kdtreeCosts[k],kdtreeCosts[k-1]);
418                 k--;
419             }
420         }
421         // display costs obtained
422         for (size_t i=0;i<kdtreeParamSpaceSize;++i) {
423             logger.info("kd-tree, trees=%d, time_cost=%g[%g] (build=%g, search=%g), memory_cost=%g, cost=%g\n",
424             kdtreeCosts[i].second.trees,kdtreeCosts[i].first.timeCost,kdtreeCosts[i].first.timeCost/optTimeCost,
425             kdtreeCosts[i].first.buildTimeCost, kdtreeCosts[i].first.searchTimeCost,
426             kdtreeCosts[i].first.memoryCost,kdtreeCosts[i].first.totalCost);
427         }
428
429         return kdtreeCosts[0];
430     }
431
432     /**
433         Chooses the best nearest-neighbor algorithm and estimates the optimal
434         parameters to use when building the index (for a given precision).
435         Returns a dictionary with the optimal parameters.
436     */
437     IndexParams* estimateBuildParams()
438     {
439         int sampleSize = int(params.sample_fraction*dataset.rows);
440         int testSampleSize = min(sampleSize/10, 1000);
441
442         logger.info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d\n",dataset.rows, sampleSize, testSampleSize);
443
444         // For a very small dataset, it makes no sense to build any fancy index, just
445         // use linear search
446         if (testSampleSize<10) {
447             logger.info("Choosing linear, dataset too small\n");
448             return new LinearIndexParams();
449         }
450
451         // We use a fraction of the original dataset to speedup the autotune algorithm
452         sampledDataset = dataset.sample(sampleSize);
453         // We use a cross-validation approach, first we sample a testset from the dataset
454         testDataset = sampledDataset->sample(testSampleSize,true);
455
456         // We compute the ground truth using linear search
457         logger.info("Computing ground truth... \n");
458         gt_matches = new Matrix<int>(testDataset->rows, 1);
459         StartStopTimer t;
460         t.start();
461         compute_ground_truth(*sampledDataset, *testDataset, *gt_matches, 0);
462         t.stop();
463         float bestCost = (float)t.value;
464         IndexParams* bestParams = new LinearIndexParams();
465
466         // Start parameter autotune process
467         logger.info("Autotuning parameters...\n");
468
469
470         KMeansCostData kmeansCost = optimizeKMeans();
471
472         if (kmeansCost.first.totalCost<bestCost) {
473             bestParams = new KMeansIndexParams(kmeansCost.second);
474             bestCost = kmeansCost.first.totalCost;
475         }
476
477         KDTreeCostData kdtreeCost = optimizeKDTree();
478
479         if (kdtreeCost.first.totalCost<bestCost) {
480             bestParams = new KDTreeIndexParams(kdtreeCost.second);
481             bestCost = kdtreeCost.first.totalCost;
482         }
483
484         // free the memory used by the datasets we sampled
485         delete sampledDataset;
486         delete testDataset;
487         delete gt_matches;
488
489         return bestParams;
490     }
491
492
493
494     /**
495         Estimates the search time parameters needed to get the desired precision.
496         Precondition: the index is built
497         Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search.
498     */
499     float estimateSearchParams(SearchParams& searchParams)
500     {
501         const int nn = 1;
502         const long SAMPLE_COUNT = 1000;
503
504         assert(bestIndex!=NULL);   // must have a valid index
505
506         float speedup = 0;
507
508         int samples = min(dataset.rows/10, SAMPLE_COUNT);
509         if (samples>0) {
510             Matrix<float>* testDataset = dataset.sample(samples);
511
512             logger.info("Computing ground truth\n");
513
514             // we need to compute the ground truth first
515             Matrix<int> gt_matches(testDataset->rows,1);
516             StartStopTimer t;
517             t.start();
518             compute_ground_truth(dataset, *testDataset, gt_matches,1);
519             t.stop();
520             float linear = (float)t.value;
521
522             int checks;
523             logger.info("Estimating number of checks\n");
524
525             float searchTime;
526             float cb_index;
527             if (bestIndex->getType() == KMEANS) {
528
529                 logger.info("KMeans algorithm, estimating cluster border factor\n");
530                 KMeansIndex* kmeans = (KMeansIndex*)bestIndex;
531                 float bestSearchTime = -1;
532                 float best_cb_index = -1;
533                 int best_checks = -1;
534                 for (cb_index = 0;cb_index<1.1f; cb_index+=0.2f) {
535                     kmeans->set_cb_index(cb_index);
536                     searchTime = test_index_precision(*kmeans, dataset, *testDataset, gt_matches, params.target_precision, checks, nn, 1);
537                     if (searchTime<bestSearchTime || bestSearchTime == -1) {
538                         bestSearchTime = searchTime;
539                         best_cb_index = cb_index;
540                         best_checks = checks;
541                     }
542                 }
543                 searchTime = bestSearchTime;
544                 cb_index = best_cb_index;
545                 checks = best_checks;
546
547                 kmeans->set_cb_index(best_cb_index);
548                 logger.info("Optimum cb_index: %g\n",cb_index);
549                 ((KMeansIndexParams*)bestParams)->cb_index = cb_index;
550             }
551             else {
552                 searchTime = test_index_precision(*bestIndex, dataset, *testDataset, gt_matches, params.target_precision, checks, nn, 1);
553             }
554
555             logger.info("Required number of checks: %d \n",checks);;
556             searchParams.checks = checks;
557             delete testDataset;
558
559             speedup = linear/searchTime;
560         }
561
562         return speedup;
563     }
564
565
566
567 };
568
569 }
570
571 #endif /* AUTOTUNEDINDEX_H_ */