git.maemo.org Git - opencv/blob - 3rdparty/flann/algorithms/kdtree_index.h

   1 /***********************************************************************
   2  * Software License Agreement (BSD License)
   3  *
   4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
   5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
   6  *
   7  * THE BSD LICENSE
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29  *************************************************************************/
  30
  31 #ifndef KDTREE_H
  32 #define KDTREE_H
  33
  34 #include <algorithm>
  35 #include <map>
  36 #include <cassert>
  37
  38 #include "heap.h"
  39 #include "common.h"
  40 #include "constants.h"
  41 #include "allocator.h"
  42 #include "matrix.h"
  43 #include "result_set.h"
  44 #include "random.h"
  45 #include "nn_index.h"
  46 #include "saving.h"
  47
  48 using namespace std;
  49
  50
  51 namespace flann
  52 {
  53
  54
  55 /**
  56  * Randomized kd-tree index
  57  *
  58  * Contains the k-d trees and other information for indexing a set of points
  59  * for nearest-neighbor matching.
  60  */
  61 class KDTreeIndex : public NNIndex
  62 {
  63
  64         enum {
  65                 /**
  66                  * To improve efficiency, only SAMPLE_MEAN random values are used to
  67                  * compute the mean and variance at each level when building a tree.
  68                  * A value of 100 seems to perform as well as using all values.
  69                  */
  70                 SAMPLE_MEAN = 100,
  71                 /**
  72                  * Top random dimensions to consider
  73                  *
  74                  * When creating random trees, the dimension on which to subdivide is
  75                  * selected at random from among the top RAND_DIM dimensions with the
  76                  * highest variance.  A value of 5 works well.
  77                  */
  78                 RAND_DIM=5
  79         };
  80
  81
  82         /**
  83          * Number of randomized trees that are used
  84          */
  85         int numTrees;
  86
  87         /**
  88          *  Array of indices to vectors in the dataset.  When doing lookup,
  89          *  this is used instead to mark checkID.
  90          */
  91         int* vind;
  92
  93         /**
  94          * An unique ID for each lookup.
  95          */
  96         int checkID;
  97
  98         /**
  99          * The dataset used by this index
 100          */
 101         const Matrix<float> dataset;
 102
 103     int size_;
 104     int veclen_;
 105
 106
 107     float* mean;
 108     float* var;
 109
 110
 111         /*--------------------- Internal Data Structures --------------------------*/
 112
 113         /**
 114          * A node of the binary k-d tree.
 115          *
 116          *  This is   All nodes that have vec[divfeat] < divval are placed in the
 117          *   child1 subtree, else child2., A leaf node is indicated if both children are NULL.
 118          */
 119         struct TreeSt {
 120                 /**
 121                  * Index of the vector feature used for subdivision.
 122                  * If this is a leaf node (both children are NULL) then
 123                  * this holds vector index for this leaf.
 124                  */
 125                 int divfeat;
 126                 /**
 127                  * The value used for subdivision.
 128                  */
 129                 float divval;
 130                 /**
 131                  * The child nodes.
 132                  */
 133                 TreeSt *child1, *child2;
 134         };
 135         typedef TreeSt* Tree;
 136
 137     /**
 138      * Array of k-d trees used to find neighbors.
 139      */
 140     Tree* trees;
 141     typedef BranchStruct<Tree> BranchSt;
 142     typedef BranchSt* Branch;
 143     /**
 144      * Priority queue storing intermediate branches in the best-bin-first search
 145      */
 146     Heap<BranchSt>* heap;
 147
 148
 149         /**
 150          * Pooled memory allocator.
 151          *
 152          * Using a pooled memory allocator is more efficient
 153          * than allocating memory directly when there is a large
 154          * number small of memory allocations.
 155          */
 156         PooledAllocator pool;
 157
 158
 159
 160 public:
 161
 162     flann_algorithm_t getType() const
 163     {
 164         return KDTREE;
 165     }
 166
 167         /**
 168          * KDTree constructor
 169          *
 170          * Params:
 171          *              inputData = dataset with the input features
 172          *              params = parameters passed to the kdtree algorithm
 173          */
 174         KDTreeIndex(const Matrix<float>& inputData, const KDTreeIndexParams& params = KDTreeIndexParams() ) : dataset(inputData)
 175         {
 176         size_ = dataset.rows;
 177         veclen_ = dataset.cols;
 178
 179         numTrees = params.trees;
 180         trees = new Tree[numTrees];
 181
 182                 // get the parameters
 183 //        if (params.find("trees") != params.end()) {
 184 //              numTrees = (int)params["trees"];
 185 //              trees = new Tree[numTrees];
 186 //        }
 187 //        else {
 188 //              numTrees = -1;
 189 //              trees = NULL;
 190 //        }
 191                 heap = new Heap<BranchSt>(size_);
 192                 checkID = -1000;
 193
 194                 // Create a permutable array of indices to the input vectors.
 195                 vind = new int[size_];
 196                 for (int i = 0; i < size_; i++) {
 197                         vind[i] = i;
 198                 }
 199
 200         mean = new float[veclen_];
 201         var = new float[veclen_];
 202         }
 203
 204         /**
 205          * Standard destructor
 206          */
 207         ~KDTreeIndex()
 208         {
 209                 delete[] vind;
 210                 if (trees!=NULL) {
 211                         delete[] trees;
 212                 }
 213                 delete heap;
 214         delete[] mean;
 215         delete[] var;
 216         }
 217
 218
 219         /**
 220          * Builds the index
 221          */
 222         void buildIndex()
 223         {
 224                 /* Construct the randomized trees. */
 225                 for (int i = 0; i < numTrees; i++) {
 226                         /* Randomize the order of vectors to allow for unbiased sampling. */
 227                         for (int j = size_; j > 0; --j) {
 228 //                              int rand = cast(int) (drand48() * size);
 229                                 int rnd = rand_int(j);
 230                                 assert(rnd >=0 && rnd < size_);
 231                                 swap(vind[j-1], vind[rnd]);
 232                         }
 233                         trees[i] = NULL;
 234                         divideTree(&trees[i], 0, size_ - 1);
 235                 }
 236         }
 237
 238
 239
 240     void saveIndex(FILE* stream)
 241     {
 242         save_header(stream, *this);
 243         save_value(stream, numTrees);
 244         for (int i=0;i<numTrees;++i) {
 245                 save_tree(stream, trees[i]);
 246         }
 247     }
 248
 249
 250
 251     void loadIndex(FILE* stream)
 252     {
 253         IndexHeader header = load_header(stream);
 254
 255         if (header.rows!=size() || header.cols!=veclen()) {
 256                 throw FLANNException("The index saved belongs to a different dataset");
 257         }
 258         load_value(stream, numTrees);
 259
 260         if (trees!=NULL) {
 261                 delete[] trees;
 262         }
 263         trees = new Tree[numTrees];
 264         for (int i=0;i<numTrees;++i) {
 265                 load_tree(stream,trees[i]);
 266         }
 267     }
 268
 269
 270
 271     /**
 272     *  Returns size of index.
 273     */
 274     int size() const
 275     {
 276         return size_;
 277     }
 278
 279     /**
 280     * Returns the length of an index feature.
 281     */
 282     int veclen() const
 283     {
 284         return veclen_;
 285     }
 286
 287
 288         /**
 289          * Computes the inde memory usage
 290          * Returns: memory used by the index
 291          */
 292         int usedMemory() const
 293         {
 294                 return  pool.usedMemory+pool.wastedMemory+dataset.rows*sizeof(int);   // pool memory and vind array memory
 295         }
 296
 297
 298     /**
 299      * Find set of nearest neighbors to vec. Their indices are stored inside
 300      * the result object.
 301      *
 302      * Params:
 303      *     result = the result object in which the indices of the nearest-neighbors are stored
 304      *     vec = the vector for which to search the nearest neighbors
 305      *     maxCheck = the maximum number of restarts (in a best-bin-first manner)
 306      */
 307     void findNeighbors(ResultSet& result, const float* vec, const SearchParams& searchParams)
 308     {
 309         int maxChecks = searchParams.checks;
 310
 311         if (maxChecks<0) {
 312             getExactNeighbors(result, vec);
 313         } else {
 314             getNeighbors(result, vec, maxChecks);
 315         }
 316     }
 317
 318
 319         void continueSearch(ResultSet& result, float* vec, int maxCheck)
 320         {
 321                 BranchSt branch;
 322
 323                 int checkCount = 0;
 324
 325                 /* Keep searching other branches from heap until finished. */
 326                 while ( heap->popMin(branch) && (checkCount < maxCheck || !result.full() )) {
 327                         searchLevel(result, vec, branch.node,branch.mindistsq, checkCount, maxCheck);
 328                 }
 329
 330                 assert(result.full());
 331         }
 332
 333
 334 //    Params estimateSearchParams(float precision, Dataset<float>* testset = NULL)
 335 //    {
 336 //        Params params;
 337 //
 338 //        return params;
 339 //    }
 340
 341
 342 private:
 343
 344
 345     void save_tree(FILE* stream, Tree tree)
 346     {
 347         save_value(stream, *tree);
 348         if (tree->child1!=NULL) {
 349                 save_tree(stream, tree->child1);
 350         }
 351         if (tree->child2!=NULL) {
 352                 save_tree(stream, tree->child2);
 353         }
 354     }
 355
 356
 357     void load_tree(FILE* stream, Tree& tree)
 358     {
 359         tree = pool.allocate<TreeSt>();
 360         load_value(stream, *tree);
 361         if (tree->child1!=NULL) {
 362                 load_tree(stream, tree->child1);
 363         }
 364         if (tree->child2!=NULL) {
 365                 load_tree(stream, tree->child2);
 366         }
 367     }
 368
 369
 370         /**
 371          * Create a tree node that subdivides the list of vecs from vind[first]
 372          * to vind[last].  The routine is called recursively on each sublist.
 373          * Place a pointer to this new tree node in the location pTree.
 374          *
 375          * Params: pTree = the new node to create
 376          *                      first = index of the first vector
 377          *                      last = index of the last vector
 378          */
 379         void divideTree(Tree* pTree, int first, int last)
 380         {
 381                 Tree node;
 382
 383                 node = pool.allocate<TreeSt>(); // allocate memory
 384                 *pTree = node;
 385
 386                 /* If only one exemplar remains, then make this a leaf node. */
 387                 if (first == last) {
 388                         node->child1 = node->child2 = NULL;    /* Mark as leaf node. */
 389                         node->divfeat = vind[first];    /* Store index of this vec. */
 390                 } else {
 391                         chooseDivision(node, first, last);
 392                         subdivide(node, first, last);
 393                 }
 394         }
 395
 396
 397
 398
 399         /**
 400          * Choose which feature to use in order to subdivide this set of vectors.
 401          * Make a random choice among those with the highest variance, and use
 402          * its variance as the threshold value.
 403          */
 404         void chooseDivision(Tree node, int first, int last)
 405         {
 406         memset(mean,0,veclen_*sizeof(float));
 407         memset(var,0,veclen_*sizeof(float));
 408
 409                 /* Compute mean values.  Only the first SAMPLE_MEAN values need to be
 410                         sampled to get a good estimate.
 411                 */
 412                 int end = min(first + SAMPLE_MEAN, last);
 413                 int count = end - first + 1;
 414                 for (int j = first; j <= end; ++j) {
 415                         float* v = dataset[vind[j]];
 416             for (int k=0; k<veclen_; ++k) {
 417                 mean[k] += v[k];
 418             }
 419                 }
 420         for (int k=0; k<veclen_; ++k) {
 421             mean[k] /= count;
 422         }
 423
 424                 /* Compute variances (no need to divide by count). */
 425                 for (int j = first; j <= end; ++j) {
 426                         float* v = dataset[vind[j]];
 427             for (int k=0; k<veclen_; ++k) {
 428                 float dist = v[k] - mean[k];
 429                 var[k] += dist * dist;
 430             }
 431                 }
 432                 /* Select one of the highest variance indices at random. */
 433                 node->divfeat = selectDivision(var);
 434                 node->divval = mean[node->divfeat];
 435
 436         }
 437
 438
 439         /**
 440          * Select the top RAND_DIM largest values from v and return the index of
 441          * one of these selected at random.
 442          */
 443         int selectDivision(float* v)
 444         {
 445                 int num = 0;
 446                 int topind[RAND_DIM];
 447
 448                 /* Create a list of the indices of the top RAND_DIM values. */
 449                 for (int i = 0; i < veclen_; ++i) {
 450                         if (num < RAND_DIM  ||  v[i] > v[topind[num-1]]) {
 451                                 /* Put this element at end of topind. */
 452                                 if (num < RAND_DIM) {
 453                                         topind[num++] = i;            /* Add to list. */
 454                                 }
 455                                 else {
 456                                         topind[num-1] = i;         /* Replace last element. */
 457                                 }
 458                                 /* Bubble end value down to right location by repeated swapping. */
 459                                 int j = num - 1;
 460                                 while (j > 0  &&  v[topind[j]] > v[topind[j-1]]) {
 461                                         swap(topind[j], topind[j-1]);
 462                                         --j;
 463                                 }
 464                         }
 465                 }
 466                 /* Select a random integer in range [0,num-1], and return that index. */
 467 //              int rand = cast(int) (drand48() * num);
 468                 int rnd = rand_int(num);
 469                 assert(rnd >=0 && rnd < num);
 470                 return topind[rnd];
 471         }
 472
 473
 474         /**
 475          *  Subdivide the list of exemplars using the feature and division
 476          *  value given in this node.  Call divideTree recursively on each list.
 477         */
 478         void subdivide(Tree node, int first, int last)
 479         {
 480                 /* Move vector indices for left subtree to front of list. */
 481                 int i = first;
 482                 int j = last;
 483                 while (i <= j) {
 484                         int ind = vind[i];
 485                         float val = dataset[ind][node->divfeat];
 486                         if (val < node->divval) {
 487                                 ++i;
 488                         } else {
 489                                 /* Move to end of list by swapping vind i and j. */
 490                                 swap(vind[i], vind[j]);
 491                                 --j;
 492                         }
 493                 }
 494                 /* If either list is empty, it means we have hit the unlikely case
 495                         in which all remaining features are identical. Split in the middle
 496             to maintain a balanced tree.
 497                 */
 498                 if ( (i == first) || (i == last+1)) {
 499             i = (first+last+1)/2;
 500                 }
 501
 502                 divideTree(& node->child1, first, i - 1);
 503                 divideTree(& node->child2, i, last);
 504         }
 505
 506
 507
 508         /**
 509          * Performs an exact nearest neighbor search. The exact search performs a full
 510          * traversal of the tree.
 511          */
 512         void getExactNeighbors(ResultSet& result, const float* vec)
 513         {
 514                 checkID -= 1;  /* Set a different unique ID for each search. */
 515
 516                 if (numTrees > 1) {
 517             fprintf(stderr,"It doesn't make any sense to use more than one tree for exact search");
 518                 }
 519                 if (numTrees>0) {
 520                         searchLevelExact(result, vec, trees[0], 0.0);
 521                 }
 522                 assert(result.full());
 523         }
 524
 525         /**
 526          * Performs the approximate nearest-neighbor search. The search is approximate
 527          * because the tree traversal is abandoned after a given number of descends in
 528          * the tree.
 529          */
 530         void getNeighbors(ResultSet& result, const float* vec, int maxCheck)
 531         {
 532                 int i;
 533                 BranchSt branch;
 534
 535                 int checkCount = 0;
 536                 heap->clear();
 537                 checkID -= 1;  /* Set a different unique ID for each search. */
 538
 539                 /* Search once through each tree down to root. */
 540                 for (i = 0; i < numTrees; ++i) {
 541                         searchLevel(result, vec, trees[i], 0.0, checkCount, maxCheck);
 542                 }
 543
 544                 /* Keep searching other branches from heap until finished. */
 545                 while ( heap->popMin(branch) && (checkCount < maxCheck || !result.full() )) {
 546                         searchLevel(result, vec, branch.node,branch.mindistsq, checkCount, maxCheck);
 547                 }
 548
 549                 assert(result.full());
 550         }
 551
 552
 553         /**
 554          *  Search starting from a given node of the tree.  Based on any mismatches at
 555          *  higher levels, all exemplars below this level must have a distance of
 556          *  at least "mindistsq".
 557         */
 558         void searchLevel(ResultSet& result, const float* vec, Tree node, float mindistsq, int& checkCount, int maxCheck)
 559         {
 560                 if (result.worstDist()<mindistsq) {
 561 //                      printf("Ignoring branch, too far\n");
 562                         return;
 563                 }
 564
 565                 float val, diff;
 566                 Tree bestChild, otherChild;
 567
 568                 /* If this is a leaf node, then do check and return. */
 569                 if (node->child1 == NULL  &&  node->child2 == NULL) {
 570
 571                         /* Do not check same node more than once when searching multiple trees.
 572                                 Once a vector is checked, we set its location in vind to the
 573                                 current checkID.
 574                         */
 575                         if (vind[node->divfeat] == checkID || checkCount>=maxCheck) {
 576                                 if (result.full()) return;
 577                         }
 578             checkCount++;
 579                         vind[node->divfeat] = checkID;
 580
 581                         result.addPoint(dataset[node->divfeat],node->divfeat);
 582                         return;
 583                 }
 584
 585                 /* Which child branch should be taken first? */
 586                 val = vec[node->divfeat];
 587                 diff = val - node->divval;
 588                 bestChild = (diff < 0) ? node->child1 : node->child2;
 589                 otherChild = (diff < 0) ? node->child2 : node->child1;
 590
 591                 /* Create a branch record for the branch not taken.  Add distance
 592                         of this feature boundary (we don't attempt to correct for any
 593                         use of this feature in a parent node, which is unlikely to
 594                         happen and would have only a small effect).  Don't bother
 595                         adding more branches to heap after halfway point, as cost of
 596                         adding exceeds their value.
 597                 */
 598
 599                 float new_distsq = flann_dist(&val, &val+1, &node->divval, mindistsq);
 600 //              if (2 * checkCount < maxCheck  ||  !result.full()) {
 601                 if (new_distsq < result.worstDist() ||  !result.full()) {
 602                         heap->insert( BranchSt::make_branch(otherChild, new_distsq) );
 603                 }
 604
 605                 /* Call recursively to search next level down. */
 606                 searchLevel(result, vec, bestChild, mindistsq, checkCount, maxCheck);
 607         }
 608
 609         /**
 610          * Performs an exact search in the tree starting from a node.
 611          */
 612         void searchLevelExact(ResultSet& result, const float* vec, Tree node, float mindistsq)
 613         {
 614                 if (mindistsq>result.worstDist()) {
 615                         return;
 616                 }
 617
 618                 float val, diff;
 619                 Tree bestChild, otherChild;
 620
 621                 /* If this is a leaf node, then do check and return. */
 622                 if (node->child1 == NULL  &&  node->child2 == NULL) {
 623
 624                         /* Do not check same node more than once when searching multiple trees.
 625                                 Once a vector is checked, we set its location in vind to the
 626                                 current checkID.
 627                         */
 628                         if (vind[node->divfeat] == checkID)
 629                                 return;
 630                         vind[node->divfeat] = checkID;
 631
 632                         result.addPoint(dataset[node->divfeat],node->divfeat);
 633                         return;
 634                 }
 635
 636                 /* Which child branch should be taken first? */
 637                 val = vec[node->divfeat];
 638                 diff = val - node->divval;
 639                 bestChild = (diff < 0) ? node->child1 : node->child2;
 640                 otherChild = (diff < 0) ? node->child2 : node->child1;
 641
 642
 643                 /* Call recursively to search next level down. */
 644                 searchLevelExact(result, vec, bestChild, mindistsq);
 645                 float new_distsq = flann_dist(&val, &val+1, &node->divval, mindistsq);
 646                 searchLevelExact(result, vec, otherChild, new_distsq);
 647         }
 648
 649 };   // class KDTree
 650
 651 }
 652
 653 #endif //KDTREE_H