1 /***************************************
2 $Header: /home/amb/routino/src/RCS/sorting.c,v 1.11 2010/09/25 13:54:18 amb Exp $
6 Part of the Routino routing software.
7 ******************/ /******************
8 This file Copyright 2009-2010 Andrew M. Bishop
10 This program is free software: you can redistribute it and/or modify
11 it under the terms of the GNU Affero General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU Affero General Public License for more details.
20 You should have received a copy of the GNU Affero General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 ***************************************/
31 #include "functions.h"
36 /*+ The command line '--tmpdir' option or its default value. +*/
37 extern char *option_tmpdirname;
39 /*+ The amount of RAM to use for filesorting. +*/
40 extern size_t option_filesort_ramsize;
43 /*++++++++++++++++++++++++++++++++++++++
44 A function to sort the contents of a file of fixed length objects using a
45 limited amount of RAM.
47 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
48 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
49 The individual sort steps and the merge step both use a "Heap sort"
50 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
51 if the data is already partially sorted.
53 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
55 int fd_out The file descriptor of the output file (opened for writing and empty).
57 size_t itemsize The size of each item in the file that needs sorting.
59 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
60 data to be sorted is an array of things not pointers).
62 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
63 returns 1 then it is written to the output file.
64 ++++++++++++++++++++++++++++++++++++++*/
66 void filesort_fixed(int fd_in,int fd_out,size_t itemsize,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
68 int *fds=NULL,*heap=NULL;
70 index_t count=0,total=0;
71 size_t nitems=option_filesort_ramsize/(itemsize+sizeof(void*));
72 void *data=NULL,**datap=NULL;
76 /* Allocate the RAM buffer and other bits */
78 data=malloc(nitems*itemsize);
79 datap=malloc(nitems*sizeof(void*));
81 filename=(char*)malloc(strlen(option_tmpdirname)+24);
83 /* Loop around, fill the buffer, sort the data and write a temporary file */
89 /* Read in the data and create pointers */
93 datap[i]=data+i*itemsize;
95 if(ReadFile(fd_in,datap[i],itemsize))
109 /* Sort the data pointers using a heap sort */
111 filesort_heapsort(datap,n,compare);
113 /* Shortcut if all read in and sorted at once */
115 if(nfiles==0 && !more)
119 if(!buildindex || buildindex(datap[i],count))
121 WriteFile(fd_out,datap[i],itemsize);
129 /* Create a temporary file and write the result */
131 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
133 fd=OpenFileNew(filename);
136 WriteFile(fd,datap[i],itemsize);
144 /* Shortcut if only one file (unlucky for us there must have been exactly
145 nitems, lucky for us we still have the data in RAM) */
149 for(i=0;i<nitems;i++)
151 if(!buildindex || buildindex(datap[i],count))
153 WriteFile(fd_out,datap[i],itemsize);
158 DeleteFile(filename);
163 /* Check that number of files is less than file size */
165 assert(nfiles<nitems);
167 /* Open all of the temporary files */
169 fds=(int*)malloc(nfiles*sizeof(int));
171 for(i=0;i<nfiles;i++)
173 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
175 fds[i]=ReOpenFile(filename);
177 DeleteFile(filename);
180 /* Perform an n-way merge using a binary heap */
182 heap=(int*)malloc(nfiles*sizeof(int));
184 /* Fill the heap to start with */
186 for(i=0;i<nfiles;i++)
190 datap[i]=data+i*itemsize;
192 ReadFile(fds[i],datap[i],itemsize);
198 /* Bubble up the new value */
201 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
206 newindex=(index-1)/2;
209 heap[index]=heap[newindex];
216 /* Repeatedly pull out the root of the heap and refill from the same file */
224 if(!buildindex || buildindex(datap[heap[0]],count))
226 WriteFile(fd_out,datap[heap[0]],itemsize);
230 if(ReadFile(fds[heap[0]],datap[heap[0]],itemsize))
236 /* Bubble down the new value */
238 while((2*index+2)<ndata &&
239 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
240 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
245 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
251 heap[newindex]=heap[index];
257 if((2*index+2)==ndata &&
258 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
266 heap[newindex]=heap[index];
278 for(i=0;i<nfiles;i++)
292 /*++++++++++++++++++++++++++++++++++++++
293 A function to sort the contents of a file of variable length objects (each
294 preceded by its length in 2 bytes) using a limited amount of RAM.
296 The data is sorted using a "Merge sort" http://en.wikipedia.org/wiki/Merge_sort
297 and in particular an "external sort" http://en.wikipedia.org/wiki/External_sorting.
298 The individual sort steps and the merge step both use a "Heap sort"
299 http://en.wikipedia.org/wiki/Heapsort. The combination of the two should work well
300 if the data is already partially sorted.
302 int fd_in The file descriptor of the input file (opened for reading and at the beginning).
304 int fd_out The file descriptor of the output file (opened for writing and empty).
306 int (*compare)(const void*, const void*) The comparison function (identical to qsort if the
307 data to be sorted is an array of things not pointers).
309 int (*buildindex)(void *,index_t) If non-NULL then this function is called for each item, if it
310 returns 1 then it is written to the output file.
311 ++++++++++++++++++++++++++++++++++++++*/
313 void filesort_vary(int fd_in,int fd_out,int (*compare)(const void*,const void*),int (*buildindex)(void*,index_t))
315 int *fds=NULL,*heap=NULL;
316 int nfiles=0,ndata=0;
317 index_t count=0,total=0;
318 FILESORT_VARINT nextitemsize,largestitemsize=0;
319 void *data=NULL,**datap=NULL;
323 /* Allocate the RAM buffer and other bits */
325 data=malloc(option_filesort_ramsize);
327 filename=(char*)malloc(strlen(option_tmpdirname)+24);
329 /* Loop around, fill the buffer, sort the data and write a temporary file */
331 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE)) /* Always have the next item size known in advance */
337 size_t ramused=FILESORT_VARALIGN-FILESORT_VARSIZE;
339 datap=data+option_filesort_ramsize;
341 /* Read in the data and create pointers */
343 while((ramused+FILESORT_VARSIZE+nextitemsize)<=((void*)datap-sizeof(void*)-data))
345 FILESORT_VARINT itemsize=nextitemsize;
347 if(itemsize>largestitemsize)
348 largestitemsize=itemsize;
350 *(FILESORT_VARINT*)(data+ramused)=itemsize;
352 ramused+=FILESORT_VARSIZE;
354 ReadFile(fd_in,data+ramused,itemsize);
356 *--datap=data+ramused; /* points to real data */
360 ramused =FILESORT_VARALIGN*((ramused+FILESORT_VARSIZE-1)/FILESORT_VARALIGN);
361 ramused+=FILESORT_VARALIGN-FILESORT_VARSIZE;
366 if(ReadFile(fd_in,&nextitemsize,FILESORT_VARSIZE))
376 /* Sort the data pointers using a heap sort */
378 filesort_heapsort(datap,n,compare);
380 /* Shortcut if all read in and sorted at once */
382 if(nfiles==0 && !more)
386 if(!buildindex || buildindex(datap[i],count))
388 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
390 WriteFile(fd_out,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
398 /* Create a temporary file and write the result */
400 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,nfiles);
402 fd=OpenFileNew(filename);
406 FILESORT_VARINT itemsize=*(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE);
408 WriteFile(fd,datap[i]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
417 /* Check that number of files is less than file size */
419 largestitemsize=FILESORT_VARALIGN*(1+(largestitemsize+FILESORT_VARALIGN-FILESORT_VARSIZE)/FILESORT_VARALIGN);
421 assert(nfiles<((option_filesort_ramsize-nfiles*sizeof(void*))/largestitemsize));
423 /* Open all of the temporary files */
425 fds=(int*)malloc(nfiles*sizeof(int));
427 for(i=0;i<nfiles;i++)
429 sprintf(filename,"%s/filesort.%d.tmp",option_tmpdirname,i);
431 fds[i]=ReOpenFile(filename);
433 DeleteFile(filename);
436 /* Perform an n-way merge using a binary heap */
438 heap=(int*)malloc(nfiles*sizeof(int));
440 datap=data+option_filesort_ramsize-nfiles*sizeof(void*);
442 /* Fill the heap to start with */
444 for(i=0;i<nfiles;i++)
447 FILESORT_VARINT itemsize;
449 datap[i]=data+FILESORT_VARALIGN-FILESORT_VARSIZE+i*largestitemsize;
451 ReadFile(fds[i],&itemsize,FILESORT_VARSIZE);
453 *(FILESORT_VARINT*)(datap[i]-FILESORT_VARSIZE)=itemsize;
455 ReadFile(fds[i],datap[i],itemsize);
461 /* Bubble up the new value */
464 compare(datap[heap[index]],datap[heap[(index-1)/2]])<0)
469 newindex=(index-1)/2;
472 heap[index]=heap[newindex];
479 /* Repeatedly pull out the root of the heap and refill from the same file */
486 FILESORT_VARINT itemsize;
488 if(!buildindex || buildindex(datap[heap[0]],count))
490 itemsize=*(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE);
492 WriteFile(fd_out,datap[heap[0]]-FILESORT_VARSIZE,itemsize+FILESORT_VARSIZE);
496 if(ReadFile(fds[heap[0]],&itemsize,FILESORT_VARSIZE))
503 *(FILESORT_VARINT*)(datap[heap[0]]-FILESORT_VARSIZE)=itemsize;
505 ReadFile(fds[heap[0]],datap[heap[0]],itemsize);
508 /* Bubble down the new value */
510 while((2*index+2)<ndata &&
511 (compare(datap[heap[index]],datap[heap[2*index+1]])>0 ||
512 compare(datap[heap[index]],datap[heap[2*index+2]])>0))
517 if(compare(datap[heap[2*index+1]],datap[heap[2*index+2]])<0)
523 heap[newindex]=heap[index];
529 if((2*index+2)==ndata &&
530 compare(datap[heap[index]],datap[heap[2*index+1]])>0)
538 heap[newindex]=heap[index];
550 for(i=0;i<nfiles;i++)
563 /*++++++++++++++++++++++++++++++++++++++
564 A function to sort an array of pointers efficiently.
566 The data is sorted using a "Heap sort" http://en.wikipedia.org/wiki/Heapsort,
567 in particular an this good because it can operate in-place and doesn't
568 allocate more memory like using qsort() does.
570 void **datap A pointer to the array of pointers to sort.
572 size_t nitems The number of items of data to sort.
574 int(*compare)(const void *, const void *) The comparison function (identical to qsort if the
575 data to be sorted was an array of things not pointers).
576 ++++++++++++++++++++++++++++++++++++++*/
578 void filesort_heapsort(void **datap,size_t nitems,int(*compare)(const void*, const void*))
582 /* Fill the heap by pretending to insert the data that is already there */
584 for(i=1;i<nitems;i++)
588 /* Bubble up the new value (upside-down, put largest at top) */
591 compare(datap[index],datap[(index-1)/2])>0) /* reversed compared to filesort() above */
596 newindex=(index-1)/2;
599 datap[index]=datap[newindex];
600 datap[newindex]=temp;
606 /* Repeatedly pull out the root of the heap and swap with the bottom item */
608 for(i=nitems-1;i>0;i--)
614 datap[index]=datap[i];
617 /* Bubble down the new value (upside-down, put largest at top) */
619 while((2*index+2)<i &&
620 (compare(datap[index],datap[2*index+1])<0 || /* reversed compared to filesort() above */
621 compare(datap[index],datap[2*index+2])<0)) /* reversed compared to filesort() above */
626 if(compare(datap[2*index+1],datap[2*index+2])>0) /* reversed compared to filesort() above */
631 temp=datap[newindex];
632 datap[newindex]=datap[index];
639 compare(datap[index],datap[2*index+1])<0) /* reversed compared to filesort() above */
646 temp=datap[newindex];
647 datap[newindex]=datap[index];