Initial public busybox upstream commit
[busybox4maemo] / archival / bz / blocksort.c
1 /*
2  * bzip2 is written by Julian Seward <jseward@bzip.org>.
3  * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4  * See README and LICENSE files in this directory for more information.
5  */
6
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery                               ---*/
9 /*---                                           blocksort.c ---*/
10 /*-------------------------------------------------------------*/
11
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
15
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
18
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
20 README file.
21
22 This program is released under the terms of the license contained
23 in the file LICENSE.
24 ------------------------------------------------------------------ */
25
26 /* #include "bzlib_private.h" */
27
28 #define mswap(zz1, zz2) \
29 { \
30         int32_t zztmp = zz1; \
31         zz1 = zz2; \
32         zz2 = zztmp; \
33 }
34
35 static
36 /* No measurable speed gain with inlining */
37 /* ALWAYS_INLINE */
38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
39 {
40         while (zzn > 0) {
41                 mswap(ptr[zzp1], ptr[zzp2]);
42                 zzp1++;
43                 zzp2++;
44                 zzn--;
45         }
46 }
47
48 static
49 ALWAYS_INLINE
50 int32_t mmin(int32_t a, int32_t b)
51 {
52         return (a < b) ? a : b;
53 }
54
55
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting        ---*/
58 /*--- algorithm, for repetitive blocks      ---*/
59 /*---------------------------------------------*/
60
61 /*---------------------------------------------*/
62 static
63 inline
64 void fallbackSimpleSort(uint32_t* fmap,
65                 uint32_t* eclass,
66                 int32_t   lo,
67                 int32_t   hi)
68 {
69         int32_t i, j, tmp;
70         uint32_t ec_tmp;
71
72         if (lo == hi) return;
73
74         if (hi - lo > 3) {
75                 for (i = hi-4; i >= lo; i--) {
76                         tmp = fmap[i];
77                         ec_tmp = eclass[tmp];
78                         for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
79                                 fmap[j-4] = fmap[j];
80                         fmap[j-4] = tmp;
81                 }
82         }
83
84         for (i = hi-1; i >= lo; i--) {
85                 tmp = fmap[i];
86                 ec_tmp = eclass[tmp];
87                 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
88                         fmap[j-1] = fmap[j];
89                 fmap[j-1] = tmp;
90         }
91 }
92
93
94 /*---------------------------------------------*/
95 #define fpush(lz,hz) { \
96         stackLo[sp] = lz; \
97         stackHi[sp] = hz; \
98         sp++; \
99 }
100
101 #define fpop(lz,hz) { \
102         sp--; \
103         lz = stackLo[sp]; \
104         hz = stackHi[sp]; \
105 }
106
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE   100
109
110 static
111 void fallbackQSort3(uint32_t* fmap,
112                 uint32_t* eclass,
113                 int32_t   loSt,
114                 int32_t   hiSt)
115 {
116         int32_t unLo, unHi, ltLo, gtHi, n, m;
117         int32_t sp, lo, hi;
118         uint32_t med, r, r3;
119         int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120         int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
121
122         r = 0;
123
124         sp = 0;
125         fpush(loSt, hiSt);
126
127         while (sp > 0) {
128                 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
129
130                 fpop(lo, hi);
131                 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132                         fallbackSimpleSort(fmap, eclass, lo, hi);
133                         continue;
134                 }
135
136                 /* Random partitioning.  Median of 3 sometimes fails to
137                  * avoid bad cases.  Median of 9 seems to help but
138                  * looks rather expensive.  This too seems to work but
139                  * is cheaper.  Guidance for the magic constants
140                  * 7621 and 32768 is taken from Sedgewick's algorithms
141                  * book, chapter 35.
142                  */
143                 r = ((r * 7621) + 1) % 32768;
144                 r3 = r % 3;
145                 if (r3 == 0)
146                         med = eclass[fmap[lo]];
147                 else if (r3 == 1)
148                         med = eclass[fmap[(lo+hi)>>1]];
149                 else
150                         med = eclass[fmap[hi]];
151
152                 unLo = ltLo = lo;
153                 unHi = gtHi = hi;
154
155                 while (1) {
156                         while (1) {
157                                 if (unLo > unHi) break;
158                                 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
159                                 if (n == 0) {
160                                         mswap(fmap[unLo], fmap[ltLo]);
161                                         ltLo++;
162                                         unLo++;
163                                         continue;
164                                 };
165                                 if (n > 0) break;
166                                 unLo++;
167                         }
168                         while (1) {
169                                 if (unLo > unHi) break;
170                                 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
171                                 if (n == 0) {
172                                         mswap(fmap[unHi], fmap[gtHi]);
173                                         gtHi--; unHi--;
174                                         continue;
175                                 };
176                                 if (n < 0) break;
177                                 unHi--;
178                         }
179                         if (unLo > unHi) break;
180                         mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
181                 }
182
183                 AssertD(unHi == unLo-1, "fallbackQSort3(2)");
184
185                 if (gtHi < ltLo) continue;
186
187                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
189
190                 n = lo + unLo - ltLo - 1;
191                 m = hi - (gtHi - unHi) + 1;
192
193                 if (n - lo > hi - m) {
194                         fpush(lo, n);
195                         fpush(m, hi);
196                 } else {
197                         fpush(m, hi);
198                         fpush(lo, n);
199                 }
200         }
201 }
202
203 #undef fpush
204 #undef fpop
205 #undef FALLBACK_QSORT_SMALL_THRESH
206 #undef FALLBACK_QSORT_STACK_SIZE
207
208
209 /*---------------------------------------------*/
210 /* Pre:
211  *      nblock > 0
212  *      eclass exists for [0 .. nblock-1]
213  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
214  *      ptr exists for [0 .. nblock-1]
215  *
216  * Post:
217  *      ((uint8_t*)eclass) [0 .. nblock-1] holds block
218  *      All other areas of eclass destroyed
219  *      fmap [0 .. nblock-1] holds sorted order
220  *      bhtab[0 .. 2+(nblock/32)] destroyed
221 */
222
223 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
224 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
225 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
226 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
227 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
228
229 static
230 void fallbackSort(uint32_t* fmap,
231                 uint32_t* eclass,
232                 uint32_t* bhtab,
233                 int32_t   nblock)
234 {
235         int32_t ftab[257];
236         int32_t ftabCopy[256];
237         int32_t H, i, j, k, l, r, cc, cc1;
238         int32_t nNotDone;
239         int32_t nBhtab;
240         uint8_t* eclass8 = (uint8_t*)eclass;
241
242         /*
243          * Initial 1-char radix sort to generate
244          * initial fmap and initial BH bits.
245          */
246         for (i = 0; i < 257;    i++) ftab[i] = 0;
247         for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
248         for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
249
250         j = ftab[0];  /* bbox: optimized */
251         for (i = 1; i < 257;    i++) {
252                 j += ftab[i];
253                 ftab[i] = j;
254         }
255
256         for (i = 0; i < nblock; i++) {
257                 j = eclass8[i];
258                 k = ftab[j] - 1;
259                 ftab[j] = k;
260                 fmap[k] = i;
261         }
262
263         nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
264         for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
265         for (i = 0; i < 256; i++) SET_BH(ftab[i]);
266
267         /*
268          * Inductively refine the buckets.  Kind-of an
269          * "exponential radix sort" (!), inspired by the
270          * Manber-Myers suffix array construction algorithm.
271          */
272
273         /*-- set sentinel bits for block-end detection --*/
274         for (i = 0; i < 32; i++) {
275                 SET_BH(nblock + 2*i);
276                 CLEAR_BH(nblock + 2*i + 1);
277         }
278
279         /*-- the log(N) loop --*/
280         H = 1;
281         while (1) {
282                 j = 0;
283                 for (i = 0; i < nblock; i++) {
284                         if (ISSET_BH(i))
285                                 j = i;
286                         k = fmap[i] - H;
287                         if (k < 0)
288                                 k += nblock;
289                         eclass[k] = j;
290                 }
291
292                 nNotDone = 0;
293                 r = -1;
294                 while (1) {
295
296                 /*-- find the next non-singleton bucket --*/
297                         k = r + 1;
298                         while (ISSET_BH(k) && UNALIGNED_BH(k))
299                                 k++;
300                         if (ISSET_BH(k)) {
301                                 while (WORD_BH(k) == 0xffffffff) k += 32;
302                                 while (ISSET_BH(k)) k++;
303                         }
304                         l = k - 1;
305                         if (l >= nblock)
306                                 break;
307                         while (!ISSET_BH(k) && UNALIGNED_BH(k))
308                                 k++;
309                         if (!ISSET_BH(k)) {
310                                 while (WORD_BH(k) == 0x00000000) k += 32;
311                                 while (!ISSET_BH(k)) k++;
312                         }
313                         r = k - 1;
314                         if (r >= nblock)
315                                 break;
316
317                         /*-- now [l, r] bracket current bucket --*/
318                         if (r > l) {
319                                 nNotDone += (r - l + 1);
320                                 fallbackQSort3(fmap, eclass, l, r);
321
322                                 /*-- scan bucket and generate header bits-- */
323                                 cc = -1;
324                                 for (i = l; i <= r; i++) {
325                                         cc1 = eclass[fmap[i]];
326                                         if (cc != cc1) {
327                                                 SET_BH(i);
328                                                 cc = cc1;
329                                         };
330                                 }
331                         }
332                 }
333
334                 H *= 2;
335                 if (H > nblock || nNotDone == 0)
336                         break;
337         }
338
339         /*
340          * Reconstruct the original block in
341          * eclass8 [0 .. nblock-1], since the
342          * previous phase destroyed it.
343          */
344         j = 0;
345         for (i = 0; i < nblock; i++) {
346                 while (ftabCopy[j] == 0)
347                         j++;
348                 ftabCopy[j]--;
349                 eclass8[fmap[i]] = (uint8_t)j;
350         }
351         AssertH(j < 256, 1005);
352 }
353
354 #undef       SET_BH
355 #undef     CLEAR_BH
356 #undef     ISSET_BH
357 #undef      WORD_BH
358 #undef UNALIGNED_BH
359
360
361 /*---------------------------------------------*/
362 /*--- The main, O(N^2 log(N)) sorting       ---*/
363 /*--- algorithm.  Faster for "normal"       ---*/
364 /*--- non-repetitive blocks.                ---*/
365 /*---------------------------------------------*/
366
367 /*---------------------------------------------*/
368 static
369 NOINLINE
370 int mainGtU(
371                 uint32_t  i1,
372                 uint32_t  i2,
373                 uint8_t*  block,
374                 uint16_t* quadrant,
375                 uint32_t  nblock,
376                 int32_t*  budget)
377 {
378         int32_t  k;
379         uint8_t  c1, c2;
380         uint16_t s1, s2;
381
382 /* Loop unrolling here is actually very useful
383  * (generated code is much simpler),
384  * code size increase is only 270 bytes (i386)
385  * but speeds up compression 10% overall
386  */
387
388 #if CONFIG_BZIP2_FEATURE_SPEED >= 1
389
390 #define TIMES_8(code) \
391         code; code; code; code; \
392         code; code; code; code;
393 #define TIMES_12(code) \
394         code; code; code; code; \
395         code; code; code; code; \
396         code; code; code; code;
397
398 #else
399
400 #define TIMES_8(code) \
401 { \
402         int nn = 8; \
403         do { \
404                 code; \
405         } while (--nn); \
406 }
407 #define TIMES_12(code) \
408 { \
409         int nn = 12; \
410         do { \
411                 code; \
412         } while (--nn); \
413 }
414
415 #endif
416
417         AssertD(i1 != i2, "mainGtU");
418         TIMES_12(
419                 c1 = block[i1]; c2 = block[i2];
420                 if (c1 != c2) return (c1 > c2);
421                 i1++; i2++;
422         )
423
424         k = nblock + 8;
425
426         do {
427                 TIMES_8(
428                         c1 = block[i1]; c2 = block[i2];
429                         if (c1 != c2) return (c1 > c2);
430                         s1 = quadrant[i1]; s2 = quadrant[i2];
431                         if (s1 != s2) return (s1 > s2);
432                         i1++; i2++;
433                 )
434
435                 if (i1 >= nblock) i1 -= nblock;
436                 if (i2 >= nblock) i2 -= nblock;
437
438                 (*budget)--;
439                 k -= 8;
440         } while (k >= 0);
441
442         return False;
443 }
444 #undef TIMES_8
445 #undef TIMES_12
446
447 /*---------------------------------------------*/
448 /*
449  * Knuth's increments seem to work better
450  * than Incerpi-Sedgewick here.  Possibly
451  * because the number of elems to sort is
452  * usually small, typically <= 20.
453  */
454 static
455 const int32_t incs[14] = {
456         1, 4, 13, 40, 121, 364, 1093, 3280,
457         9841, 29524, 88573, 265720,
458         797161, 2391484
459 };
460
461 static
462 void mainSimpleSort(uint32_t* ptr,
463                 uint8_t*  block,
464                 uint16_t* quadrant,
465                 int32_t   nblock,
466                 int32_t   lo,
467                 int32_t   hi,
468                 int32_t   d,
469                 int32_t*  budget)
470 {
471         int32_t i, j, h, bigN, hp;
472         uint32_t v;
473
474         bigN = hi - lo + 1;
475         if (bigN < 2) return;
476
477         hp = 0;
478         while (incs[hp] < bigN) hp++;
479         hp--;
480
481         for (; hp >= 0; hp--) {
482                 h = incs[hp];
483
484                 i = lo + h;
485                 while (1) {
486                         /*-- copy 1 --*/
487                         if (i > hi) break;
488                         v = ptr[i];
489                         j = i;
490                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
491                                 ptr[j] = ptr[j-h];
492                                 j = j - h;
493                                 if (j <= (lo + h - 1)) break;
494                         }
495                         ptr[j] = v;
496                         i++;
497
498 /* 1.5% overall speedup, +290 bytes */
499 #if CONFIG_BZIP2_FEATURE_SPEED >= 3
500                         /*-- copy 2 --*/
501                         if (i > hi) break;
502                         v = ptr[i];
503                         j = i;
504                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
505                                 ptr[j] = ptr[j-h];
506                                 j = j - h;
507                                 if (j <= (lo + h - 1)) break;
508                         }
509                         ptr[j] = v;
510                         i++;
511
512                         /*-- copy 3 --*/
513                         if (i > hi) break;
514                         v = ptr[i];
515                         j = i;
516                         while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
517                                 ptr[j] = ptr[j-h];
518                                 j = j - h;
519                                 if (j <= (lo + h - 1)) break;
520                         }
521                         ptr[j] = v;
522                         i++;
523 #endif
524                         if (*budget < 0) return;
525                 }
526         }
527 }
528
529
530 /*---------------------------------------------*/
531 /*
532  * The following is an implementation of
533  * an elegant 3-way quicksort for strings,
534  * described in a paper "Fast Algorithms for
535  * Sorting and Searching Strings", by Robert
536  * Sedgewick and Jon L. Bentley.
537  */
538
539 static
540 ALWAYS_INLINE
541 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
542 {
543         uint8_t t;
544         if (a > b) {
545                 t = a;
546                 a = b;
547                 b = t;
548         };
549         /* here b >= a */
550         if (b > c) {
551                 b = c;
552                 if (a > b)
553                         b = a;
554         }
555         return b;
556 }
557
558 #define mpush(lz,hz,dz) \
559 { \
560         stackLo[sp] = lz; \
561         stackHi[sp] = hz; \
562         stackD [sp] = dz; \
563         sp++; \
564 }
565
566 #define mpop(lz,hz,dz) \
567 { \
568         sp--; \
569         lz = stackLo[sp]; \
570         hz = stackHi[sp]; \
571         dz = stackD [sp]; \
572 }
573
574 #define mnextsize(az) (nextHi[az] - nextLo[az])
575
576 #define mnextswap(az,bz) \
577 { \
578         int32_t tz; \
579         tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580         tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581         tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
582 }
583
584 #define MAIN_QSORT_SMALL_THRESH 20
585 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586 #define MAIN_QSORT_STACK_SIZE   100
587
588 static
589 void mainQSort3(uint32_t* ptr,
590                 uint8_t*  block,
591                 uint16_t* quadrant,
592                 int32_t   nblock,
593                 int32_t   loSt,
594                 int32_t   hiSt,
595                 int32_t   dSt,
596                 int32_t*  budget)
597 {
598         int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599         int32_t sp, lo, hi, d;
600
601         int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602         int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603         int32_t stackD [MAIN_QSORT_STACK_SIZE];
604
605         int32_t nextLo[3];
606         int32_t nextHi[3];
607         int32_t nextD [3];
608
609         sp = 0;
610         mpush(loSt, hiSt, dSt);
611
612         while (sp > 0) {
613                 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
614
615                 mpop(lo, hi, d);
616                 if (hi - lo < MAIN_QSORT_SMALL_THRESH
617                  || d > MAIN_QSORT_DEPTH_THRESH
618                 ) {
619                         mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
620                         if (*budget < 0)
621                                 return;
622                         continue;
623                 }
624                 med = (int32_t) mmed3(block[ptr[lo          ] + d],
625                                       block[ptr[hi          ] + d],
626                                       block[ptr[(lo+hi) >> 1] + d]);
627
628                 unLo = ltLo = lo;
629                 unHi = gtHi = hi;
630
631                 while (1) {
632                         while (1) {
633                                 if (unLo > unHi)
634                                         break;
635                                 n = ((int32_t)block[ptr[unLo]+d]) - med;
636                                 if (n == 0) {
637                                         mswap(ptr[unLo], ptr[ltLo]);
638                                         ltLo++;
639                                         unLo++;
640                                         continue;
641                                 };
642                                 if (n >  0) break;
643                                 unLo++;
644                         }
645                         while (1) {
646                                 if (unLo > unHi)
647                                         break;
648                                 n = ((int32_t)block[ptr[unHi]+d]) - med;
649                                 if (n == 0) {
650                                         mswap(ptr[unHi], ptr[gtHi]);
651                                         gtHi--;
652                                         unHi--;
653                                         continue;
654                                 };
655                                 if (n <  0) break;
656                                 unHi--;
657                         }
658                         if (unLo > unHi)
659                                 break;
660                         mswap(ptr[unLo], ptr[unHi]);
661                         unLo++;
662                         unHi--;
663                 }
664
665                 AssertD(unHi == unLo-1, "mainQSort3(2)");
666
667                 if (gtHi < ltLo) {
668                         mpush(lo, hi, d + 1);
669                         continue;
670                 }
671
672                 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673                 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
674
675                 n = lo + unLo - ltLo - 1;
676                 m = hi - (gtHi - unHi) + 1;
677
678                 nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
679                 nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
680                 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
681
682                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683                 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684                 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
685
686                 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687                 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
688
689                 mpush(nextLo[0], nextHi[0], nextD[0]);
690                 mpush(nextLo[1], nextHi[1], nextD[1]);
691                 mpush(nextLo[2], nextHi[2], nextD[2]);
692         }
693 }
694
695 #undef mpush
696 #undef mpop
697 #undef mnextsize
698 #undef mnextswap
699 #undef MAIN_QSORT_SMALL_THRESH
700 #undef MAIN_QSORT_DEPTH_THRESH
701 #undef MAIN_QSORT_STACK_SIZE
702
703
704 /*---------------------------------------------*/
705 /* Pre:
706  *      nblock > N_OVERSHOOT
707  *      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
709  *      ptr exists for [0 .. nblock-1]
710  *
711  * Post:
712  *      ((uint8_t*)block32) [0 .. nblock-1] holds block
713  *      All other areas of block32 destroyed
714  *      ftab[0 .. 65536] destroyed
715  *      ptr [0 .. nblock-1] holds sorted order
716  *      if (*budget < 0), sorting was abandoned
717  */
718
719 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720 #define SETMASK (1 << 21)
721 #define CLEARMASK (~(SETMASK))
722
723 static NOINLINE
724 void mainSort(EState* state,
725                 uint32_t* ptr,
726                 uint8_t*  block,
727                 uint16_t* quadrant,
728                 uint32_t* ftab,
729                 int32_t   nblock,
730                 int32_t*  budget)
731 {
732         int32_t  i, j, k, ss, sb;
733         uint8_t  c1;
734         int32_t  numQSorted;
735         uint16_t s;
736         Bool     bigDone[256];
737         /* bbox: moved to EState to save stack
738         int32_t  runningOrder[256];
739         int32_t  copyStart[256];
740         int32_t  copyEnd  [256];
741         */
742 #define runningOrder (state->mainSort__runningOrder)
743 #define copyStart    (state->mainSort__copyStart)
744 #define copyEnd      (state->mainSort__copyEnd)
745
746         /*-- set up the 2-byte frequency table --*/
747         /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
748         memset(ftab, 0, 65537 * sizeof(ftab[0]));
749
750         j = block[0] << 8;
751         i = nblock - 1;
752 /* 3%, +300 bytes */
753 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
754         for (; i >= 3; i -= 4) {
755                 quadrant[i] = 0;
756                 j = (j >> 8) | (((uint16_t)block[i]) << 8);
757                 ftab[j]++;
758                 quadrant[i-1] = 0;
759                 j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
760                 ftab[j]++;
761                 quadrant[i-2] = 0;
762                 j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
763                 ftab[j]++;
764                 quadrant[i-3] = 0;
765                 j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
766                 ftab[j]++;
767         }
768 #endif
769         for (; i >= 0; i--) {
770                 quadrant[i] = 0;
771                 j = (j >> 8) | (((uint16_t)block[i]) << 8);
772                 ftab[j]++;
773         }
774
775         /*-- (emphasises close relationship of block & quadrant) --*/
776         for (i = 0; i < BZ_N_OVERSHOOT; i++) {
777                 block   [nblock+i] = block[i];
778                 quadrant[nblock+i] = 0;
779         }
780
781         /*-- Complete the initial radix sort --*/
782         j = ftab[0]; /* bbox: optimized */
783         for (i = 1; i <= 65536; i++) {
784                 j += ftab[i];
785                 ftab[i] = j;
786         }
787
788         s = block[0] << 8;
789         i = nblock - 1;
790 #if CONFIG_BZIP2_FEATURE_SPEED >= 2
791         for (; i >= 3; i -= 4) {
792                 s = (s >> 8) | (block[i] << 8);
793                 j = ftab[s] - 1;
794                 ftab[s] = j;
795                 ptr[j] = i;
796                 s = (s >> 8) | (block[i-1] << 8);
797                 j = ftab[s] - 1;
798                 ftab[s] = j;
799                 ptr[j] = i-1;
800                 s = (s >> 8) | (block[i-2] << 8);
801                 j = ftab[s] - 1;
802                 ftab[s] = j;
803                 ptr[j] = i-2;
804                 s = (s >> 8) | (block[i-3] << 8);
805                 j = ftab[s] - 1;
806                 ftab[s] = j;
807                 ptr[j] = i-3;
808         }
809 #endif
810         for (; i >= 0; i--) {
811                 s = (s >> 8) | (block[i] << 8);
812                 j = ftab[s] - 1;
813                 ftab[s] = j;
814                 ptr[j] = i;
815         }
816
817         /*
818          * Now ftab contains the first loc of every small bucket.
819          * Calculate the running order, from smallest to largest
820          * big bucket.
821          */
822         for (i = 0; i <= 255; i++) {
823                 bigDone     [i] = False;
824                 runningOrder[i] = i;
825         }
826
827         {
828                 int32_t vv;
829                 /* bbox: was: int32_t h = 1; */
830                 /* do h = 3 * h + 1; while (h <= 256); */
831                 uint32_t h = 364;
832
833                 do {
834                         /*h = h / 3;*/
835                         h = (h * 171) >> 9; /* bbox: fast h/3 */
836                         for (i = h; i <= 255; i++) {
837                                 vv = runningOrder[i];
838                                 j = i;
839                                 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
840                                         runningOrder[j] = runningOrder[j-h];
841                                         j = j - h;
842                                         if (j <= (h - 1))
843                                                 goto zero;
844                                 }
845  zero:
846                                 runningOrder[j] = vv;
847                         }
848                 } while (h != 1);
849         }
850
851         /*
852          * The main sorting loop.
853          */
854
855         numQSorted = 0;
856
857         for (i = 0; i <= 255; i++) {
858
859                 /*
860                  * Process big buckets, starting with the least full.
861                  * Basically this is a 3-step process in which we call
862                  * mainQSort3 to sort the small buckets [ss, j], but
863                  * also make a big effort to avoid the calls if we can.
864                  */
865                 ss = runningOrder[i];
866
867                 /*
868                  * Step 1:
869                  * Complete the big bucket [ss] by quicksorting
870                  * any unsorted small buckets [ss, j], for j != ss.
871                  * Hopefully previous pointer-scanning phases have already
872                  * completed many of the small buckets [ss, j], so
873                  * we don't have to sort them at all.
874                  */
875                 for (j = 0; j <= 255; j++) {
876                         if (j != ss) {
877                                 sb = (ss << 8) + j;
878                                 if (!(ftab[sb] & SETMASK)) {
879                                         int32_t lo =  ftab[sb]   & CLEARMASK;
880                                         int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
881                                         if (hi > lo) {
882                                                 mainQSort3(
883                                                         ptr, block, quadrant, nblock,
884                                                         lo, hi, BZ_N_RADIX, budget
885                                                 );
886                                                 if (*budget < 0) return;
887                                                 numQSorted += (hi - lo + 1);
888                                         }
889                                 }
890                                 ftab[sb] |= SETMASK;
891                         }
892                 }
893
894                 AssertH(!bigDone[ss], 1006);
895
896                 /*
897                  * Step 2:
898                  * Now scan this big bucket [ss] so as to synthesise the
899                  * sorted order for small buckets [t, ss] for all t,
900                  * including, magically, the bucket [ss,ss] too.
901                  * This will avoid doing Real Work in subsequent Step 1's.
902                  */
903                 {
904                         for (j = 0; j <= 255; j++) {
905                                 copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
906                                 copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
907                         }
908                         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
909                                 k = ptr[j] - 1;
910                                 if (k < 0)
911                                         k += nblock;
912                                 c1 = block[k];
913                                 if (!bigDone[c1])
914                                         ptr[copyStart[c1]++] = k;
915                         }
916                         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
917                                 k = ptr[j]-1;
918                                 if (k < 0)
919                                         k += nblock;
920                                 c1 = block[k];
921                                 if (!bigDone[c1])
922                                         ptr[copyEnd[c1]--] = k;
923                         }
924                 }
925
926                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
927                  * Necessity for this case is demonstrated by compressing
928                  * a sequence of approximately 48.5 million of character
929                  * 251; 1.0.0/1.0.1 will then die here. */
930                 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
931                      || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
932
933                 for (j = 0; j <= 255; j++)
934                         ftab[(j << 8) + ss] |= SETMASK;
935
936                 /*
937                  * Step 3:
938                  * The [ss] big bucket is now done.  Record this fact,
939                  * and update the quadrant descriptors.  Remember to
940                  * update quadrants in the overshoot area too, if
941                  * necessary.  The "if (i < 255)" test merely skips
942                  * this updating for the last bucket processed, since
943                  * updating for the last bucket is pointless.
944                  *
945                  * The quadrant array provides a way to incrementally
946                  * cache sort orderings, as they appear, so as to
947                  * make subsequent comparisons in fullGtU() complete
948                  * faster.  For repetitive blocks this makes a big
949                  * difference (but not big enough to be able to avoid
950                  * the fallback sorting mechanism, exponential radix sort).
951                  *
952                  * The precise meaning is: at all times:
953                  *
954                  *      for 0 <= i < nblock and 0 <= j <= nblock
955                  *
956                  *      if block[i] != block[j],
957                  *
958                  *              then the relative values of quadrant[i] and
959                  *                        quadrant[j] are meaningless.
960                  *
961                  *              else {
962                  *                      if quadrant[i] < quadrant[j]
963                  *                              then the string starting at i lexicographically
964                  *                              precedes the string starting at j
965                  *
966                  *                      else if quadrant[i] > quadrant[j]
967                  *                              then the string starting at j lexicographically
968                  *                              precedes the string starting at i
969                  *
970                  *                      else
971                  *                              the relative ordering of the strings starting
972                  *                              at i and j has not yet been determined.
973                  *              }
974                  */
975                 bigDone[ss] = True;
976
977                 if (i < 255) {
978                         int32_t bbStart = ftab[ss << 8] & CLEARMASK;
979                         int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
980                         int32_t shifts  = 0;
981
982                         while ((bbSize >> shifts) > 65534) shifts++;
983
984                         for (j = bbSize-1; j >= 0; j--) {
985                                 int32_t a2update   = ptr[bbStart + j];
986                                 uint16_t qVal      = (uint16_t)(j >> shifts);
987                                 quadrant[a2update] = qVal;
988                                 if (a2update < BZ_N_OVERSHOOT)
989                                         quadrant[a2update + nblock] = qVal;
990                         }
991                         AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
992                 }
993         }
994 #undef runningOrder
995 #undef copyStart
996 #undef copyEnd
997 }
998
999 #undef BIGFREQ
1000 #undef SETMASK
1001 #undef CLEARMASK
1002
1003
1004 /*---------------------------------------------*/
1005 /* Pre:
1006  *      nblock > 0
1007  *      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1008  *        ((uint8_t*)arr2)[0 .. nblock-1] holds block
1009  *      arr1 exists for [0 .. nblock-1]
1010  *
1011  * Post:
1012  *      ((uint8_t*)arr2) [0 .. nblock-1] holds block
1013  *      All other areas of block destroyed
1014  *      ftab[0 .. 65536] destroyed
1015  *      arr1[0 .. nblock-1] holds sorted order
1016  */
1017 static NOINLINE
1018 void BZ2_blockSort(EState* s)
1019 {
1020         /* In original bzip2 1.0.4, it's a parameter, but 30
1021          * (which was the default) should work ok. */
1022         enum { wfact = 30 };
1023
1024         uint32_t* ptr    = s->ptr;
1025         uint8_t*  block  = s->block;
1026         uint32_t* ftab   = s->ftab;
1027         int32_t   nblock = s->nblock;
1028         uint16_t* quadrant;
1029         int32_t   budget;
1030         int32_t   i;
1031
1032         if (nblock < 10000) {
1033                 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1034         } else {
1035                 /* Calculate the location for quadrant, remembering to get
1036                  * the alignment right.  Assumes that &(block[0]) is at least
1037                  * 2-byte aligned -- this should be ok since block is really
1038                  * the first section of arr2.
1039                  */
1040                 i = nblock + BZ_N_OVERSHOOT;
1041                 if (i & 1) i++;
1042                 quadrant = (uint16_t*)(&(block[i]));
1043
1044                 /* (wfact-1) / 3 puts the default-factor-30
1045                  * transition point at very roughly the same place as
1046                  * with v0.1 and v0.9.0.
1047                  * Not that it particularly matters any more, since the
1048                  * resulting compressed stream is now the same regardless
1049                  * of whether or not we use the main sort or fallback sort.
1050                  */
1051                 budget = nblock * ((wfact-1) / 3);
1052
1053                 mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
1054                 if (budget < 0) {
1055                         fallbackSort(s->arr1, s->arr2, ftab, nblock);
1056                 }
1057         }
1058
1059         s->origPtr = -1;
1060         for (i = 0; i < s->nblock; i++)
1061                 if (ptr[i] == 0) {
1062                         s->origPtr = i;
1063                         break;
1064                 };
1065
1066         AssertH(s->origPtr != -1, 1003);
1067 }
1068
1069
1070 /*-------------------------------------------------------------*/
1071 /*--- end                                       blocksort.c ---*/
1072 /*-------------------------------------------------------------*/