lib/bzip2/bzlib_blocksort.c

   1
   2 /*-------------------------------------------------------------*/
   3 /*--- Block sorting machinery                               ---*/
   4 /*---                                           blocksort.c ---*/
   5 /*-------------------------------------------------------------*/
   6
   7 /*--
   8   This file is a part of bzip2 and/or libbzip2, a program and
   9   library for lossless, block-sorting data compression.
  10
  11   Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
  12
  13   Redistribution and use in source and binary forms, with or without
  14   modification, are permitted provided that the following conditions
  15   are met:
  16
  17   1. Redistributions of source code must retain the above copyright
  18      notice, this list of conditions and the following disclaimer.
  19
  20   2. The origin of this software must not be misrepresented; you must
  21      not claim that you wrote the original software.  If you use this
  22      software in a product, an acknowledgment in the product
  23      documentation would be appreciated but is not required.
  24
  25   3. Altered source versions must be plainly marked as such, and must
  26      not be misrepresented as being the original software.
  27
  28   4. The name of the author may not be used to endorse or promote
  29      products derived from this software without specific prior written
  30      permission.
  31
  32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44   Julian Seward, Cambridge, UK.
  45   jseward@acm.org
  46   bzip2/libbzip2 version 1.0.6 of 6 September 2010
  47   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
  48
  49   This program is based on (at least) the work of:
  50      Mike Burrows
  51      David Wheeler
  52      Peter Fenwick
  53      Alistair Moffat
  54      Radford Neal
  55      Ian H. Witten
  56      Robert Sedgewick
  57      Jon L. Bentley
  58
  59   For more information on these sources, see the manual.
  60 --*/
  61
  62 #include "bzlib_private.h"
  63 #include <log.h>
  64
  65 /*---------------------------------------------*/
  66 /*--- Fallback O(N log(N)^2) sorting        ---*/
  67 /*--- algorithm, for repetitive blocks      ---*/
  68 /*---------------------------------------------*/
  69
  70 /*---------------------------------------------*/
  71 static
  72 __inline__
  73 void fallbackSimpleSort ( UInt32* fmap,
  74                           UInt32* eclass,
  75                           Int32   lo,
  76                           Int32   hi )
  77 {
  78    Int32 i, j, tmp;
  79    UInt32 ec_tmp;
  80
  81    if (lo == hi) return;
  82
  83    if (hi - lo > 3) {
  84       for ( i = hi-4; i >= lo; i-- ) {
  85          tmp = fmap[i];
  86          ec_tmp = eclass[tmp];
  87          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  88             fmap[j-4] = fmap[j];
  89          fmap[j-4] = tmp;
  90       }
  91    }
  92
  93    for ( i = hi-1; i >= lo; i-- ) {
  94       tmp = fmap[i];
  95       ec_tmp = eclass[tmp];
  96       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
  97          fmap[j-1] = fmap[j];
  98       fmap[j-1] = tmp;
  99    }
 100 }
 101
 102
 103 /*---------------------------------------------*/
 104 #define fswap(zz1, zz2) \
 105    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 106
 107 #define fvswap(zzp1, zzp2, zzn)       \
 108 {                                     \
 109    Int32 yyp1 = (zzp1);               \
 110    Int32 yyp2 = (zzp2);               \
 111    Int32 yyn  = (zzn);                \
 112    while (yyn > 0) {                  \
 113       fswap(fmap[yyp1], fmap[yyp2]);  \
 114       yyp1++; yyp2++; yyn--;          \
 115    }                                  \
 116 }
 117
 118
 119 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
 120
 121 #define fpush(lz,hz) { stackLo[sp] = lz; \
 122                        stackHi[sp] = hz; \
 123                        sp++; }
 124
 125 #define fpop(lz,hz) { sp--;              \
 126                       lz = stackLo[sp];  \
 127                       hz = stackHi[sp]; }
 128
 129 #define FALLBACK_QSORT_SMALL_THRESH 10
 130 #define FALLBACK_QSORT_STACK_SIZE   100
 131
 132
 133 static
 134 void fallbackQSort3 ( UInt32* fmap,
 135                       UInt32* eclass,
 136                       Int32   loSt,
 137                       Int32   hiSt )
 138 {
 139    Int32 unLo, unHi, ltLo, gtHi, n, m;
 140    Int32 sp, lo, hi;
 141    UInt32 med, r, r3;
 142    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 143    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 144
 145    r = 0;
 146
 147    sp = 0;
 148    fpush ( loSt, hiSt );
 149
 150    while (sp > 0) {
 151
 152       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
 153
 154       fpop ( lo, hi );
 155       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 156          fallbackSimpleSort ( fmap, eclass, lo, hi );
 157          continue;
 158       }
 159
 160       /* Random partitioning.  Median of 3 sometimes fails to
 161          avoid bad cases.  Median of 9 seems to help but
 162          looks rather expensive.  This too seems to work but
 163          is cheaper.  Guidance for the magic constants
 164          7621 and 32768 is taken from Sedgewick's algorithms
 165          book, chapter 35.
 166       */
 167       r = ((r * 7621) + 1) % 32768;
 168       r3 = r % 3;
 169       if (r3 == 0) med = eclass[fmap[lo]]; else
 170       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 171                    med = eclass[fmap[hi]];
 172
 173       unLo = ltLo = lo;
 174       unHi = gtHi = hi;
 175
 176       while (1) {
 177          while (1) {
 178             if (unLo > unHi) break;
 179             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 180             if (n == 0) {
 181                fswap(fmap[unLo], fmap[ltLo]);
 182                ltLo++; unLo++;
 183                continue;
 184             };
 185             if (n > 0) break;
 186             unLo++;
 187          }
 188          while (1) {
 189             if (unLo > unHi) break;
 190             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 191             if (n == 0) {
 192                fswap(fmap[unHi], fmap[gtHi]);
 193                gtHi--; unHi--;
 194                continue;
 195             };
 196             if (n < 0) break;
 197             unHi--;
 198          }
 199          if (unLo > unHi) break;
 200          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 201       }
 202
 203       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 204
 205       if (gtHi < ltLo) continue;
 206
 207       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 208       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 209
 210       n = lo + unLo - ltLo - 1;
 211       m = hi - (gtHi - unHi) + 1;
 212
 213       if (n - lo > hi - m) {
 214          fpush ( lo, n );
 215          fpush ( m, hi );
 216       } else {
 217          fpush ( m, hi );
 218          fpush ( lo, n );
 219       }
 220    }
 221 }
 222
 223 #undef fmin
 224 #undef fpush
 225 #undef fpop
 226 #undef fswap
 227 #undef fvswap
 228 #undef FALLBACK_QSORT_SMALL_THRESH
 229 #undef FALLBACK_QSORT_STACK_SIZE
 230
 231
 232 /*---------------------------------------------*/
 233 /* Pre:
 234       nblock > 0
 235       eclass exists for [0 .. nblock-1]
 236       ((UChar*)eclass) [0 .. nblock-1] holds block
 237       ptr exists for [0 .. nblock-1]
 238
 239    Post:
 240       ((UChar*)eclass) [0 .. nblock-1] holds block
 241       All other areas of eclass destroyed
 242       fmap [0 .. nblock-1] holds sorted order
 243       bhtab [ 0 .. 2+(nblock/32) ] destroyed
 244 */
 245
 246 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 247 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 248 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 249 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
 250 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 251
 252 static
 253 void fallbackSort ( UInt32* fmap,
 254                     UInt32* eclass,
 255                     UInt32* bhtab,
 256                     Int32   nblock,
 257                     Int32   verb )
 258 {
 259    Int32 ftab[257];
 260    Int32 ftabCopy[256];
 261    Int32 H, i, j, k, l, r, cc, cc1;
 262    Int32 nNotDone;
 263    Int32 nBhtab;
 264    UChar* eclass8 = (UChar*)eclass;
 265
 266    /*--
 267       Initial 1-char radix sort to generate
 268       initial fmap and initial BH bits.
 269    --*/
 270    if (verb >= 4)
 271       VPrintf0 ( "        bucket sorting ...\n" );
 272    for (i = 0; i < 257;    i++) ftab[i] = 0;
 273    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 274    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 275    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 276
 277    for (i = 0; i < nblock; i++) {
 278       j = eclass8[i];
 279       k = ftab[j] - 1;
 280       ftab[j] = k;
 281       fmap[k] = i;
 282    }
 283
 284    nBhtab = 2 + (nblock / 32);
 285    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 286    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 287
 288    /*--
 289       Inductively refine the buckets.  Kind-of an
 290       "exponential radix sort" (!), inspired by the
 291       Manber-Myers suffix array construction algorithm.
 292    --*/
 293
 294    /*-- set sentinel bits for block-end detection --*/
 295    for (i = 0; i < 32; i++) {
 296       SET_BH(nblock + 2*i);
 297       CLEAR_BH(nblock + 2*i + 1);
 298    }
 299
 300    /*-- the log(N) loop --*/
 301    H = 1;
 302    while (1) {
 303
 304       if (verb >= 4)
 305          VPrintf1 ( "        depth %6d has ", H );
 306
 307       j = 0;
 308       for (i = 0; i < nblock; i++) {
 309          if (ISSET_BH(i)) j = i;
 310          k = fmap[i] - H; if (k < 0) k += nblock;
 311          eclass[k] = j;
 312       }
 313
 314       nNotDone = 0;
 315       r = -1;
 316       while (1) {
 317
 318          /*-- find the next non-singleton bucket --*/
 319          k = r + 1;
 320          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 321          if (ISSET_BH(k)) {
 322             while (WORD_BH(k) == 0xffffffff) k += 32;
 323             while (ISSET_BH(k)) k++;
 324          }
 325          l = k - 1;
 326          if (l >= nblock) break;
 327          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 328          if (!ISSET_BH(k)) {
 329             while (WORD_BH(k) == 0x00000000) k += 32;
 330             while (!ISSET_BH(k)) k++;
 331          }
 332          r = k - 1;
 333          if (r >= nblock) break;
 334
 335          /*-- now [l, r] bracket current bucket --*/
 336          if (r > l) {
 337             nNotDone += (r - l + 1);
 338             fallbackQSort3 ( fmap, eclass, l, r );
 339
 340             /*-- scan bucket and generate header bits-- */
 341             cc = -1;
 342             for (i = l; i <= r; i++) {
 343                cc1 = eclass[fmap[i]];
 344                if (cc != cc1) { SET_BH(i); cc = cc1; };
 345             }
 346          }
 347       }
 348
 349       if (verb >= 4)
 350          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 351
 352       H *= 2;
 353       if (H > nblock || nNotDone == 0) break;
 354    }
 355
 356    /*--
 357       Reconstruct the original block in
 358       eclass8 [0 .. nblock-1], since the
 359       previous phase destroyed it.
 360    --*/
 361    if (verb >= 4)
 362       VPrintf0 ( "        reconstructing block ...\n" );
 363    j = 0;
 364    for (i = 0; i < nblock; i++) {
 365       while (ftabCopy[j] == 0) j++;
 366       ftabCopy[j]--;
 367       eclass8[fmap[i]] = (UChar)j;
 368    }
 369    AssertH ( j < 256, 1005 );
 370 }
 371
 372 #undef       SET_BH
 373 #undef     CLEAR_BH
 374 #undef     ISSET_BH
 375 #undef      WORD_BH
 376 #undef UNALIGNED_BH
 377
 378
 379 /*---------------------------------------------*/
 380 /*--- The main, O(N^2 log(N)) sorting       ---*/
 381 /*--- algorithm.  Faster for "normal"       ---*/
 382 /*--- non-repetitive blocks.                ---*/
 383 /*---------------------------------------------*/
 384
 385 /*---------------------------------------------*/
 386 static
 387 __inline__
 388 Bool mainGtU ( UInt32  i1,
 389                UInt32  i2,
 390                UChar*  block,
 391                UInt16* quadrant,
 392                UInt32  nblock,
 393                Int32*  budget )
 394 {
 395    Int32  k;
 396    UChar  c1, c2;
 397    UInt16 s1, s2;
 398
 399    AssertD ( i1 != i2, "mainGtU" );
 400    /* 1 */
 401    c1 = block[i1]; c2 = block[i2];
 402    if (c1 != c2) return (c1 > c2);
 403    i1++; i2++;
 404    /* 2 */
 405    c1 = block[i1]; c2 = block[i2];
 406    if (c1 != c2) return (c1 > c2);
 407    i1++; i2++;
 408    /* 3 */
 409    c1 = block[i1]; c2 = block[i2];
 410    if (c1 != c2) return (c1 > c2);
 411    i1++; i2++;
 412    /* 4 */
 413    c1 = block[i1]; c2 = block[i2];
 414    if (c1 != c2) return (c1 > c2);
 415    i1++; i2++;
 416    /* 5 */
 417    c1 = block[i1]; c2 = block[i2];
 418    if (c1 != c2) return (c1 > c2);
 419    i1++; i2++;
 420    /* 6 */
 421    c1 = block[i1]; c2 = block[i2];
 422    if (c1 != c2) return (c1 > c2);
 423    i1++; i2++;
 424    /* 7 */
 425    c1 = block[i1]; c2 = block[i2];
 426    if (c1 != c2) return (c1 > c2);
 427    i1++; i2++;
 428    /* 8 */
 429    c1 = block[i1]; c2 = block[i2];
 430    if (c1 != c2) return (c1 > c2);
 431    i1++; i2++;
 432    /* 9 */
 433    c1 = block[i1]; c2 = block[i2];
 434    if (c1 != c2) return (c1 > c2);
 435    i1++; i2++;
 436    /* 10 */
 437    c1 = block[i1]; c2 = block[i2];
 438    if (c1 != c2) return (c1 > c2);
 439    i1++; i2++;
 440    /* 11 */
 441    c1 = block[i1]; c2 = block[i2];
 442    if (c1 != c2) return (c1 > c2);
 443    i1++; i2++;
 444    /* 12 */
 445    c1 = block[i1]; c2 = block[i2];
 446    if (c1 != c2) return (c1 > c2);
 447    i1++; i2++;
 448
 449    k = nblock + 8;
 450
 451    do {
 452       /* 1 */
 453       c1 = block[i1]; c2 = block[i2];
 454       if (c1 != c2) return (c1 > c2);
 455       s1 = quadrant[i1]; s2 = quadrant[i2];
 456       if (s1 != s2) return (s1 > s2);
 457       i1++; i2++;
 458       /* 2 */
 459       c1 = block[i1]; c2 = block[i2];
 460       if (c1 != c2) return (c1 > c2);
 461       s1 = quadrant[i1]; s2 = quadrant[i2];
 462       if (s1 != s2) return (s1 > s2);
 463       i1++; i2++;
 464       /* 3 */
 465       c1 = block[i1]; c2 = block[i2];
 466       if (c1 != c2) return (c1 > c2);
 467       s1 = quadrant[i1]; s2 = quadrant[i2];
 468       if (s1 != s2) return (s1 > s2);
 469       i1++; i2++;
 470       /* 4 */
 471       c1 = block[i1]; c2 = block[i2];
 472       if (c1 != c2) return (c1 > c2);
 473       s1 = quadrant[i1]; s2 = quadrant[i2];
 474       if (s1 != s2) return (s1 > s2);
 475       i1++; i2++;
 476       /* 5 */
 477       c1 = block[i1]; c2 = block[i2];
 478       if (c1 != c2) return (c1 > c2);
 479       s1 = quadrant[i1]; s2 = quadrant[i2];
 480       if (s1 != s2) return (s1 > s2);
 481       i1++; i2++;
 482       /* 6 */
 483       c1 = block[i1]; c2 = block[i2];
 484       if (c1 != c2) return (c1 > c2);
 485       s1 = quadrant[i1]; s2 = quadrant[i2];
 486       if (s1 != s2) return (s1 > s2);
 487       i1++; i2++;
 488       /* 7 */
 489       c1 = block[i1]; c2 = block[i2];
 490       if (c1 != c2) return (c1 > c2);
 491       s1 = quadrant[i1]; s2 = quadrant[i2];
 492       if (s1 != s2) return (s1 > s2);
 493       i1++; i2++;
 494       /* 8 */
 495       c1 = block[i1]; c2 = block[i2];
 496       if (c1 != c2) return (c1 > c2);
 497       s1 = quadrant[i1]; s2 = quadrant[i2];
 498       if (s1 != s2) return (s1 > s2);
 499       i1++; i2++;
 500
 501       if (i1 >= nblock) i1 -= nblock;
 502       if (i2 >= nblock) i2 -= nblock;
 503
 504       k -= 8;
 505       (*budget)--;
 506    }
 507       while (k >= 0);
 508
 509    return False;
 510 }
 511
 512
 513 /*---------------------------------------------*/
 514 /*--
 515    Knuth's increments seem to work better
 516    than Incerpi-Sedgewick here.  Possibly
 517    because the number of elems to sort is
 518    usually small, typically <= 20.
 519 --*/
 520 static
 521 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 522                    9841, 29524, 88573, 265720,
 523                    797161, 2391484 };
 524
 525 static
 526 void mainSimpleSort ( UInt32* ptr,
 527                       UChar*  block,
 528                       UInt16* quadrant,
 529                       Int32   nblock,
 530                       Int32   lo,
 531                       Int32   hi,
 532                       Int32   d,
 533                       Int32*  budget )
 534 {
 535    Int32 i, j, h, bigN, hp;
 536    UInt32 v;
 537
 538    bigN = hi - lo + 1;
 539    if (bigN < 2) return;
 540
 541    hp = 0;
 542    while (incs[hp] < bigN) hp++;
 543    hp--;
 544
 545    for (; hp >= 0; hp--) {
 546       h = incs[hp];
 547
 548       i = lo + h;
 549       while (True) {
 550
 551          /*-- copy 1 --*/
 552          if (i > hi) break;
 553          v = ptr[i];
 554          j = i;
 555          while ( mainGtU (
 556                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 557                  ) ) {
 558             ptr[j] = ptr[j-h];
 559             j = j - h;
 560             if (j <= (lo + h - 1)) break;
 561          }
 562          ptr[j] = v;
 563          i++;
 564
 565          /*-- copy 2 --*/
 566          if (i > hi) break;
 567          v = ptr[i];
 568          j = i;
 569          while ( mainGtU (
 570                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 571                  ) ) {
 572             ptr[j] = ptr[j-h];
 573             j = j - h;
 574             if (j <= (lo + h - 1)) break;
 575          }
 576          ptr[j] = v;
 577          i++;
 578
 579          /*-- copy 3 --*/
 580          if (i > hi) break;
 581          v = ptr[i];
 582          j = i;
 583          while ( mainGtU (
 584                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 585                  ) ) {
 586             ptr[j] = ptr[j-h];
 587             j = j - h;
 588             if (j <= (lo + h - 1)) break;
 589          }
 590          ptr[j] = v;
 591          i++;
 592
 593          if (*budget < 0) return;
 594       }
 595    }
 596 }
 597
 598
 599 /*---------------------------------------------*/
 600 /*--
 601    The following is an implementation of
 602    an elegant 3-way quicksort for strings,
 603    described in a paper "Fast Algorithms for
 604    Sorting and Searching Strings", by Robert
 605    Sedgewick and Jon L. Bentley.
 606 --*/
 607
 608 #define mswap(zz1, zz2) \
 609    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 610
 611 #define mvswap(zzp1, zzp2, zzn)       \
 612 {                                     \
 613    Int32 yyp1 = (zzp1);               \
 614    Int32 yyp2 = (zzp2);               \
 615    Int32 yyn  = (zzn);                \
 616    while (yyn > 0) {                  \
 617       mswap(ptr[yyp1], ptr[yyp2]);    \
 618       yyp1++; yyp2++; yyn--;          \
 619    }                                  \
 620 }
 621
 622 static
 623 __inline__
 624 UChar mmed3 ( UChar a, UChar b, UChar c )
 625 {
 626    UChar t;
 627    if (a > b) { t = a; a = b; b = t; };
 628    if (b > c) {
 629       b = c;
 630       if (a > b) b = a;
 631    }
 632    return b;
 633 }
 634
 635 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
 636
 637 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 638                           stackHi[sp] = hz; \
 639                           stackD [sp] = dz; \
 640                           sp++; }
 641
 642 #define mpop(lz,hz,dz) { sp--;             \
 643                          lz = stackLo[sp]; \
 644                          hz = stackHi[sp]; \
 645                          dz = stackD [sp]; }
 646
 647
 648 #define mnextsize(az) (nextHi[az]-nextLo[az])
 649
 650 #define mnextswap(az,bz)                                        \
 651    { Int32 tz;                                                  \
 652      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 653      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 654      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 655
 656
 657 #define MAIN_QSORT_SMALL_THRESH 20
 658 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 659 #define MAIN_QSORT_STACK_SIZE 100
 660
 661 static
 662 void mainQSort3 ( UInt32* ptr,
 663                   UChar*  block,
 664                   UInt16* quadrant,
 665                   Int32   nblock,
 666                   Int32   loSt,
 667                   Int32   hiSt,
 668                   Int32   dSt,
 669                   Int32*  budget )
 670 {
 671    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 672    Int32 sp, lo, hi, d;
 673
 674    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 675    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 676    Int32 stackD [MAIN_QSORT_STACK_SIZE];
 677
 678    Int32 nextLo[3];
 679    Int32 nextHi[3];
 680    Int32 nextD [3];
 681
 682    sp = 0;
 683    mpush ( loSt, hiSt, dSt );
 684
 685    while (sp > 0) {
 686
 687       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
 688
 689       mpop ( lo, hi, d );
 690       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
 691           d > MAIN_QSORT_DEPTH_THRESH) {
 692          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 693          if (*budget < 0) return;
 694          continue;
 695       }
 696
 697       med = (Int32)
 698             mmed3 ( block[ptr[ lo         ]+d],
 699                     block[ptr[ hi         ]+d],
 700                     block[ptr[ (lo+hi)>>1 ]+d] );
 701
 702       unLo = ltLo = lo;
 703       unHi = gtHi = hi;
 704
 705       while (True) {
 706          while (True) {
 707             if (unLo > unHi) break;
 708             n = ((Int32)block[ptr[unLo]+d]) - med;
 709             if (n == 0) {
 710                mswap(ptr[unLo], ptr[ltLo]);
 711                ltLo++; unLo++; continue;
 712             };
 713             if (n >  0) break;
 714             unLo++;
 715          }
 716          while (True) {
 717             if (unLo > unHi) break;
 718             n = ((Int32)block[ptr[unHi]+d]) - med;
 719             if (n == 0) {
 720                mswap(ptr[unHi], ptr[gtHi]);
 721                gtHi--; unHi--; continue;
 722             };
 723             if (n <  0) break;
 724             unHi--;
 725          }
 726          if (unLo > unHi) break;
 727          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 728       }
 729
 730       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 731
 732       if (gtHi < ltLo) {
 733          mpush(lo, hi, d+1 );
 734          continue;
 735       }
 736
 737       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 738       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 739
 740       n = lo + unLo - ltLo - 1;
 741       m = hi - (gtHi - unHi) + 1;
 742
 743       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 744       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 745       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 746
 747       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 748       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 749       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 750
 751       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 752       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 753
 754       mpush (nextLo[0], nextHi[0], nextD[0]);
 755       mpush (nextLo[1], nextHi[1], nextD[1]);
 756       mpush (nextLo[2], nextHi[2], nextD[2]);
 757    }
 758 }
 759
 760 #undef mswap
 761 #undef mvswap
 762 #undef mpush
 763 #undef mpop
 764 #undef mmin
 765 #undef mnextsize
 766 #undef mnextswap
 767 #undef MAIN_QSORT_SMALL_THRESH
 768 #undef MAIN_QSORT_DEPTH_THRESH
 769 #undef MAIN_QSORT_STACK_SIZE
 770
 771
 772 /*---------------------------------------------*/
 773 /* Pre:
 774       nblock > N_OVERSHOOT
 775       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 776       ((UChar*)block32) [0 .. nblock-1] holds block
 777       ptr exists for [0 .. nblock-1]
 778
 779    Post:
 780       ((UChar*)block32) [0 .. nblock-1] holds block
 781       All other areas of block32 destroyed
 782       ftab [0 .. 65536 ] destroyed
 783       ptr [0 .. nblock-1] holds sorted order
 784       if (*budget < 0), sorting was abandoned
 785 */
 786
 787 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 788 #define SETMASK (1 << 21)
 789 #define CLEARMASK (~(SETMASK))
 790
 791 static
 792 void mainSort ( UInt32* ptr,
 793                 UChar*  block,
 794                 UInt16* quadrant,
 795                 UInt32* ftab,
 796                 Int32   nblock,
 797                 Int32   verb,
 798                 Int32*  budget )
 799 {
 800    Int32  i, j, k, ss, sb;
 801    Int32  runningOrder[256];
 802    Bool   bigDone[256];
 803    Int32  copyStart[256];
 804    Int32  copyEnd  [256];
 805    UChar  c1;
 806    Int32  numQSorted;
 807    UInt16 s;
 808    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 809
 810    /*-- set up the 2-byte frequency table --*/
 811    for (i = 65536; i >= 0; i--) ftab[i] = 0;
 812
 813    j = block[0] << 8;
 814    i = nblock-1;
 815    for (; i >= 3; i -= 4) {
 816       quadrant[i] = 0;
 817       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 818       ftab[j]++;
 819       quadrant[i-1] = 0;
 820       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 821       ftab[j]++;
 822       quadrant[i-2] = 0;
 823       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 824       ftab[j]++;
 825       quadrant[i-3] = 0;
 826       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 827       ftab[j]++;
 828    }
 829    for (; i >= 0; i--) {
 830       quadrant[i] = 0;
 831       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 832       ftab[j]++;
 833    }
 834
 835    /*-- (emphasises close relationship of block & quadrant) --*/
 836    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 837       block   [nblock+i] = block[i];
 838       quadrant[nblock+i] = 0;
 839    }
 840
 841    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 842
 843    /*-- Complete the initial radix sort --*/
 844    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 845
 846    s = block[0] << 8;
 847    i = nblock-1;
 848    for (; i >= 3; i -= 4) {
 849       s = (s >> 8) | (block[i] << 8);
 850       j = ftab[s] -1;
 851       ftab[s] = j;
 852       ptr[j] = i;
 853       s = (s >> 8) | (block[i-1] << 8);
 854       j = ftab[s] -1;
 855       ftab[s] = j;
 856       ptr[j] = i-1;
 857       s = (s >> 8) | (block[i-2] << 8);
 858       j = ftab[s] -1;
 859       ftab[s] = j;
 860       ptr[j] = i-2;
 861       s = (s >> 8) | (block[i-3] << 8);
 862       j = ftab[s] -1;
 863       ftab[s] = j;
 864       ptr[j] = i-3;
 865    }
 866    for (; i >= 0; i--) {
 867       s = (s >> 8) | (block[i] << 8);
 868       j = ftab[s] -1;
 869       ftab[s] = j;
 870       ptr[j] = i;
 871    }
 872
 873    /*--
 874       Now ftab contains the first loc of every small bucket.
 875       Calculate the running order, from smallest to largest
 876       big bucket.
 877    --*/
 878    for (i = 0; i <= 255; i++) {
 879       bigDone     [i] = False;
 880       runningOrder[i] = i;
 881    }
 882
 883    {
 884       Int32 vv;
 885       Int32 h = 1;
 886       do h = 3 * h + 1; while (h <= 256);
 887       do {
 888          h = h / 3;
 889          for (i = h; i <= 255; i++) {
 890             vv = runningOrder[i];
 891             j = i;
 892             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 893                runningOrder[j] = runningOrder[j-h];
 894                j = j - h;
 895                if (j <= (h - 1)) goto zero;
 896             }
 897             zero:
 898             runningOrder[j] = vv;
 899          }
 900       } while (h != 1);
 901    }
 902
 903    /*--
 904       The main sorting loop.
 905    --*/
 906
 907    numQSorted = 0;
 908
 909    for (i = 0; i <= 255; i++) {
 910
 911       /*--
 912          Process big buckets, starting with the least full.
 913          Basically this is a 3-step process in which we call
 914          mainQSort3 to sort the small buckets [ss, j], but
 915          also make a big effort to avoid the calls if we can.
 916       --*/
 917       ss = runningOrder[i];
 918
 919       /*--
 920          Step 1:
 921          Complete the big bucket [ss] by quicksorting
 922          any unsorted small buckets [ss, j], for j != ss.
 923          Hopefully previous pointer-scanning phases have already
 924          completed many of the small buckets [ss, j], so
 925          we don't have to sort them at all.
 926       --*/
 927       for (j = 0; j <= 255; j++) {
 928          if (j != ss) {
 929             sb = (ss << 8) + j;
 930             if ( ! (ftab[sb] & SETMASK) ) {
 931                Int32 lo = ftab[sb]   & CLEARMASK;
 932                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 933                if (hi > lo) {
 934                   if (verb >= 4)
 935                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 936                                 "done %d   this %d\n",
 937                                 ss, j, numQSorted, hi - lo + 1 );
 938                   mainQSort3 (
 939                      ptr, block, quadrant, nblock,
 940                      lo, hi, BZ_N_RADIX, budget
 941                   );
 942                   numQSorted += (hi - lo + 1);
 943                   if (*budget < 0) return;
 944                }
 945             }
 946             ftab[sb] |= SETMASK;
 947          }
 948       }
 949
 950       AssertH ( !bigDone[ss], 1006 );
 951
 952       /*--
 953          Step 2:
 954          Now scan this big bucket [ss] so as to synthesise the
 955          sorted order for small buckets [t, ss] for all t,
 956          including, magically, the bucket [ss,ss] too.
 957          This will avoid doing Real Work in subsequent Step 1's.
 958       --*/
 959       {
 960          for (j = 0; j <= 255; j++) {
 961             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 962             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 963          }
 964          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 965             k = ptr[j]-1; if (k < 0) k += nblock;
 966             c1 = block[k];
 967             if (!bigDone[c1])
 968                ptr[ copyStart[c1]++ ] = k;
 969          }
 970          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 971             k = ptr[j]-1; if (k < 0) k += nblock;
 972             c1 = block[k];
 973             if (!bigDone[c1])
 974                ptr[ copyEnd[c1]-- ] = k;
 975          }
 976       }
 977
 978       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 979                 ||
 980                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 981                    Necessity for this case is demonstrated by compressing
 982                    a sequence of approximately 48.5 million of character
 983                    251; 1.0.0/1.0.1 will then die here. */
 984                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 985                 1007 )
 986
 987       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 988
 989       /*--
 990          Step 3:
 991          The [ss] big bucket is now done.  Record this fact,
 992          and update the quadrant descriptors.  Remember to
 993          update quadrants in the overshoot area too, if
 994          necessary.  The "if (i < 255)" test merely skips
 995          this updating for the last bucket processed, since
 996          updating for the last bucket is pointless.
 997
 998          The quadrant array provides a way to incrementally
 999          cache sort orderings, as they appear, so as to
1000          make subsequent comparisons in fullGtU() complete
1001          faster.  For repetitive blocks this makes a big
1002          difference (but not big enough to be able to avoid
1003          the fallback sorting mechanism, exponential radix sort).
1004
1005          The precise meaning is: at all times:
1006
1007             for 0 <= i < nblock and 0 <= j <= nblock
1008
1009             if block[i] != block[j],
1010
1011                then the relative values of quadrant[i] and
1012                     quadrant[j] are meaningless.
1013
1014                else {
1015                   if quadrant[i] < quadrant[j]
1016                      then the string starting at i lexicographically
1017                      precedes the string starting at j
1018
1019                   else if quadrant[i] > quadrant[j]
1020                      then the string starting at j lexicographically
1021                      precedes the string starting at i
1022
1023                   else
1024                      the relative ordering of the strings starting
1025                      at i and j has not yet been determined.
1026                }
1027       --*/
1028       bigDone[ss] = True;
1029
1030       if (i < 255) {
1031          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1032          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1033          Int32 shifts   = 0;
1034
1035          while ((bbSize >> shifts) > 65534) shifts++;
1036
1037          for (j = bbSize-1; j >= 0; j--) {
1038             Int32 a2update     = ptr[bbStart + j];
1039             UInt16 qVal        = (UInt16)(j >> shifts);
1040             quadrant[a2update] = qVal;
1041             if (a2update < BZ_N_OVERSHOOT)
1042                quadrant[a2update + nblock] = qVal;
1043          }
1044          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1045       }
1046
1047    }
1048
1049    if (verb >= 4)
1050       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1051                  nblock, numQSorted, nblock - numQSorted );
1052 }
1053
1054 #undef BIGFREQ
1055 #undef SETMASK
1056 #undef CLEARMASK
1057
1058
1059 /*---------------------------------------------*/
1060 /* Pre:
1061       nblock > 0
1062       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1063       ((UChar*)arr2)  [0 .. nblock-1] holds block
1064       arr1 exists for [0 .. nblock-1]
1065
1066    Post:
1067       ((UChar*)arr2) [0 .. nblock-1] holds block
1068       All other areas of block destroyed
1069       ftab [ 0 .. 65536 ] destroyed
1070       arr1 [0 .. nblock-1] holds sorted order
1071 */
1072 void BZ2_blockSort ( EState* s )
1073 {
1074    UInt32* ptr    = s->ptr;
1075    UChar*  block  = s->block;
1076    UInt32* ftab   = s->ftab;
1077    Int32   nblock = s->nblock;
1078    Int32   verb   = s->verbosity;
1079    Int32   wfact  = s->workFactor;
1080    UInt16* quadrant;
1081    Int32   budget;
1082    Int32   budgetInit;
1083    Int32   i;
1084
1085    if (nblock < 10000) {
1086       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1087    } else {
1088       /* Calculate the location for quadrant, remembering to get
1089          the alignment right.  Assumes that &(block[0]) is at least
1090          2-byte aligned -- this should be ok since block is really
1091          the first section of arr2.
1092       */
1093       i = nblock+BZ_N_OVERSHOOT;
1094       if (i & 1) i++;
1095       quadrant = (UInt16*)(&(block[i]));
1096
1097       /* (wfact-1) / 3 puts the default-factor-30
1098          transition point at very roughly the same place as
1099          with v0.1 and v0.9.0.
1100          Not that it particularly matters any more, since the
1101          resulting compressed stream is now the same regardless
1102          of whether or not we use the main sort or fallback sort.
1103       */
1104       if (wfact < 1  ) wfact = 1;
1105       if (wfact > 100) wfact = 100;
1106       budgetInit = nblock * ((wfact-1) / 3);
1107       budget = budgetInit;
1108
1109       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1110       if (verb >= 3)
1111          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1112                     budgetInit - budget,
1113                     nblock,
1114                     (float)(budgetInit - budget) /
1115                     (float)(nblock==0 ? 1 : nblock) );
1116       if (budget < 0) {
1117          if (verb >= 2)
1118             VPrintf0 ( "    too repetitive; using fallback"
1119                        " sorting algorithm\n" );
1120          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1121       }
1122    }
1123
1124    s->origPtr = -1;
1125    for (i = 0; i < s->nblock; i++)
1126       if (ptr[i] == 0)
1127          { s->origPtr = i; break; };
1128
1129    AssertH( s->origPtr != -1, 1003 );
1130 }
1131
1132
1133 /*-------------------------------------------------------------*/
1134 /*--- end                                       blocksort.c ---*/
1135 /*-------------------------------------------------------------*/