lib/bzip2/bzlib_blocksort.c

   1
   2 /*-------------------------------------------------------------*/
   3 /*--- Block sorting machinery                               ---*/
   4 /*---                                           blocksort.c ---*/
   5 /*-------------------------------------------------------------*/
   6
   7 /*--
   8   This file is a part of bzip2 and/or libbzip2, a program and
   9   library for lossless, block-sorting data compression.
  10
  11   Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
  12
  13   Redistribution and use in source and binary forms, with or without
  14   modification, are permitted provided that the following conditions
  15   are met:
  16
  17   1. Redistributions of source code must retain the above copyright
  18      notice, this list of conditions and the following disclaimer.
  19
  20   2. The origin of this software must not be misrepresented; you must
  21      not claim that you wrote the original software.  If you use this
  22      software in a product, an acknowledgment in the product
  23      documentation would be appreciated but is not required.
  24
  25   3. Altered source versions must be plainly marked as such, and must
  26      not be misrepresented as being the original software.
  27
  28   4. The name of the author may not be used to endorse or promote
  29      products derived from this software without specific prior written
  30      permission.
  31
  32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44   Julian Seward, Cambridge, UK.
  45   jseward@acm.org
  46   bzip2/libbzip2 version 1.0.6 of 6 September 2010
  47   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
  48
  49   This program is based on (at least) the work of:
  50      Mike Burrows
  51      David Wheeler
  52      Peter Fenwick
  53      Alistair Moffat
  54      Radford Neal
  55      Ian H. Witten
  56      Robert Sedgewick
  57      Jon L. Bentley
  58
  59   For more information on these sources, see the manual.
  60 --*/
  61
  62 #include "bzlib_private.h"
  63
  64 /*---------------------------------------------*/
  65 /*--- Fallback O(N log(N)^2) sorting        ---*/
  66 /*--- algorithm, for repetitive blocks      ---*/
  67 /*---------------------------------------------*/
  68
  69 /*---------------------------------------------*/
  70 static
  71 __inline__
  72 void fallbackSimpleSort ( UInt32* fmap,
  73                           UInt32* eclass,
  74                           Int32   lo,
  75                           Int32   hi )
  76 {
  77    Int32 i, j, tmp;
  78    UInt32 ec_tmp;
  79
  80    if (lo == hi) return;
  81
  82    if (hi - lo > 3) {
  83       for ( i = hi-4; i >= lo; i-- ) {
  84          tmp = fmap[i];
  85          ec_tmp = eclass[tmp];
  86          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  87             fmap[j-4] = fmap[j];
  88          fmap[j-4] = tmp;
  89       }
  90    }
  91
  92    for ( i = hi-1; i >= lo; i-- ) {
  93       tmp = fmap[i];
  94       ec_tmp = eclass[tmp];
  95       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
  96          fmap[j-1] = fmap[j];
  97       fmap[j-1] = tmp;
  98    }
  99 }
 100
 101
 102 /*---------------------------------------------*/
 103 #define fswap(zz1, zz2) \
 104    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 105
 106 #define fvswap(zzp1, zzp2, zzn)       \
 107 {                                     \
 108    Int32 yyp1 = (zzp1);               \
 109    Int32 yyp2 = (zzp2);               \
 110    Int32 yyn  = (zzn);                \
 111    while (yyn > 0) {                  \
 112       fswap(fmap[yyp1], fmap[yyp2]);  \
 113       yyp1++; yyp2++; yyn--;          \
 114    }                                  \
 115 }
 116
 117
 118 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
 119
 120 #define fpush(lz,hz) { stackLo[sp] = lz; \
 121                        stackHi[sp] = hz; \
 122                        sp++; }
 123
 124 #define fpop(lz,hz) { sp--;              \
 125                       lz = stackLo[sp];  \
 126                       hz = stackHi[sp]; }
 127
 128 #define FALLBACK_QSORT_SMALL_THRESH 10
 129 #define FALLBACK_QSORT_STACK_SIZE   100
 130
 131
 132 static
 133 void fallbackQSort3 ( UInt32* fmap,
 134                       UInt32* eclass,
 135                       Int32   loSt,
 136                       Int32   hiSt )
 137 {
 138    Int32 unLo, unHi, ltLo, gtHi, n, m;
 139    Int32 sp, lo, hi;
 140    UInt32 med, r, r3;
 141    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 142    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 143
 144    r = 0;
 145
 146    sp = 0;
 147    fpush ( loSt, hiSt );
 148
 149    while (sp > 0) {
 150
 151       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
 152
 153       fpop ( lo, hi );
 154       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 155          fallbackSimpleSort ( fmap, eclass, lo, hi );
 156          continue;
 157       }
 158
 159       /* Random partitioning.  Median of 3 sometimes fails to
 160          avoid bad cases.  Median of 9 seems to help but
 161          looks rather expensive.  This too seems to work but
 162          is cheaper.  Guidance for the magic constants
 163          7621 and 32768 is taken from Sedgewick's algorithms
 164          book, chapter 35.
 165       */
 166       r = ((r * 7621) + 1) % 32768;
 167       r3 = r % 3;
 168       if (r3 == 0) med = eclass[fmap[lo]]; else
 169       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 170                    med = eclass[fmap[hi]];
 171
 172       unLo = ltLo = lo;
 173       unHi = gtHi = hi;
 174
 175       while (1) {
 176          while (1) {
 177             if (unLo > unHi) break;
 178             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 179             if (n == 0) {
 180                fswap(fmap[unLo], fmap[ltLo]);
 181                ltLo++; unLo++;
 182                continue;
 183             };
 184             if (n > 0) break;
 185             unLo++;
 186          }
 187          while (1) {
 188             if (unLo > unHi) break;
 189             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 190             if (n == 0) {
 191                fswap(fmap[unHi], fmap[gtHi]);
 192                gtHi--; unHi--;
 193                continue;
 194             };
 195             if (n < 0) break;
 196             unHi--;
 197          }
 198          if (unLo > unHi) break;
 199          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 200       }
 201
 202       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 203
 204       if (gtHi < ltLo) continue;
 205
 206       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 207       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 208
 209       n = lo + unLo - ltLo - 1;
 210       m = hi - (gtHi - unHi) + 1;
 211
 212       if (n - lo > hi - m) {
 213          fpush ( lo, n );
 214          fpush ( m, hi );
 215       } else {
 216          fpush ( m, hi );
 217          fpush ( lo, n );
 218       }
 219    }
 220 }
 221
 222 #undef fmin
 223 #undef fpush
 224 #undef fpop
 225 #undef fswap
 226 #undef fvswap
 227 #undef FALLBACK_QSORT_SMALL_THRESH
 228 #undef FALLBACK_QSORT_STACK_SIZE
 229
 230
 231 /*---------------------------------------------*/
 232 /* Pre:
 233       nblock > 0
 234       eclass exists for [0 .. nblock-1]
 235       ((UChar*)eclass) [0 .. nblock-1] holds block
      fmap exists for [0 .. nblock-1]
 237
 238    Post:
 239       ((UChar*)eclass) [0 .. nblock-1] holds block
 240       All other areas of eclass destroyed
 241       fmap [0 .. nblock-1] holds sorted order
 242       bhtab [ 0 .. 2+(nblock/32) ] destroyed
 243 */
 244
 245 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 246 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 247 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 248 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
 249 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 250
 251 static
 252 void fallbackSort ( UInt32* fmap,
 253                     UInt32* eclass,
 254                     UInt32* bhtab,
 255                     Int32   nblock,
 256                     Int32   verb )
 257 {
 258    Int32 ftab[257];
 259    Int32 ftabCopy[256];
 260    Int32 H, i, j, k, l, r, cc, cc1;
 261    Int32 nNotDone;
 262    Int32 nBhtab;
 263    UChar* eclass8 = (UChar*)eclass;
 264
 265    /*--
 266       Initial 1-char radix sort to generate
 267       initial fmap and initial BH bits.
 268    --*/
 269    if (verb >= 4)
 270       VPrintf0 ( "        bucket sorting ...\n" );
 271    for (i = 0; i < 257;    i++) ftab[i] = 0;
 272    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 273    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 274    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 275
 276    for (i = 0; i < nblock; i++) {
 277       j = eclass8[i];
 278       k = ftab[j] - 1;
 279       ftab[j] = k;
 280       fmap[k] = i;
 281    }
 282
 283    nBhtab = 2 + (nblock / 32);
 284    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 285    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 286
 287    /*--
 288       Inductively refine the buckets.  Kind-of an
 289       "exponential radix sort" (!), inspired by the
 290       Manber-Myers suffix array construction algorithm.
 291    --*/
 292
 293    /*-- set sentinel bits for block-end detection --*/
 294    for (i = 0; i < 32; i++) {
 295       SET_BH(nblock + 2*i);
 296       CLEAR_BH(nblock + 2*i + 1);
 297    }
 298
 299    /*-- the log(N) loop --*/
 300    H = 1;
 301    while (1) {
 302
 303       if (verb >= 4)
 304          VPrintf1 ( "        depth %6d has ", H );
 305
 306       j = 0;
 307       for (i = 0; i < nblock; i++) {
 308          if (ISSET_BH(i)) j = i;
 309          k = fmap[i] - H; if (k < 0) k += nblock;
 310          eclass[k] = j;
 311       }
 312
 313       nNotDone = 0;
 314       r = -1;
 315       while (1) {
 316
 317          /*-- find the next non-singleton bucket --*/
 318          k = r + 1;
 319          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 320          if (ISSET_BH(k)) {
 321             while (WORD_BH(k) == 0xffffffff) k += 32;
 322             while (ISSET_BH(k)) k++;
 323          }
 324          l = k - 1;
 325          if (l >= nblock) break;
 326          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 327          if (!ISSET_BH(k)) {
 328             while (WORD_BH(k) == 0x00000000) k += 32;
 329             while (!ISSET_BH(k)) k++;
 330          }
 331          r = k - 1;
 332          if (r >= nblock) break;
 333
 334          /*-- now [l, r] bracket current bucket --*/
 335          if (r > l) {
 336             nNotDone += (r - l + 1);
 337             fallbackQSort3 ( fmap, eclass, l, r );
 338
 339             /*-- scan bucket and generate header bits-- */
 340             cc = -1;
 341             for (i = l; i <= r; i++) {
 342                cc1 = eclass[fmap[i]];
 343                if (cc != cc1) { SET_BH(i); cc = cc1; };
 344             }
 345          }
 346       }
 347
 348       if (verb >= 4)
 349          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 350
 351       H *= 2;
 352       if (H > nblock || nNotDone == 0) break;
 353    }
 354
 355    /*--
 356       Reconstruct the original block in
 357       eclass8 [0 .. nblock-1], since the
 358       previous phase destroyed it.
 359    --*/
 360    if (verb >= 4)
 361       VPrintf0 ( "        reconstructing block ...\n" );
 362    j = 0;
 363    for (i = 0; i < nblock; i++) {
 364       while (ftabCopy[j] == 0) j++;
 365       ftabCopy[j]--;
 366       eclass8[fmap[i]] = (UChar)j;
 367    }
 368    AssertH ( j < 256, 1005 );
 369 }
 370
 371 #undef       SET_BH
 372 #undef     CLEAR_BH
 373 #undef     ISSET_BH
 374 #undef      WORD_BH
 375 #undef UNALIGNED_BH
 376
 377
 378 /*---------------------------------------------*/
 379 /*--- The main, O(N^2 log(N)) sorting       ---*/
 380 /*--- algorithm.  Faster for "normal"       ---*/
 381 /*--- non-repetitive blocks.                ---*/
 382 /*---------------------------------------------*/
 383
 384 /*---------------------------------------------*/
 385 static
 386 __inline__
 387 Bool mainGtU ( UInt32  i1,
 388                UInt32  i2,
 389                UChar*  block,
 390                UInt16* quadrant,
 391                UInt32  nblock,
 392                Int32*  budget )
 393 {
 394    Int32  k;
 395    UChar  c1, c2;
 396    UInt16 s1, s2;
 397
 398    AssertD ( i1 != i2, "mainGtU" );
 399    /* 1 */
 400    c1 = block[i1]; c2 = block[i2];
 401    if (c1 != c2) return (c1 > c2);
 402    i1++; i2++;
 403    /* 2 */
 404    c1 = block[i1]; c2 = block[i2];
 405    if (c1 != c2) return (c1 > c2);
 406    i1++; i2++;
 407    /* 3 */
 408    c1 = block[i1]; c2 = block[i2];
 409    if (c1 != c2) return (c1 > c2);
 410    i1++; i2++;
 411    /* 4 */
 412    c1 = block[i1]; c2 = block[i2];
 413    if (c1 != c2) return (c1 > c2);
 414    i1++; i2++;
 415    /* 5 */
 416    c1 = block[i1]; c2 = block[i2];
 417    if (c1 != c2) return (c1 > c2);
 418    i1++; i2++;
 419    /* 6 */
 420    c1 = block[i1]; c2 = block[i2];
 421    if (c1 != c2) return (c1 > c2);
 422    i1++; i2++;
 423    /* 7 */
 424    c1 = block[i1]; c2 = block[i2];
 425    if (c1 != c2) return (c1 > c2);
 426    i1++; i2++;
 427    /* 8 */
 428    c1 = block[i1]; c2 = block[i2];
 429    if (c1 != c2) return (c1 > c2);
 430    i1++; i2++;
 431    /* 9 */
 432    c1 = block[i1]; c2 = block[i2];
 433    if (c1 != c2) return (c1 > c2);
 434    i1++; i2++;
 435    /* 10 */
 436    c1 = block[i1]; c2 = block[i2];
 437    if (c1 != c2) return (c1 > c2);
 438    i1++; i2++;
 439    /* 11 */
 440    c1 = block[i1]; c2 = block[i2];
 441    if (c1 != c2) return (c1 > c2);
 442    i1++; i2++;
 443    /* 12 */
 444    c1 = block[i1]; c2 = block[i2];
 445    if (c1 != c2) return (c1 > c2);
 446    i1++; i2++;
 447
 448    k = nblock + 8;
 449
 450    do {
 451       /* 1 */
 452       c1 = block[i1]; c2 = block[i2];
 453       if (c1 != c2) return (c1 > c2);
 454       s1 = quadrant[i1]; s2 = quadrant[i2];
 455       if (s1 != s2) return (s1 > s2);
 456       i1++; i2++;
 457       /* 2 */
 458       c1 = block[i1]; c2 = block[i2];
 459       if (c1 != c2) return (c1 > c2);
 460       s1 = quadrant[i1]; s2 = quadrant[i2];
 461       if (s1 != s2) return (s1 > s2);
 462       i1++; i2++;
 463       /* 3 */
 464       c1 = block[i1]; c2 = block[i2];
 465       if (c1 != c2) return (c1 > c2);
 466       s1 = quadrant[i1]; s2 = quadrant[i2];
 467       if (s1 != s2) return (s1 > s2);
 468       i1++; i2++;
 469       /* 4 */
 470       c1 = block[i1]; c2 = block[i2];
 471       if (c1 != c2) return (c1 > c2);
 472       s1 = quadrant[i1]; s2 = quadrant[i2];
 473       if (s1 != s2) return (s1 > s2);
 474       i1++; i2++;
 475       /* 5 */
 476       c1 = block[i1]; c2 = block[i2];
 477       if (c1 != c2) return (c1 > c2);
 478       s1 = quadrant[i1]; s2 = quadrant[i2];
 479       if (s1 != s2) return (s1 > s2);
 480       i1++; i2++;
 481       /* 6 */
 482       c1 = block[i1]; c2 = block[i2];
 483       if (c1 != c2) return (c1 > c2);
 484       s1 = quadrant[i1]; s2 = quadrant[i2];
 485       if (s1 != s2) return (s1 > s2);
 486       i1++; i2++;
 487       /* 7 */
 488       c1 = block[i1]; c2 = block[i2];
 489       if (c1 != c2) return (c1 > c2);
 490       s1 = quadrant[i1]; s2 = quadrant[i2];
 491       if (s1 != s2) return (s1 > s2);
 492       i1++; i2++;
 493       /* 8 */
 494       c1 = block[i1]; c2 = block[i2];
 495       if (c1 != c2) return (c1 > c2);
 496       s1 = quadrant[i1]; s2 = quadrant[i2];
 497       if (s1 != s2) return (s1 > s2);
 498       i1++; i2++;
 499
 500       if (i1 >= nblock) i1 -= nblock;
 501       if (i2 >= nblock) i2 -= nblock;
 502
 503       k -= 8;
 504       (*budget)--;
 505    }
 506       while (k >= 0);
 507
 508    return False;
 509 }
 510
 511
 512 /*---------------------------------------------*/
 513 /*--
 514    Knuth's increments seem to work better
 515    than Incerpi-Sedgewick here.  Possibly
 516    because the number of elems to sort is
 517    usually small, typically <= 20.
 518 --*/
 519 static
 520 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 521                    9841, 29524, 88573, 265720,
 522                    797161, 2391484 };
 523
 524 static
 525 void mainSimpleSort ( UInt32* ptr,
 526                       UChar*  block,
 527                       UInt16* quadrant,
 528                       Int32   nblock,
 529                       Int32   lo,
 530                       Int32   hi,
 531                       Int32   d,
 532                       Int32*  budget )
 533 {
 534    Int32 i, j, h, bigN, hp;
 535    UInt32 v;
 536
 537    bigN = hi - lo + 1;
 538    if (bigN < 2) return;
 539
 540    hp = 0;
 541    while (incs[hp] < bigN) hp++;
 542    hp--;
 543
 544    for (; hp >= 0; hp--) {
 545       h = incs[hp];
 546
 547       i = lo + h;
 548       while (True) {
 549
 550          /*-- copy 1 --*/
 551          if (i > hi) break;
 552          v = ptr[i];
 553          j = i;
 554          while ( mainGtU (
 555                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 556                  ) ) {
 557             ptr[j] = ptr[j-h];
 558             j = j - h;
 559             if (j <= (lo + h - 1)) break;
 560          }
 561          ptr[j] = v;
 562          i++;
 563
 564          /*-- copy 2 --*/
 565          if (i > hi) break;
 566          v = ptr[i];
 567          j = i;
 568          while ( mainGtU (
 569                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 570                  ) ) {
 571             ptr[j] = ptr[j-h];
 572             j = j - h;
 573             if (j <= (lo + h - 1)) break;
 574          }
 575          ptr[j] = v;
 576          i++;
 577
 578          /*-- copy 3 --*/
 579          if (i > hi) break;
 580          v = ptr[i];
 581          j = i;
 582          while ( mainGtU (
 583                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 584                  ) ) {
 585             ptr[j] = ptr[j-h];
 586             j = j - h;
 587             if (j <= (lo + h - 1)) break;
 588          }
 589          ptr[j] = v;
 590          i++;
 591
 592          if (*budget < 0) return;
 593       }
 594    }
 595 }
 596
 597
 598 /*---------------------------------------------*/
 599 /*--
 600    The following is an implementation of
 601    an elegant 3-way quicksort for strings,
 602    described in a paper "Fast Algorithms for
 603    Sorting and Searching Strings", by Robert
 604    Sedgewick and Jon L. Bentley.
 605 --*/
 606
 607 #define mswap(zz1, zz2) \
 608    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 609
 610 #define mvswap(zzp1, zzp2, zzn)       \
 611 {                                     \
 612    Int32 yyp1 = (zzp1);               \
 613    Int32 yyp2 = (zzp2);               \
 614    Int32 yyn  = (zzn);                \
 615    while (yyn > 0) {                  \
 616       mswap(ptr[yyp1], ptr[yyp2]);    \
 617       yyp1++; yyp2++; yyn--;          \
 618    }                                  \
 619 }
 620
 621 static
 622 __inline__
 623 UChar mmed3 ( UChar a, UChar b, UChar c )
 624 {
 625    UChar t;
 626    if (a > b) { t = a; a = b; b = t; };
 627    if (b > c) {
 628       b = c;
 629       if (a > b) b = a;
 630    }
 631    return b;
 632 }
 633
 634 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
 635
 636 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 637                           stackHi[sp] = hz; \
 638                           stackD [sp] = dz; \
 639                           sp++; }
 640
 641 #define mpop(lz,hz,dz) { sp--;             \
 642                          lz = stackLo[sp]; \
 643                          hz = stackHi[sp]; \
 644                          dz = stackD [sp]; }
 645
 646
 647 #define mnextsize(az) (nextHi[az]-nextLo[az])
 648
 649 #define mnextswap(az,bz)                                        \
 650    { Int32 tz;                                                  \
 651      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 652      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 653      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 654
 655
 656 #define MAIN_QSORT_SMALL_THRESH 20
 657 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 658 #define MAIN_QSORT_STACK_SIZE 100
 659
 660 static
 661 void mainQSort3 ( UInt32* ptr,
 662                   UChar*  block,
 663                   UInt16* quadrant,
 664                   Int32   nblock,
 665                   Int32   loSt,
 666                   Int32   hiSt,
 667                   Int32   dSt,
 668                   Int32*  budget )
 669 {
 670    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 671    Int32 sp, lo, hi, d;
 672
 673    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 674    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 675    Int32 stackD [MAIN_QSORT_STACK_SIZE];
 676
 677    Int32 nextLo[3];
 678    Int32 nextHi[3];
 679    Int32 nextD [3];
 680
 681    sp = 0;
 682    mpush ( loSt, hiSt, dSt );
 683
 684    while (sp > 0) {
 685
 686       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
 687
 688       mpop ( lo, hi, d );
 689       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
 690           d > MAIN_QSORT_DEPTH_THRESH) {
 691          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 692          if (*budget < 0) return;
 693          continue;
 694       }
 695
 696       med = (Int32)
 697             mmed3 ( block[ptr[ lo         ]+d],
 698                     block[ptr[ hi         ]+d],
 699                     block[ptr[ (lo+hi)>>1 ]+d] );
 700
 701       unLo = ltLo = lo;
 702       unHi = gtHi = hi;
 703
 704       while (True) {
 705          while (True) {
 706             if (unLo > unHi) break;
 707             n = ((Int32)block[ptr[unLo]+d]) - med;
 708             if (n == 0) {
 709                mswap(ptr[unLo], ptr[ltLo]);
 710                ltLo++; unLo++; continue;
 711             };
 712             if (n >  0) break;
 713             unLo++;
 714          }
 715          while (True) {
 716             if (unLo > unHi) break;
 717             n = ((Int32)block[ptr[unHi]+d]) - med;
 718             if (n == 0) {
 719                mswap(ptr[unHi], ptr[gtHi]);
 720                gtHi--; unHi--; continue;
 721             };
 722             if (n <  0) break;
 723             unHi--;
 724          }
 725          if (unLo > unHi) break;
 726          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 727       }
 728
 729       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 730
 731       if (gtHi < ltLo) {
 732          mpush(lo, hi, d+1 );
 733          continue;
 734       }
 735
 736       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 737       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 738
 739       n = lo + unLo - ltLo - 1;
 740       m = hi - (gtHi - unHi) + 1;
 741
 742       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 743       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 744       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 745
 746       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 747       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 748       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 749
 750       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 751       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 752
 753       mpush (nextLo[0], nextHi[0], nextD[0]);
 754       mpush (nextLo[1], nextHi[1], nextD[1]);
 755       mpush (nextLo[2], nextHi[2], nextD[2]);
 756    }
 757 }
 758
 759 #undef mswap
 760 #undef mvswap
 761 #undef mpush
 762 #undef mpop
 763 #undef mmin
 764 #undef mnextsize
 765 #undef mnextswap
 766 #undef MAIN_QSORT_SMALL_THRESH
 767 #undef MAIN_QSORT_DEPTH_THRESH
 768 #undef MAIN_QSORT_STACK_SIZE
 769
 770
 771 /*---------------------------------------------*/
/* Pre:
      nblock > BZ_N_OVERSHOOT
      block exists for [0 .. nblock-1 + BZ_N_OVERSHOOT]
      block [0 .. nblock-1] holds the data to be sorted
      quadrant exists for [0 .. nblock-1 + BZ_N_OVERSHOOT]
      ptr exists for [0 .. nblock-1]

   Post:
      block [0 .. nblock-1] still holds the data
      block [nblock .. nblock-1 + BZ_N_OVERSHOOT] and all of
         quadrant are overwritten
      ftab [0 .. 65536 ] destroyed
      ptr [0 .. nblock-1] holds sorted order
      if (*budget < 0), sorting was abandoned
*/
 785
/*-- Size of big bucket b: the distance in ftab between the start of
     b's first small bucket and the start of the next big bucket. --*/
#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
/*-- Flag bit or'd into an ftab entry once the corresponding small
     bucket needs no further sorting. --*/
#define SETMASK (1 << 21)
/*-- Mask that strips the flag bit, leaving the bucket start offset. --*/
#define CLEARMASK (~(SETMASK))
 789
 790 static
 791 void mainSort ( UInt32* ptr,
 792                 UChar*  block,
 793                 UInt16* quadrant,
 794                 UInt32* ftab,
 795                 Int32   nblock,
 796                 Int32   verb,
 797                 Int32*  budget )
 798 {
 799    Int32  i, j, k, ss, sb;
 800    Int32  runningOrder[256];
 801    Bool   bigDone[256];
 802    Int32  copyStart[256];
 803    Int32  copyEnd  [256];
 804    UChar  c1;
 805    Int32  numQSorted;
 806    UInt16 s;
 807    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 808
 809    /*-- set up the 2-byte frequency table --*/
 810    for (i = 65536; i >= 0; i--) ftab[i] = 0;
 811
 812    j = block[0] << 8;
 813    i = nblock-1;
 814    for (; i >= 3; i -= 4) {
 815       quadrant[i] = 0;
 816       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 817       ftab[j]++;
 818       quadrant[i-1] = 0;
 819       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 820       ftab[j]++;
 821       quadrant[i-2] = 0;
 822       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 823       ftab[j]++;
 824       quadrant[i-3] = 0;
 825       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 826       ftab[j]++;
 827    }
 828    for (; i >= 0; i--) {
 829       quadrant[i] = 0;
 830       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 831       ftab[j]++;
 832    }
 833
 834    /*-- (emphasises close relationship of block & quadrant) --*/
 835    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 836       block   [nblock+i] = block[i];
 837       quadrant[nblock+i] = 0;
 838    }
 839
 840    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 841
 842    /*-- Complete the initial radix sort --*/
 843    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 844
 845    s = block[0] << 8;
 846    i = nblock-1;
 847    for (; i >= 3; i -= 4) {
 848       s = (s >> 8) | (block[i] << 8);
 849       j = ftab[s] -1;
 850       ftab[s] = j;
 851       ptr[j] = i;
 852       s = (s >> 8) | (block[i-1] << 8);
 853       j = ftab[s] -1;
 854       ftab[s] = j;
 855       ptr[j] = i-1;
 856       s = (s >> 8) | (block[i-2] << 8);
 857       j = ftab[s] -1;
 858       ftab[s] = j;
 859       ptr[j] = i-2;
 860       s = (s >> 8) | (block[i-3] << 8);
 861       j = ftab[s] -1;
 862       ftab[s] = j;
 863       ptr[j] = i-3;
 864    }
 865    for (; i >= 0; i--) {
 866       s = (s >> 8) | (block[i] << 8);
 867       j = ftab[s] -1;
 868       ftab[s] = j;
 869       ptr[j] = i;
 870    }
 871
 872    /*--
 873       Now ftab contains the first loc of every small bucket.
 874       Calculate the running order, from smallest to largest
 875       big bucket.
 876    --*/
 877    for (i = 0; i <= 255; i++) {
 878       bigDone     [i] = False;
 879       runningOrder[i] = i;
 880    }
 881
 882    {
 883       Int32 vv;
 884       Int32 h = 1;
 885       do h = 3 * h + 1; while (h <= 256);
 886       do {
 887          h = h / 3;
 888          for (i = h; i <= 255; i++) {
 889             vv = runningOrder[i];
 890             j = i;
 891             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 892                runningOrder[j] = runningOrder[j-h];
 893                j = j - h;
 894                if (j <= (h - 1)) goto zero;
 895             }
 896             zero:
 897             runningOrder[j] = vv;
 898          }
 899       } while (h != 1);
 900    }
 901
 902    /*--
 903       The main sorting loop.
 904    --*/
 905
 906    numQSorted = 0;
 907
 908    for (i = 0; i <= 255; i++) {
 909
 910       /*--
 911          Process big buckets, starting with the least full.
 912          Basically this is a 3-step process in which we call
 913          mainQSort3 to sort the small buckets [ss, j], but
 914          also make a big effort to avoid the calls if we can.
 915       --*/
 916       ss = runningOrder[i];
 917
 918       /*--
 919          Step 1:
 920          Complete the big bucket [ss] by quicksorting
 921          any unsorted small buckets [ss, j], for j != ss.
 922          Hopefully previous pointer-scanning phases have already
 923          completed many of the small buckets [ss, j], so
 924          we don't have to sort them at all.
 925       --*/
 926       for (j = 0; j <= 255; j++) {
 927          if (j != ss) {
 928             sb = (ss << 8) + j;
 929             if ( ! (ftab[sb] & SETMASK) ) {
 930                Int32 lo = ftab[sb]   & CLEARMASK;
 931                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 932                if (hi > lo) {
 933                   if (verb >= 4)
 934                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 935                                 "done %d   this %d\n",
 936                                 ss, j, numQSorted, hi - lo + 1 );
 937                   mainQSort3 (
 938                      ptr, block, quadrant, nblock,
 939                      lo, hi, BZ_N_RADIX, budget
 940                   );
 941                   numQSorted += (hi - lo + 1);
 942                   if (*budget < 0) return;
 943                }
 944             }
 945             ftab[sb] |= SETMASK;
 946          }
 947       }
 948
 949       AssertH ( !bigDone[ss], 1006 );
 950
 951       /*--
 952          Step 2:
 953          Now scan this big bucket [ss] so as to synthesise the
 954          sorted order for small buckets [t, ss] for all t,
 955          including, magically, the bucket [ss,ss] too.
 956          This will avoid doing Real Work in subsequent Step 1's.
 957       --*/
 958       {
 959          for (j = 0; j <= 255; j++) {
 960             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 961             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 962          }
 963          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 964             k = ptr[j]-1; if (k < 0) k += nblock;
 965             c1 = block[k];
 966             if (!bigDone[c1])
 967                ptr[ copyStart[c1]++ ] = k;
 968          }
 969          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 970             k = ptr[j]-1; if (k < 0) k += nblock;
 971             c1 = block[k];
 972             if (!bigDone[c1])
 973                ptr[ copyEnd[c1]-- ] = k;
 974          }
 975       }
 976
 977       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 978                 ||
 979                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 980                    Necessity for this case is demonstrated by compressing
 981                    a sequence of approximately 48.5 million of character
 982                    251; 1.0.0/1.0.1 will then die here. */
 983                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 984                 1007 )
 985
 986       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 987
 988       /*--
 989          Step 3:
 990          The [ss] big bucket is now done.  Record this fact,
 991          and update the quadrant descriptors.  Remember to
 992          update quadrants in the overshoot area too, if
 993          necessary.  The "if (i < 255)" test merely skips
 994          this updating for the last bucket processed, since
 995          updating for the last bucket is pointless.
 996
 997          The quadrant array provides a way to incrementally
 998          cache sort orderings, as they appear, so as to
 999          make subsequent comparisons in fullGtU() complete
1000          faster.  For repetitive blocks this makes a big
1001          difference (but not big enough to be able to avoid
1002          the fallback sorting mechanism, exponential radix sort).
1003
1004          The precise meaning is: at all times:
1005
1006             for 0 <= i < nblock and 0 <= j <= nblock
1007
1008             if block[i] != block[j],
1009
1010                then the relative values of quadrant[i] and
1011                     quadrant[j] are meaningless.
1012
1013                else {
1014                   if quadrant[i] < quadrant[j]
1015                      then the string starting at i lexicographically
1016                      precedes the string starting at j
1017
1018                   else if quadrant[i] > quadrant[j]
1019                      then the string starting at j lexicographically
1020                      precedes the string starting at i
1021
1022                   else
1023                      the relative ordering of the strings starting
1024                      at i and j has not yet been determined.
1025                }
1026       --*/
1027       bigDone[ss] = True;
1028
1029       if (i < 255) {
1030          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1031          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1032          Int32 shifts   = 0;
1033
1034          while ((bbSize >> shifts) > 65534) shifts++;
1035
1036          for (j = bbSize-1; j >= 0; j--) {
1037             Int32 a2update     = ptr[bbStart + j];
1038             UInt16 qVal        = (UInt16)(j >> shifts);
1039             quadrant[a2update] = qVal;
1040             if (a2update < BZ_N_OVERSHOOT)
1041                quadrant[a2update + nblock] = qVal;
1042          }
1043          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1044       }
1045
1046    }
1047
1048    if (verb >= 4)
1049       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1050                  nblock, numQSorted, nblock - numQSorted );
1051 }
1052
1053 #undef BIGFREQ
1054 #undef SETMASK
1055 #undef CLEARMASK
1056
1057
1058 /*---------------------------------------------*/
1059 /* Pre:
1060       nblock > 0
1061       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1062       ((UChar*)arr2)  [0 .. nblock-1] holds block
1063       arr1 exists for [0 .. nblock-1]
1064
1065    Post:
1066       ((UChar*)arr2) [0 .. nblock-1] holds block
1067       All other areas of block destroyed
1068       ftab [ 0 .. 65536 ] destroyed
1069       arr1 [0 .. nblock-1] holds sorted order
1070 */
1071 void BZ2_blockSort ( EState* s )
1072 {
1073    UInt32* ptr    = s->ptr;
1074    UChar*  block  = s->block;
1075    UInt32* ftab   = s->ftab;
1076    Int32   nblock = s->nblock;
1077    Int32   verb   = s->verbosity;
1078    Int32   wfact  = s->workFactor;
1079    UInt16* quadrant;
1080    Int32   budget;
1081    Int32   budgetInit;
1082    Int32   i;
1083
1084    if (nblock < 10000) {
1085       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1086    } else {
1087       /* Calculate the location for quadrant, remembering to get
1088          the alignment right.  Assumes that &(block[0]) is at least
1089          2-byte aligned -- this should be ok since block is really
1090          the first section of arr2.
1091       */
1092       i = nblock+BZ_N_OVERSHOOT;
1093       if (i & 1) i++;
1094       quadrant = (UInt16*)(&(block[i]));
1095
1096       /* (wfact-1) / 3 puts the default-factor-30
1097          transition point at very roughly the same place as
1098          with v0.1 and v0.9.0.
1099          Not that it particularly matters any more, since the
1100          resulting compressed stream is now the same regardless
1101          of whether or not we use the main sort or fallback sort.
1102       */
1103       if (wfact < 1  ) wfact = 1;
1104       if (wfact > 100) wfact = 100;
1105       budgetInit = nblock * ((wfact-1) / 3);
1106       budget = budgetInit;
1107
1108       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1109       if (verb >= 3)
1110          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1111                     budgetInit - budget,
1112                     nblock,
1113                     (float)(budgetInit - budget) /
1114                     (float)(nblock==0 ? 1 : nblock) );
1115       if (budget < 0) {
1116          if (verb >= 2)
1117             VPrintf0 ( "    too repetitive; using fallback"
1118                        " sorting algorithm\n" );
1119          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1120       }
1121    }
1122
1123    s->origPtr = -1;
1124    for (i = 0; i < s->nblock; i++)
1125       if (ptr[i] == 0)
1126          { s->origPtr = i; break; };
1127
1128    AssertH( s->origPtr != -1, 1003 );
1129 }
1130
1131
1132 /*-------------------------------------------------------------*/
1133 /*--- end                                       blocksort.c ---*/
1134 /*-------------------------------------------------------------*/