plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

blocksort.c (32172B)


      1 
      2 /*-------------------------------------------------------------*/
      3 /*--- Block sorting machinery                               ---*/
      4 /*---                                           blocksort.c ---*/
      5 /*-------------------------------------------------------------*/
      6 
      7 /*--
      8   This file is a part of bzip2 and/or libbzip2, a program and
      9   library for lossless, block-sorting data compression.
     10 
     11   Copyright (C) 1996-2000 Julian R Seward.  All rights reserved.
     12 
     13   Redistribution and use in source and binary forms, with or without
     14   modification, are permitted provided that the following conditions
     15   are met:
     16 
     17   1. Redistributions of source code must retain the above copyright
     18      notice, this list of conditions and the following disclaimer.
     19 
     20   2. The origin of this software must not be misrepresented; you must
     21      not claim that you wrote the original software.  If you use this
     22      software in a product, an acknowledgment in the product
     23      documentation would be appreciated but is not required.
     24 
     25   3. Altered source versions must be plainly marked as such, and must
     26      not be misrepresented as being the original software.
     27 
     28   4. The name of the author may not be used to endorse or promote
     29      products derived from this software without specific prior written
     30      permission.
     31 
     32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
     33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
     36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
     38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     43 
     44   Julian Seward, Cambridge, UK.
     45   jseward@acm.org
     46   bzip2/libbzip2 version 1.0 of 21 March 2000
     47 
     48   This program is based on (at least) the work of:
     49      Mike Burrows
     50      David Wheeler
     51      Peter Fenwick
     52      Alistair Moffat
     53      Radford Neal
     54      Ian H. Witten
     55      Robert Sedgewick
     56      Jon L. Bentley
     57 
     58   For more information on these sources, see the manual.
     59 
     60   To get some idea how the block sorting algorithms in this file
     61   work, read my paper
     62      On the Performance of BWT Sorting Algorithms
     63   in Proceedings of the IEEE Data Compression Conference 2000,
     64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
     65   file implements the algorithm called  cache  in the paper.
     66 --*/
     67 
     68 #include "os.h"
     69 #include "bzlib.h"
     70 #include "bzlib_private.h"
     71 
     72 /*---------------------------------------------*/
     73 /*--- Fallback O(N log(N)^2) sorting        ---*/
     74 /*--- algorithm, for repetitive blocks      ---*/
     75 /*---------------------------------------------*/
     76 
     77 /*---------------------------------------------*/
     78 static
     79 __inline__
     80 void fallbackSimpleSort ( UInt32* fmap,
     81                           UInt32* eclass,
     82                           Int32   lo,
     83                           Int32   hi )
     84 {
     85    Int32 i, j, tmp;
     86    UInt32 ec_tmp;
     87 
     88    if (lo == hi) return;
     89 
     90    if (hi - lo > 3) {
     91       for ( i = hi-4; i >= lo; i-- ) {
     92          tmp = fmap[i];
     93          ec_tmp = eclass[tmp];
     94          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
     95             fmap[j-4] = fmap[j];
     96          fmap[j-4] = tmp;
     97       }
     98    }
     99 
    100    for ( i = hi-1; i >= lo; i-- ) {
    101       tmp = fmap[i];
    102       ec_tmp = eclass[tmp];
    103       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
    104          fmap[j-1] = fmap[j];
    105       fmap[j-1] = tmp;
    106    }
    107 }
    108 
    109 
    110 /*---------------------------------------------*/
    111 #define fswap(zz1, zz2) \
    112    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
    113 
    114 #define fvswap(zzp1, zzp2, zzn)       \
    115 {                                     \
    116    Int32 yyp1 = (zzp1);               \
    117    Int32 yyp2 = (zzp2);               \
    118    Int32 yyn  = (zzn);                \
    119    while (yyn > 0) {                  \
    120       fswap(fmap[yyp1], fmap[yyp2]);  \
    121       yyp1++; yyp2++; yyn--;          \
    122    }                                  \
    123 }
    124 
    125 
    126 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
    127 
    128 #define fpush(lz,hz) { stackLo[sp] = lz; \
    129                        stackHi[sp] = hz; \
    130                        sp++; }
    131 
    132 #define fpop(lz,hz) { sp--;              \
    133                       lz = stackLo[sp];  \
    134                       hz = stackHi[sp]; }
    135 
    136 #define FALLBACK_QSORT_SMALL_THRESH 10
    137 #define FALLBACK_QSORT_STACK_SIZE   100
    138 
    139 
    140 static
    141 void fallbackQSort3 ( UInt32* fmap,
    142                       UInt32* eclass,
    143                       Int32   loSt,
    144                       Int32   hiSt )
    145 {
    146    Int32 unLo, unHi, ltLo, gtHi, n, m;
    147    Int32 sp, lo, hi;
    148    UInt32 med, r, r3;
    149    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
    150    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
    151 
    152    r = 0;
    153 
    154    sp = 0;
    155    fpush ( loSt, hiSt );
    156 
    157    while (sp > 0) {
    158 
    159       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
    160 
    161       fpop ( lo, hi );
    162       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
    163          fallbackSimpleSort ( fmap, eclass, lo, hi );
    164          continue;
    165       }
    166 
    167       /* Random partitioning.  Median of 3 sometimes fails to
    168          avoid bad cases.  Median of 9 seems to help but
    169          looks rather expensive.  This too seems to work but
    170          is cheaper.  Guidance for the magic constants
    171          7621 and 32768 is taken from Sedgewick's algorithms
    172          book, chapter 35.
    173       */
    174       r = ((r * 7621) + 1) % 32768;
    175       r3 = r % 3;
    176       if (r3 == 0) med = eclass[fmap[lo]]; else
    177       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
    178                    med = eclass[fmap[hi]];
    179 
    180       unLo = ltLo = lo;
    181       unHi = gtHi = hi;
    182 
    183       while (1) {
    184          while (1) {
    185             if (unLo > unHi) break;
    186             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
    187             if (n == 0) {
    188                fswap(fmap[unLo], fmap[ltLo]);
    189                ltLo++; unLo++;
    190                continue;
    191             };
    192             if (n > 0) break;
    193             unLo++;
    194          }
    195          while (1) {
    196             if (unLo > unHi) break;
    197             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
    198             if (n == 0) {
    199                fswap(fmap[unHi], fmap[gtHi]);
    200                gtHi--; unHi--;
    201                continue;
    202             };
    203             if (n < 0) break;
    204             unHi--;
    205          }
    206          if (unLo > unHi) break;
    207          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
    208       }
    209 
    210       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
    211 
    212       if (gtHi < ltLo) continue;
    213 
    214       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
    215       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
    216 
    217       n = lo + unLo - ltLo - 1;
    218       m = hi - (gtHi - unHi) + 1;
    219 
    220       if (n - lo > hi - m) {
    221          fpush ( lo, n );
    222          fpush ( m, hi );
    223       } else {
    224          fpush ( m, hi );
    225          fpush ( lo, n );
    226       }
    227    }
    228 }
    229 
    230 #undef fmin
    231 #undef fpush
    232 #undef fpop
    233 #undef fswap
    234 #undef fvswap
    235 #undef FALLBACK_QSORT_SMALL_THRESH
    236 #undef FALLBACK_QSORT_STACK_SIZE
    237 
    238 
    239 /*---------------------------------------------*/
    240 /* Pre:
    241       nblock > 0
    242       eclass exists for [0 .. nblock-1]
    243       ((UChar*)eclass) [0 .. nblock-1] holds block
    244       ptr exists for [0 .. nblock-1]
    245 
    246    Post:
    247       ((UChar*)eclass) [0 .. nblock-1] holds block
    248       All other areas of eclass destroyed
    249       fmap [0 .. nblock-1] holds sorted order
    250       bhtab [ 0 .. 2+(nblock/32) ] destroyed
    251 */
    252 
    253 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
    254 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
    255 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
    256 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
    257 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
    258 
    259 static
    260 void fallbackSort ( UInt32* fmap,
    261                     UInt32* eclass,
    262                     UInt32* bhtab,
    263                     Int32   nblock,
    264                     Int32   verb )
    265 {
    266    Int32 ftab[257];
    267    Int32 ftabCopy[256];
    268    Int32 H, i, j, k, l, r, cc, cc1;
    269    Int32 nNotDone;
    270    Int32 nBhtab;
    271    UChar* eclass8 = (UChar*)eclass;
    272 
    273    /*--
    274       Initial 1-char radix sort to generate
    275       initial fmap and initial BH bits.
    276    --*/
    277    if (verb >= 4)
    278       VPrintf0 ( "        bucket sorting ...\n" );
    279    for (i = 0; i < 257;    i++) ftab[i] = 0;
    280    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
    281    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
    282    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
    283 
    284    for (i = 0; i < nblock; i++) {
    285       j = eclass8[i];
    286       k = ftab[j] - 1;
    287       ftab[j] = k;
    288       fmap[k] = i;
    289    }
    290 
    291    nBhtab = 2 + (nblock / 32);
    292    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
    293    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
    294 
    295    /*--
    296       Inductively refine the buckets.  Kind-of an
    297       "exponential radix sort" (!), inspired by the
    298       Manber-Myers suffix array construction algorithm.
    299    --*/
    300 
    301    /*-- set sentinel bits for block-end detection --*/
    302    for (i = 0; i < 32; i++) {
    303       SET_BH(nblock + 2*i);
    304       CLEAR_BH(nblock + 2*i + 1);
    305    }
    306 
    307    /*-- the log(N) loop --*/
    308    H = 1;
    309    while (1) {
    310 
    311       if (verb >= 4)
    312          VPrintf1 ( "        depth %6d has ", H );
    313 
    314       j = 0;
    315       for (i = 0; i < nblock; i++) {
    316          if (ISSET_BH(i)) j = i;
    317          k = fmap[i] - H; if (k < 0) k += nblock;
    318          eclass[k] = j;
    319       }
    320 
    321       nNotDone = 0;
    322       r = -1;
    323       while (1) {
    324 
    325 	 /*-- find the next non-singleton bucket --*/
    326          k = r + 1;
    327          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    328          if (ISSET_BH(k)) {
    329             while (WORD_BH(k) == 0xffffffff) k += 32;
    330             while (ISSET_BH(k)) k++;
    331          }
    332          l = k - 1;
    333          if (l >= nblock) break;
    334          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
    335          if (!ISSET_BH(k)) {
    336             while (WORD_BH(k) == 0x00000000) k += 32;
    337             while (!ISSET_BH(k)) k++;
    338          }
    339          r = k - 1;
    340          if (r >= nblock) break;
    341 
    342          /*-- now [l, r] bracket current bucket --*/
    343          if (r > l) {
    344             nNotDone += (r - l + 1);
    345             fallbackQSort3 ( fmap, eclass, l, r );
    346 
    347             /*-- scan bucket and generate header bits-- */
    348             cc = -1;
    349             for (i = l; i <= r; i++) {
    350                cc1 = eclass[fmap[i]];
    351                if (cc != cc1) { SET_BH(i); cc = cc1; };
    352             }
    353          }
    354       }
    355 
    356       if (verb >= 4)
    357          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
    358 
    359       H *= 2;
    360       if (H > nblock || nNotDone == 0) break;
    361    }
    362 
    363    /*--
    364       Reconstruct the original block in
    365       eclass8 [0 .. nblock-1], since the
    366       previous phase destroyed it.
    367    --*/
    368    if (verb >= 4)
    369       VPrintf0 ( "        reconstructing block ...\n" );
    370    j = 0;
    371    for (i = 0; i < nblock; i++) {
    372       while (ftabCopy[j] == 0) j++;
    373       ftabCopy[j]--;
    374       eclass8[fmap[i]] = (UChar)j;
    375    }
    376    AssertH ( j < 256, 1005 );
    377 }
    378 
    379 #undef       SET_BH
    380 #undef     CLEAR_BH
    381 #undef     ISSET_BH
    382 #undef      WORD_BH
    383 #undef UNALIGNED_BH
    384 
    385 
    386 /*---------------------------------------------*/
    387 /*--- The main, O(N^2 log(N)) sorting       ---*/
    388 /*--- algorithm.  Faster for "normal"       ---*/
    389 /*--- non-repetitive blocks.                ---*/
    390 /*---------------------------------------------*/
    391 
    392 /*---------------------------------------------*/
    393 static
    394 __inline__
    395 Bool mainGtU ( UInt32  i1,
    396                UInt32  i2,
    397                UChar*  block,
    398                UInt16* quadrant,
    399                UInt32  nblock,
    400                Int32*  budget )
    401 {
    402    Int32  k;
    403    UChar  c1, c2;
    404    UInt16 s1, s2;
    405 
    406    AssertD ( i1 != i2, "mainGtU" );
    407    /* 1 */
    408    c1 = block[i1]; c2 = block[i2];
    409    if (c1 != c2) return (c1 > c2);
    410    i1++; i2++;
    411    /* 2 */
    412    c1 = block[i1]; c2 = block[i2];
    413    if (c1 != c2) return (c1 > c2);
    414    i1++; i2++;
    415    /* 3 */
    416    c1 = block[i1]; c2 = block[i2];
    417    if (c1 != c2) return (c1 > c2);
    418    i1++; i2++;
    419    /* 4 */
    420    c1 = block[i1]; c2 = block[i2];
    421    if (c1 != c2) return (c1 > c2);
    422    i1++; i2++;
    423    /* 5 */
    424    c1 = block[i1]; c2 = block[i2];
    425    if (c1 != c2) return (c1 > c2);
    426    i1++; i2++;
    427    /* 6 */
    428    c1 = block[i1]; c2 = block[i2];
    429    if (c1 != c2) return (c1 > c2);
    430    i1++; i2++;
    431    /* 7 */
    432    c1 = block[i1]; c2 = block[i2];
    433    if (c1 != c2) return (c1 > c2);
    434    i1++; i2++;
    435    /* 8 */
    436    c1 = block[i1]; c2 = block[i2];
    437    if (c1 != c2) return (c1 > c2);
    438    i1++; i2++;
    439    /* 9 */
    440    c1 = block[i1]; c2 = block[i2];
    441    if (c1 != c2) return (c1 > c2);
    442    i1++; i2++;
    443    /* 10 */
    444    c1 = block[i1]; c2 = block[i2];
    445    if (c1 != c2) return (c1 > c2);
    446    i1++; i2++;
    447    /* 11 */
    448    c1 = block[i1]; c2 = block[i2];
    449    if (c1 != c2) return (c1 > c2);
    450    i1++; i2++;
    451    /* 12 */
    452    c1 = block[i1]; c2 = block[i2];
    453    if (c1 != c2) return (c1 > c2);
    454    i1++; i2++;
    455 
    456    k = nblock + 8;
    457 
    458    do {
    459       /* 1 */
    460       c1 = block[i1]; c2 = block[i2];
    461       if (c1 != c2) return (c1 > c2);
    462       s1 = quadrant[i1]; s2 = quadrant[i2];
    463       if (s1 != s2) return (s1 > s2);
    464       i1++; i2++;
    465       /* 2 */
    466       c1 = block[i1]; c2 = block[i2];
    467       if (c1 != c2) return (c1 > c2);
    468       s1 = quadrant[i1]; s2 = quadrant[i2];
    469       if (s1 != s2) return (s1 > s2);
    470       i1++; i2++;
    471       /* 3 */
    472       c1 = block[i1]; c2 = block[i2];
    473       if (c1 != c2) return (c1 > c2);
    474       s1 = quadrant[i1]; s2 = quadrant[i2];
    475       if (s1 != s2) return (s1 > s2);
    476       i1++; i2++;
    477       /* 4 */
    478       c1 = block[i1]; c2 = block[i2];
    479       if (c1 != c2) return (c1 > c2);
    480       s1 = quadrant[i1]; s2 = quadrant[i2];
    481       if (s1 != s2) return (s1 > s2);
    482       i1++; i2++;
    483       /* 5 */
    484       c1 = block[i1]; c2 = block[i2];
    485       if (c1 != c2) return (c1 > c2);
    486       s1 = quadrant[i1]; s2 = quadrant[i2];
    487       if (s1 != s2) return (s1 > s2);
    488       i1++; i2++;
    489       /* 6 */
    490       c1 = block[i1]; c2 = block[i2];
    491       if (c1 != c2) return (c1 > c2);
    492       s1 = quadrant[i1]; s2 = quadrant[i2];
    493       if (s1 != s2) return (s1 > s2);
    494       i1++; i2++;
    495       /* 7 */
    496       c1 = block[i1]; c2 = block[i2];
    497       if (c1 != c2) return (c1 > c2);
    498       s1 = quadrant[i1]; s2 = quadrant[i2];
    499       if (s1 != s2) return (s1 > s2);
    500       i1++; i2++;
    501       /* 8 */
    502       c1 = block[i1]; c2 = block[i2];
    503       if (c1 != c2) return (c1 > c2);
    504       s1 = quadrant[i1]; s2 = quadrant[i2];
    505       if (s1 != s2) return (s1 > s2);
    506       i1++; i2++;
    507 
    508       if (i1 >= nblock) i1 -= nblock;
    509       if (i2 >= nblock) i2 -= nblock;
    510 
    511       k -= 8;
    512       (*budget)--;
    513    }
    514       while (k >= 0);
    515 
    516    return False;
    517 }
    518 
    519 
    520 /*---------------------------------------------*/
    521 /*--
    522    Knuth's increments seem to work better
    523    than Incerpi-Sedgewick here.  Possibly
    524    because the number of elems to sort is
    525    usually small, typically <= 20.
    526 --*/
    527 static
    528 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
    529                    9841, 29524, 88573, 265720,
    530                    797161, 2391484 };
    531 
    532 static
    533 void mainSimpleSort ( UInt32* ptr,
    534                       UChar*  block,
    535                       UInt16* quadrant,
    536                       Int32   nblock,
    537                       Int32   lo,
    538                       Int32   hi,
    539                       Int32   d,
    540                       Int32*  budget )
    541 {
    542    Int32 i, j, h, bigN, hp;
    543    UInt32 v;
    544 
    545    bigN = hi - lo + 1;
    546    if (bigN < 2) return;
    547 
    548    hp = 0;
    549    while (incs[hp] < bigN) hp++;
    550    hp--;
    551 
    552    for (; hp >= 0; hp--) {
    553       h = incs[hp];
    554 
    555       i = lo + h;
    556       while (True) {
    557 
    558          /*-- copy 1 --*/
    559          if (i > hi) break;
    560          v = ptr[i];
    561          j = i;
    562          while ( mainGtU (
    563                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    564                  ) ) {
    565             ptr[j] = ptr[j-h];
    566             j = j - h;
    567             if (j <= (lo + h - 1)) break;
    568          }
    569          ptr[j] = v;
    570          i++;
    571 
    572          /*-- copy 2 --*/
    573          if (i > hi) break;
    574          v = ptr[i];
    575          j = i;
    576          while ( mainGtU (
    577                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    578                  ) ) {
    579             ptr[j] = ptr[j-h];
    580             j = j - h;
    581             if (j <= (lo + h - 1)) break;
    582          }
    583          ptr[j] = v;
    584          i++;
    585 
    586          /*-- copy 3 --*/
    587          if (i > hi) break;
    588          v = ptr[i];
    589          j = i;
    590          while ( mainGtU (
    591                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
    592                  ) ) {
    593             ptr[j] = ptr[j-h];
    594             j = j - h;
    595             if (j <= (lo + h - 1)) break;
    596          }
    597          ptr[j] = v;
    598          i++;
    599 
    600          if (*budget < 0) return;
    601       }
    602    }
    603 }
    604 
    605 
    606 /*---------------------------------------------*/
    607 /*--
    608    The following is an implementation of
    609    an elegant 3-way quicksort for strings,
    610    described in a paper "Fast Algorithms for
    611    Sorting and Searching Strings", by Robert
    612    Sedgewick and Jon L. Bentley.
    613 --*/
    614 
    615 #define mswap(zz1, zz2) \
    616    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
    617 
    618 #define mvswap(zzp1, zzp2, zzn)       \
    619 {                                     \
    620    Int32 yyp1 = (zzp1);               \
    621    Int32 yyp2 = (zzp2);               \
    622    Int32 yyn  = (zzn);                \
    623    while (yyn > 0) {                  \
    624       mswap(ptr[yyp1], ptr[yyp2]);    \
    625       yyp1++; yyp2++; yyn--;          \
    626    }                                  \
    627 }
    628 
    629 static
    630 __inline__
    631 UChar mmed3 ( UChar a, UChar b, UChar c )
    632 {
    633    UChar t;
    634    if (a > b) { t = a; a = b; b = t; };
    635    if (b > c) {
    636       b = c;
    637       if (a > b) b = a;
    638    }
    639    return b;
    640 }
    641 
    642 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
    643 
    644 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
    645                           stackHi[sp] = hz; \
    646                           stackD [sp] = dz; \
    647                           sp++; }
    648 
    649 #define mpop(lz,hz,dz) { sp--;             \
    650                          lz = stackLo[sp]; \
    651                          hz = stackHi[sp]; \
    652                          dz = stackD [sp]; }
    653 
    654 
    655 #define mnextsize(az) (nextHi[az]-nextLo[az])
    656 
    657 #define mnextswap(az,bz)                                        \
    658    { Int32 tz;                                                  \
    659      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
    660      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
    661      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
    662 
    663 
    664 #define MAIN_QSORT_SMALL_THRESH 20
    665 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
    666 #define MAIN_QSORT_STACK_SIZE 100
    667 
    668 static
    669 void mainQSort3 ( UInt32* ptr,
    670                   UChar*  block,
    671                   UInt16* quadrant,
    672                   Int32   nblock,
    673                   Int32   loSt,
    674                   Int32   hiSt,
    675                   Int32   dSt,
    676                   Int32*  budget )
    677 {
    678    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
    679    Int32 sp, lo, hi, d;
    680 
    681    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
    682    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
    683    Int32 stackD [MAIN_QSORT_STACK_SIZE];
    684 
    685    Int32 nextLo[3];
    686    Int32 nextHi[3];
    687    Int32 nextD [3];
    688 
    689    sp = 0;
    690    mpush ( loSt, hiSt, dSt );
    691 
    692    while (sp > 0) {
    693 
    694       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
    695 
    696       mpop ( lo, hi, d );
    697       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
    698           d > MAIN_QSORT_DEPTH_THRESH) {
    699          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
    700          if (*budget < 0) return;
    701          continue;
    702       }
    703 
    704       med = (Int32)
    705             mmed3 ( block[ptr[ lo         ]+d],
    706                     block[ptr[ hi         ]+d],
    707                     block[ptr[ (lo+hi)>>1 ]+d] );
    708 
    709       unLo = ltLo = lo;
    710       unHi = gtHi = hi;
    711 
    712       while (True) {
    713          while (True) {
    714             if (unLo > unHi) break;
    715             n = ((Int32)block[ptr[unLo]+d]) - med;
    716             if (n == 0) {
    717                mswap(ptr[unLo], ptr[ltLo]);
    718                ltLo++; unLo++; continue;
    719             };
    720             if (n >  0) break;
    721             unLo++;
    722          }
    723          while (True) {
    724             if (unLo > unHi) break;
    725             n = ((Int32)block[ptr[unHi]+d]) - med;
    726             if (n == 0) {
    727                mswap(ptr[unHi], ptr[gtHi]);
    728                gtHi--; unHi--; continue;
    729             };
    730             if (n <  0) break;
    731             unHi--;
    732          }
    733          if (unLo > unHi) break;
    734          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
    735       }
    736 
    737       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
    738 
    739       if (gtHi < ltLo) {
    740          mpush(lo, hi, d+1 );
    741          continue;
    742       }
    743 
    744       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
    745       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
    746 
    747       n = lo + unLo - ltLo - 1;
    748       m = hi - (gtHi - unHi) + 1;
    749 
    750       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
    751       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
    752       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
    753 
    754       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    755       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
    756       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
    757 
    758       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
    759       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
    760 
    761       mpush (nextLo[0], nextHi[0], nextD[0]);
    762       mpush (nextLo[1], nextHi[1], nextD[1]);
    763       mpush (nextLo[2], nextHi[2], nextD[2]);
    764    }
    765 }
    766 
    767 #undef mswap
    768 #undef mvswap
    769 #undef mpush
    770 #undef mpop
    771 #undef mmin
    772 #undef mnextsize
    773 #undef mnextswap
    774 #undef MAIN_QSORT_SMALL_THRESH
    775 #undef MAIN_QSORT_DEPTH_THRESH
    776 #undef MAIN_QSORT_STACK_SIZE
    777 
    778 
    779 /*---------------------------------------------*/
    780 /* Pre:
    781       nblock > N_OVERSHOOT
    782       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
    783       ((UChar*)block32) [0 .. nblock-1] holds block
    784       ptr exists for [0 .. nblock-1]
    785 
    786    Post:
    787       ((UChar*)block32) [0 .. nblock-1] holds block
    788       All other areas of block32 destroyed
    789       ftab [0 .. 65536 ] destroyed
    790       ptr [0 .. nblock-1] holds sorted order
    791       if (*budget < 0), sorting was abandoned
    792 */
    793 
    794 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
    795 #define SETMASK (1 << 21)
    796 #define CLEARMASK (~(SETMASK))
    797 
    798 static
    799 void mainSort ( UInt32* ptr,
    800                 UChar*  block,
    801                 UInt16* quadrant,
    802                 UInt32* ftab,
    803                 Int32   nblock,
    804                 Int32   verb,
    805                 Int32*  budget )
    806 {
    807    Int32  i, j, k, ss, sb;
    808    Int32  runningOrder[256];
    809    Bool   bigDone[256];
    810    Int32  copyStart[256];
    811    Int32  copyEnd  [256];
    812    UChar  c1;
    813    Int32  numQSorted;
    814    UInt16 s;
    815    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
    816 
    817    /*-- set up the 2-byte frequency table --*/
    818    for (i = 65536; i >= 0; i--) ftab[i] = 0;
    819 
    820    j = block[0] << 8;
    821    i = nblock-1;
    822    for (; i >= 3; i -= 4) {
    823       quadrant[i] = 0;
    824       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    825       ftab[j]++;
    826       quadrant[i-1] = 0;
    827       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
    828       ftab[j]++;
    829       quadrant[i-2] = 0;
    830       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
    831       ftab[j]++;
    832       quadrant[i-3] = 0;
    833       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
    834       ftab[j]++;
    835    }
    836    for (; i >= 0; i--) {
    837       quadrant[i] = 0;
    838       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
    839       ftab[j]++;
    840    }
    841 
    842    /*-- (emphasises close relationship of block & quadrant) --*/
    843    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
    844       block   [nblock+i] = block[i];
    845       quadrant[nblock+i] = 0;
    846    }
    847 
    848    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
    849 
    850    /*-- Complete the initial radix sort --*/
    851    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
    852 
    853    s = block[0] << 8;
    854    i = nblock-1;
    855    for (; i >= 3; i -= 4) {
    856       s = (s >> 8) | (block[i] << 8);
    857       j = ftab[s] -1;
    858       ftab[s] = j;
    859       ptr[j] = i;
    860       s = (s >> 8) | (block[i-1] << 8);
    861       j = ftab[s] -1;
    862       ftab[s] = j;
    863       ptr[j] = i-1;
    864       s = (s >> 8) | (block[i-2] << 8);
    865       j = ftab[s] -1;
    866       ftab[s] = j;
    867       ptr[j] = i-2;
    868       s = (s >> 8) | (block[i-3] << 8);
    869       j = ftab[s] -1;
    870       ftab[s] = j;
    871       ptr[j] = i-3;
    872    }
    873    for (; i >= 0; i--) {
    874       s = (s >> 8) | (block[i] << 8);
    875       j = ftab[s] -1;
    876       ftab[s] = j;
    877       ptr[j] = i;
    878    }
    879 
    880    /*--
    881       Now ftab contains the first loc of every small bucket.
    882       Calculate the running order, from smallest to largest
    883       big bucket.
    884    --*/
    885    for (i = 0; i <= 255; i++) {
    886       bigDone     [i] = False;
    887       runningOrder[i] = i;
    888    }
    889 
    890    {
    891       Int32 vv;
    892       Int32 h = 1;
    893       do h = 3 * h + 1; while (h <= 256);
    894       do {
    895          h = h / 3;
    896          for (i = h; i <= 255; i++) {
    897             vv = runningOrder[i];
    898             j = i;
    899             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
    900                runningOrder[j] = runningOrder[j-h];
    901                j = j - h;
    902                if (j <= (h - 1)) goto zero;
    903             }
    904             zero:
    905             runningOrder[j] = vv;
    906          }
    907       } while (h != 1);
    908    }
    909 
    910    /*--
    911       The main sorting loop.
    912    --*/
    913 
    914    numQSorted = 0;
    915 
    916    for (i = 0; i <= 255; i++) {
    917 
    918       /*--
    919          Process big buckets, starting with the least full.
    920          Basically this is a 3-step process in which we call
    921          mainQSort3 to sort the small buckets [ss, j], but
    922          also make a big effort to avoid the calls if we can.
    923       --*/
    924       ss = runningOrder[i];
    925 
    926       /*--
    927          Step 1:
    928          Complete the big bucket [ss] by quicksorting
    929          any unsorted small buckets [ss, j], for j != ss.
    930          Hopefully previous pointer-scanning phases have already
    931          completed many of the small buckets [ss, j], so
    932          we don't have to sort them at all.
    933       --*/
    934       for (j = 0; j <= 255; j++) {
    935          if (j != ss) {
    936             sb = (ss << 8) + j;
    937             if ( ! (ftab[sb] & SETMASK) ) {
    938                Int32 lo = ftab[sb]   & CLEARMASK;
    939                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
    940                if (hi > lo) {
    941                   if (verb >= 4)
    942                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
    943                                 "done %d   this %d\n",
    944                                 ss, j, numQSorted, hi - lo + 1 );
    945                   mainQSort3 (
    946                      ptr, block, quadrant, nblock,
    947                      lo, hi, BZ_N_RADIX, budget
    948                   );
    949                   numQSorted += (hi - lo + 1);
    950                   if (*budget < 0) return;
    951                }
    952             }
    953             ftab[sb] |= SETMASK;
    954          }
    955       }
    956 
    957       AssertH ( !bigDone[ss], 1006 );
    958 
    959       /*--
    960          Step 2:
    961          Now scan this big bucket [ss] so as to synthesise the
    962          sorted order for small buckets [t, ss] for all t,
    963          including, magically, the bucket [ss,ss] too.
    964          This will avoid doing Real Work in subsequent Step 1's.
    965       --*/
    966       {
    967          for (j = 0; j <= 255; j++) {
    968             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
    969             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
    970          }
    971          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
    972             k = ptr[j]-1; if (k < 0) k += nblock;
    973             c1 = block[k];
    974             if (!bigDone[c1])
    975                ptr[ copyStart[c1]++ ] = k;
    976          }
    977          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
    978             k = ptr[j]-1; if (k < 0) k += nblock;
    979             c1 = block[k];
    980             if (!bigDone[c1])
    981                ptr[ copyEnd[c1]-- ] = k;
    982          }
    983       }
    984 
    985       AssertH ( copyStart[ss]-1 == copyEnd[ss], 1007 );
    986 
    987       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
    988 
    989       /*--
    990          Step 3:
    991          The [ss] big bucket is now done.  Record this fact,
    992          and update the quadrant descriptors.  Remember to
    993          update quadrants in the overshoot area too, if
    994          necessary.  The "if (i < 255)" test merely skips
    995          this updating for the last bucket processed, since
    996          updating for the last bucket is pointless.
    997 
    998          The quadrant array provides a way to incrementally
    999          cache sort orderings, as they appear, so as to
   1000          make subsequent comparisons in fullGtU() complete
   1001          faster.  For repetitive blocks this makes a big
   1002          difference (but not big enough to be able to avoid
   1003          the fallback sorting mechanism, exponential radix sort).
   1004 
   1005          The precise meaning is: at all times:
   1006 
   1007             for 0 <= i < nblock and 0 <= j <= nblock
   1008 
   1009             if block[i] != block[j],
   1010 
   1011                then the relative values of quadrant[i] and
   1012                     quadrant[j] are meaningless.
   1013 
   1014                else {
   1015                   if quadrant[i] < quadrant[j]
   1016                      then the string starting at i lexicographically
   1017                      precedes the string starting at j
   1018 
   1019                   else if quadrant[i] > quadrant[j]
   1020                      then the string starting at j lexicographically
   1021                      precedes the string starting at i
   1022 
   1023                   else
   1024                      the relative ordering of the strings starting
   1025                      at i and j has not yet been determined.
   1026                }
   1027       --*/
   1028       bigDone[ss] = True;
   1029 
   1030       if (i < 255) {
   1031          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
   1032          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
   1033          Int32 shifts   = 0;
   1034 
   1035          while ((bbSize >> shifts) > 65534) shifts++;
   1036 
   1037          for (j = bbSize-1; j >= 0; j--) {
   1038             Int32 a2update     = ptr[bbStart + j];
   1039             UInt16 qVal        = (UInt16)(j >> shifts);
   1040             quadrant[a2update] = qVal;
   1041             if (a2update < BZ_N_OVERSHOOT)
   1042                quadrant[a2update + nblock] = qVal;
   1043          }
   1044          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
   1045       }
   1046 
   1047    }
   1048 
   1049    if (verb >= 4)
   1050       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
   1051                  nblock, numQSorted, nblock - numQSorted );
   1052 }
   1053 
   1054 #undef BIGFREQ
   1055 #undef SETMASK
   1056 #undef CLEARMASK
   1057 
   1058 
   1059 /*---------------------------------------------*/
   1060 /* Pre:
   1061       nblock > 0
   1062       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
   1063       ((UChar*)arr2)  [0 .. nblock-1] holds block
   1064       arr1 exists for [0 .. nblock-1]
   1065 
   1066    Post:
   1067       ((UChar*)arr2) [0 .. nblock-1] holds block
   1068       All other areas of block destroyed
   1069       ftab [ 0 .. 65536 ] destroyed
   1070       arr1 [0 .. nblock-1] holds sorted order
   1071 */
   1072 void BZ2_blockSort ( EState* s )
   1073 {
   1074    UInt32* ptr    = s->ptr;
   1075    UChar*  block  = s->block;
   1076    UInt32* ftab   = s->ftab;
   1077    Int32   nblock = s->nblock;
   1078    Int32   verb   = s->verbosity;
   1079    Int32   wfact  = s->workFactor;
   1080    UInt16* quadrant;
   1081    Int32   budget;
   1082    Int32   budgetInit;
   1083    Int32   i;
   1084 
   1085    if (nblock < 10000) {
   1086       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1087    } else {
   1088       /* Calculate the location for quadrant, remembering to get
   1089          the alignment right.  Assumes that &(block[0]) is at least
   1090          2-byte aligned -- this should be ok since block is really
   1091          the first section of arr2.
   1092       */
   1093       i = nblock+BZ_N_OVERSHOOT;
   1094       if (i & 1) i++;
   1095       quadrant = (UInt16*)(&(block[i]));
   1096 
   1097       /* (wfact-1) / 3 puts the default-factor-30
   1098          transition point at very roughly the same place as
   1099          with v0.1 and v0.9.0.
   1100          Not that it particularly matters any more, since the
   1101          resulting compressed stream is now the same regardless
   1102          of whether or not we use the main sort or fallback sort.
   1103       */
   1104       if (wfact < 1  ) wfact = 1;
   1105       if (wfact > 100) wfact = 100;
   1106       budgetInit = nblock * ((wfact-1) / 3);
   1107       budget = budgetInit;
   1108 
   1109       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
   1110       if (verb >= 3)
   1111          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
   1112                     budgetInit - budget,
   1113                     nblock,
   1114                     (float)(budgetInit - budget) /
   1115                     (float)(nblock==0 ? 1 : nblock) );
   1116       if (budget < 0) {
   1117          if (verb >= 2)
   1118             VPrintf0 ( "    too repetitive; using fallback"
   1119                        " sorting algorithm\n" );
   1120          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
   1121       }
   1122    }
   1123 
   1124    s->origPtr = -1;
   1125    for (i = 0; i < s->nblock; i++)
   1126       if (ptr[i] == 0)
   1127          { s->origPtr = i; break; };
   1128 
   1129    AssertH( s->origPtr != -1, 1003 );
   1130 }
   1131 
   1132 
   1133 /*-------------------------------------------------------------*/
   1134 /*--- end                                       blocksort.c ---*/
   1135 /*-------------------------------------------------------------*/