1 // BZip2OutputStream.cs
\r
2 // Copyright (C) 2001 Mike Krueger
\r
4 // This program is free software; you can redistribute it and/or
\r
5 // modify it under the terms of the GNU General Public License
\r
6 // as published by the Free Software Foundation; either version 2
\r
7 // of the License, or (at your option) any later version.
\r
9 // This program is distributed in the hope that it will be useful,
\r
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
12 // GNU General Public License for more details.
\r
14 // You should have received a copy of the GNU General Public License
\r
15 // along with this program; if not, write to the Free Software
\r
16 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
\r
18 // Linking this library statically or dynamically with other modules is
\r
19 // making a combined work based on this library. Thus, the terms and
\r
20 // conditions of the GNU General Public License cover the whole
\r
23 // As a special exception, the copyright holders of this library give you
\r
24 // permission to link this library with independent modules to produce an
\r
25 // executable, regardless of the license terms of these independent
\r
26 // modules, and to copy and distribute the resulting executable under
\r
27 // terms of your choice, provided that you also meet, for each linked
\r
28 // independent module, the terms and conditions of the license of that
\r
29 // module. An independent module is a module which is not derived from
\r
30 // or based on this library. If you modify this library, you may extend
\r
31 // this exception to your version of the library, but you are not
\r
32 // obligated to do so. If you do not wish to do so, delete this
\r
33 // exception statement from your version.
\r
38 using ICSharpCode.SharpZipLib.Checksums;
\r
40 namespace ICSharpCode.SharpZipLib.BZip2
\r
44 /// An output stream that compresses into the BZip2 format (without the file
\r
45 /// header chars) into another stream.
\r
46 /// TODO: Update to BZip2 1.0.1, 1.0.2
\r
48 public class BZip2OutputStream : Stream
\r
51 /// I needed to implement the abstract member.
\r
53 public override bool CanRead {
\r
55 return baseStream.CanRead;
\r
60 /// I needed to implement the abstract member.
\r
62 public override bool CanSeek {
\r
64 return baseStream.CanSeek;
\r
69 /// I needed to implement the abstract member.
\r
71 public override bool CanWrite {
\r
73 return baseStream.CanWrite;
\r
78 /// I needed to implement the abstract member.
\r
80 public override long Length {
\r
82 return baseStream.Length;
\r
87 /// I needed to implement the abstract member.
\r
89 public override long Position {
\r
91 return baseStream.Position;
\r
94 baseStream.Position = value;
\r
99 /// I needed to implement the abstract member.
\r
101 public override long Seek(long offset, SeekOrigin origin)
\r
103 return baseStream.Seek(offset, origin);
\r
107 /// I needed to implement the abstract member.
\r
109 public override void SetLength(long val)
\r
111 baseStream.SetLength(val);
\r
115 /// I needed to implement the abstract member.
\r
117 public override int ReadByte()
\r
119 return baseStream.ReadByte();
\r
123 /// I needed to implement the abstract member.
\r
125 public override int Read(byte[] b, int off, int len)
\r
127 return baseStream.Read(b, off, len);
\r
130 public override void Write(byte[] buf, int off, int len)
\r
132 for (int i = 0; i < len; ++i) {
\r
133 WriteByte(buf[off + i]);
\r
137 readonly static int SETMASK = (1 << 21);
\r
138 readonly static int CLEARMASK = (~SETMASK);
\r
139 readonly static int GREATER_ICOST = 15;
\r
140 readonly static int LESSER_ICOST = 0;
\r
141 readonly static int SMALL_THRESH = 20;
\r
142 readonly static int DEPTH_THRESH = 10;
\r
145 If you are ever unlucky/improbable enough
\r
146 to get a stack overflow whilst sorting,
\r
147 increase the following constant and try
\r
148 again. In practice I have never seen the
\r
149 stack go above 27 elems, so the following
\r
150 limit seems very generous.
\r
152 readonly static int QSORT_STACK_SIZE = 1000;
\r
154 static void Panic()
\r
156 //Console.WriteLine("panic");
\r
163 for (i = 0; i < 256; i++) {
\r
165 seqToUnseq[nInUse] = (char)i;
\r
166 unseqToSeq[i] = (char)nInUse;
\r
172 static void HbMakeCodeLengths(char[] len, int[] freq, int alphaSize, int maxLen)
\r
175 Nodes and heap entries run from 1. Entry 0
\r
176 for both the heap and nodes is a sentinel.
\r
178 int nNodes, nHeap, n1, n2, j, k;
\r
181 int[] heap = new int[BZip2Constants.MAX_ALPHA_SIZE + 2];
\r
182 int[] weight = new int[BZip2Constants.MAX_ALPHA_SIZE * 2];
\r
183 int[] parent = new int[BZip2Constants.MAX_ALPHA_SIZE * 2];
\r
185 for (int i = 0; i < alphaSize; ++i) {
\r
186 weight[i+1] = (freq[i] == 0 ? 1 : freq[i]) << 8;
\r
190 nNodes = alphaSize;
\r
197 for (int i = 1; i <= alphaSize; ++i) {
\r
202 int tmp = heap[zz];
\r
203 while (weight[tmp] < weight[heap[zz >> 1]]) {
\r
204 heap[zz] = heap[zz >> 1];
\r
209 if (!(nHeap < (BZip2Constants.MAX_ALPHA_SIZE+2))) {
\r
213 while (nHeap > 1) {
\r
215 heap[1] = heap[nHeap];
\r
219 int tmp = heap[zz];
\r
225 if (yy < nHeap && weight[heap[yy+1]] < weight[heap[yy]]) {
\r
228 if (weight[tmp] < weight[heap[yy]]) {
\r
232 heap[zz] = heap[yy];
\r
237 heap[1] = heap[nHeap];
\r
248 if (yy < nHeap && weight[heap[yy+1]] < weight[heap[yy]]) {
\r
251 if (weight[tmp] < weight[heap[yy]]) {
\r
254 heap[zz] = heap[yy];
\r
259 parent[n1] = parent[n2] = nNodes;
\r
261 weight[nNodes] = (int)((weight[n1] & 0xffffff00) + (weight[n2] & 0xffffff00)) |
\r
262 (int)(1 + (((weight[n1] & 0x000000ff) > (weight[n2] & 0x000000ff)) ? (weight[n1] & 0x000000ff) : (weight[n2] & 0x000000ff)));
\r
264 parent[nNodes] = -1;
\r
266 heap[nHeap] = nNodes;
\r
270 while (weight[tmp] < weight[heap[zz >> 1]]) {
\r
271 heap[zz] = heap[zz >> 1];
\r
276 if (!(nNodes < (BZip2Constants.MAX_ALPHA_SIZE * 2))) {
\r
281 for (int i = 1; i <= alphaSize; ++i) {
\r
284 while (parent[k] >= 0) {
\r
288 len[i - 1] = (char)j;
\r
298 for (int i = 1; i < alphaSize; ++i) {
\r
299 j = weight[i] >> 8;
\r
301 weight[i] = j << 8;
\r
307 index of the last char in the block, so
\r
308 the block size == last + 1.
\r
313 index in zptr[] of original string after sorting.
\r
318 always: in the range 0 .. 9.
\r
319 The current block size is 100000 * this number.
\r
323 bool blockRandomised;
\r
329 IChecksum mCrc = new StrangeCRC();
\r
331 bool[] inUse = new bool[256];
\r
334 char[] seqToUnseq = new char[256];
\r
335 char[] unseqToSeq = new char[256];
\r
337 char[] selector = new char[BZip2Constants.MAX_SELECTORS];
\r
338 char[] selectorMtf = new char[BZip2Constants.MAX_SELECTORS];
\r
348 int[] mtfFreq = new int[BZip2Constants.MAX_ALPHA_SIZE];
\r
351 * Used when sorting. If too many long comparisons
\r
352 * happen, we stop sorting, randomise the block
\r
353 * slightly, and try again.
\r
359 int nBlocksRandomised;
\r
361 int currentChar = -1;
\r
364 public BZip2OutputStream(Stream inStream) : this(inStream, 9)
\r
368 public BZip2OutputStream(Stream inStream, int inBlockSize)
\r
375 BsSetStream(inStream);
\r
378 if(inBlockSize > 9) {
\r
381 if(inBlockSize < 1) {
\r
384 blockSize100k = inBlockSize;
\r
385 AllocateCompressStructures();
\r
390 public override void WriteByte(byte bv)
\r
392 int b = (256 + bv) % 256;
\r
393 if (currentChar != -1) {
\r
394 if (currentChar == b) {
\r
396 if (runLength > 254) {
\r
414 if (last < allowableBlockSize) {
\r
415 inUse[currentChar] = true;
\r
416 for (int i = 0; i < runLength; i++) {
\r
417 mCrc.Update(currentChar);
\r
420 switch (runLength) {
\r
423 block[last + 1] = (byte)currentChar;
\r
427 block[last + 1] = (byte)currentChar;
\r
429 block[last + 1] = (byte)currentChar;
\r
433 block[last + 1] = (byte)currentChar;
\r
435 block[last + 1] = (byte)currentChar;
\r
437 block[last + 1] = (byte)currentChar;
\r
440 inUse[runLength - 4] = true;
\r
442 block[last + 1] = (byte)currentChar;
\r
444 block[last + 1] = (byte)currentChar;
\r
446 block[last + 1] = (byte)currentChar;
\r
448 block[last + 1] = (byte)currentChar;
\r
450 block[last + 1] = (byte)(runLength - 4);
\r
460 bool closed = false;
\r
462 public void Finalize()
\r
467 public override void Close()
\r
473 if (runLength > 0) {
\r
483 baseStream.Close();
\r
486 public override void Flush()
\r
489 baseStream.Flush();
\r
492 uint blockCRC, combinedCRC;
\r
498 nBlocksRandomised = 0;
\r
500 /*--- Write `magic' bytes h indicating file-format == huffmanised,
\r
501 followed by a digit indicating blockSize100k.
\r
503 BsPutUChar('B'); // -jr- 18-Nov-2003 added to match BZ2 1.02 header already in BZip class but that sux a lot
\r
507 BsPutUChar('0' + blockSize100k);
\r
512 int allowableBlockSize;
\r
521 for (int i = 0; i < 256; i++) {
\r
525 /*--- 20 is just a paranoia constant ---*/
\r
526 allowableBlockSize = BZip2Constants.baseBlockSize * blockSize100k - 20;
\r
531 if (last < 0) { //-jr- dont do anything for empty files, (makes empty files compatible with original Bzip)
\r
535 blockCRC = (uint)mCrc.Value;
\r
536 combinedCRC = (combinedCRC << 1) | (combinedCRC >> 31);
\r
537 combinedCRC ^= blockCRC;
\r
539 /*-- sort the block and establish posn of original string --*/
\r
540 DoReversibleTransformation();
\r
543 A 6-byte block header, the value chosen arbitrarily
\r
544 as 0x314159265359 :-). A 32 bit value does not really
\r
545 give a strong enough guarantee that the value will not
\r
546 appear by chance in the compressed datastream. Worst-case
\r
547 probability of this event, for a 900k block, is about
\r
548 2.0e-3 for 32 bits, 1.0e-5 for 40 bits and 4.0e-8 for 48 bits.
\r
549 For a compressed file of size 100Gb -- about 100000 blocks --
\r
550 only a 48-bit marker will do. NB: normal compression/
\r
551 decompression do *not* rely on these statistical properties.
\r
552 They are only important when trying to recover blocks from
\r
562 /*-- Now the block's CRC, so it is in a known place. --*/
\r
563 BsPutint((int)blockCRC);
\r
565 /*-- Now a single bit indicating randomisation. --*/
\r
566 if (blockRandomised) {
\r
568 nBlocksRandomised++;
\r
573 /*-- Finally, block's contents proper. --*/
\r
574 MoveToFrontCodeAndSend();
\r
577 void EndCompression()
\r
580 Now another magic 48-bit number, 0x177245385090, to
\r
581 indicate the end of the last block. (sqrt(pi), if
\r
582 you want to know. I did want to use e, but it contains
\r
583 too much repetition -- 27 18 28 18 28 46 -- for me
\r
584 to feel statistically comfortable. Call me paranoid.)
\r
593 BsPutint((int)combinedCRC);
\r
595 BsFinishedWithStream();
\r
598 void HbAssignCodes (int[] code, char[] length, int minLen, int maxLen, int alphaSize)
\r
601 for (int n = minLen; n <= maxLen; ++n) {
\r
602 for (int i = 0; i < alphaSize; ++i) {
\r
603 if (length[i] == n) {
\r
612 void BsSetStream(Stream f)
\r
620 void BsFinishedWithStream()
\r
622 while (bsLive > 0)
\r
624 int ch = (bsBuff >> 24);
\r
625 baseStream.WriteByte((byte)ch); // write 8-bit
\r
632 void BsW(int n, int v)
\r
634 while (bsLive >= 8) {
\r
635 int ch = (bsBuff >> 24);
\r
636 baseStream.WriteByte((byte)ch); // write 8-bit
\r
641 bsBuff |= (v << (32 - bsLive - n));
\r
645 void BsPutUChar(int c)
\r
650 void BsPutint(int u)
\r
652 BsW(8, (u >> 24) & 0xFF);
\r
653 BsW(8, (u >> 16) & 0xFF);
\r
654 BsW(8, (u >> 8) & 0xFF);
\r
658 void BsPutIntVS(int numBits, int c)
\r
663 void SendMTFValues()
\r
665 char[][] len = new char[BZip2Constants.N_GROUPS][];
\r
666 for (int i = 0; i < BZip2Constants.N_GROUPS; ++i) {
\r
667 len[i] = new char[BZip2Constants.MAX_ALPHA_SIZE];
\r
670 int gs, ge, totc, bt, bc, iter;
\r
671 int nSelectors = 0, alphaSize, minLen, maxLen, selCtr;
\r
672 int nGroups, nBytes;
\r
674 alphaSize = nInUse + 2;
\r
675 for (int t = 0; t < BZip2Constants.N_GROUPS; t++) {
\r
676 for (int v = 0; v < alphaSize; v++) {
\r
677 len[t][v] = (char)GREATER_ICOST;
\r
681 /*--- Decide how many coding tables to use ---*/
\r
688 } else if (nMTF < 600) {
\r
690 } else if (nMTF < 1200) {
\r
692 } else if (nMTF < 2400) {
\r
698 /*--- Generate an initial set of coding tables ---*/
\r
699 int nPart = nGroups;
\r
702 while (nPart > 0) {
\r
703 int tFreq = remF / nPart;
\r
706 while (aFreq < tFreq && ge < alphaSize - 1) {
\r
708 aFreq += mtfFreq[ge];
\r
711 if (ge > gs && nPart != nGroups && nPart != 1 && ((nGroups - nPart) % 2 == 1)) {
\r
712 aFreq -= mtfFreq[ge];
\r
716 for (int v = 0; v < alphaSize; v++) {
\r
717 if (v >= gs && v <= ge) {
\r
718 len[nPart - 1][v] = (char)LESSER_ICOST;
\r
720 len[nPart - 1][v] = (char)GREATER_ICOST;
\r
729 int[][] rfreq = new int[BZip2Constants.N_GROUPS][];
\r
730 for (int i = 0; i < BZip2Constants.N_GROUPS; ++i) {
\r
731 rfreq[i] = new int[BZip2Constants.MAX_ALPHA_SIZE];
\r
734 int[] fave = new int[BZip2Constants.N_GROUPS];
\r
735 short[] cost = new short[BZip2Constants.N_GROUPS];
\r
737 Iterate up to N_ITERS times to improve the tables.
\r
739 for (iter = 0; iter < BZip2Constants.N_ITERS; ++iter) {
\r
740 for (int t = 0; t < nGroups; ++t) {
\r
744 for (int t = 0; t < nGroups; ++t) {
\r
745 for (int v = 0; v < alphaSize; ++v) {
\r
754 /*--- Set group start & end marks. --*/
\r
758 ge = gs + BZip2Constants.G_SIZE - 1;
\r
764 Calculate the cost of this group as coded
\r
765 by each of the coding tables.
\r
767 for (int t = 0; t < nGroups; t++) {
\r
771 if (nGroups == 6) {
\r
772 short cost0, cost1, cost2, cost3, cost4, cost5;
\r
773 cost0 = cost1 = cost2 = cost3 = cost4 = cost5 = 0;
\r
774 for (int i = gs; i <= ge; ++i) {
\r
775 short icv = szptr[i];
\r
776 cost0 += (short)len[0][icv];
\r
777 cost1 += (short)len[1][icv];
\r
778 cost2 += (short)len[2][icv];
\r
779 cost3 += (short)len[3][icv];
\r
780 cost4 += (short)len[4][icv];
\r
781 cost5 += (short)len[5][icv];
\r
790 for (int i = gs; i <= ge; ++i) {
\r
791 short icv = szptr[i];
\r
792 for (int t = 0; t < nGroups; t++) {
\r
793 cost[t] += (short)len[t][icv];
\r
799 Find the coding table which is best for this group,
\r
800 and record its identity in the selector table.
\r
804 for (int t = 0; t < nGroups; ++t) {
\r
805 if (cost[t] < bc) {
\r
812 selector[nSelectors] = (char)bt;
\r
816 Increment the symbol frequencies for the selected table.
\r
818 for (int i = gs; i <= ge; ++i) {
\r
819 ++rfreq[bt][szptr[i]];
\r
826 Recompute the tables based on the accumulated frequencies.
\r
828 for (int t = 0; t < nGroups; ++t) {
\r
829 HbMakeCodeLengths(len[t], rfreq[t], alphaSize, 20);
\r
837 if (!(nGroups < 8)) {
\r
840 if (!(nSelectors < 32768 && nSelectors <= (2 + (900000 / BZip2Constants.G_SIZE)))) {
\r
844 /*--- Compute MTF values for the selectors. ---*/
\r
845 char[] pos = new char[BZip2Constants.N_GROUPS];
\r
846 char ll_i, tmp2, tmp;
\r
847 for (int i = 0; i < nGroups; i++) {
\r
850 for (int i = 0; i < nSelectors; i++) {
\r
851 ll_i = selector[i];
\r
854 while (ll_i != tmp) {
\r
861 selectorMtf[i] = (char)j;
\r
864 int[][] code = new int[BZip2Constants.N_GROUPS][];
\r
866 for (int i = 0; i < BZip2Constants.N_GROUPS; ++i) {
\r
867 code[i] = new int[BZip2Constants.MAX_ALPHA_SIZE];
\r
870 /*--- Assign actual codes for the tables. --*/
\r
871 for (int t = 0; t < nGroups; t++) {
\r
874 for (int i = 0; i < alphaSize; i++) {
\r
875 if (len[t][i] > maxLen) {
\r
876 maxLen = len[t][i];
\r
878 if (len[t][i] < minLen) {
\r
879 minLen = len[t][i];
\r
888 HbAssignCodes(code[t], len[t], minLen, maxLen, alphaSize);
\r
891 /*--- Transmit the mapping table. ---*/
\r
892 bool[] inUse16 = new bool[16];
\r
893 for (int i = 0; i < 16; ++i) {
\r
894 inUse16[i] = false;
\r
895 for (int j = 0; j < 16; ++j) {
\r
896 if (inUse[i * 16 + j]) {
\r
897 inUse16[i] = true;
\r
898 } // TODO : insert break;
\r
903 for (int i = 0; i < 16; ++i) {
\r
911 for (int i = 0; i < 16; ++i) {
\r
913 for (int j = 0; j < 16; ++j) {
\r
914 if (inUse[i * 16 + j]) {
\r
923 /*--- Now the selectors. ---*/
\r
926 BsW(15, nSelectors);
\r
927 for (int i = 0; i < nSelectors; ++i) {
\r
928 for (int j = 0; j < selectorMtf[i]; ++j) {
\r
934 /*--- Now the coding tables. ---*/
\r
937 for (int t = 0; t < nGroups; ++t) {
\r
938 int curr = len[t][0];
\r
940 for (int i = 0; i < alphaSize; ++i) {
\r
941 while (curr < len[t][i]) {
\r
945 while (curr > len[t][i]) {
\r
953 /*--- And finally, the block data proper ---*/
\r
961 ge = gs + BZip2Constants.G_SIZE - 1;
\r
966 for (int i = gs; i <= ge; i++) {
\r
967 BsW(len[selector[selCtr]][szptr[i]], code[selector[selCtr]][szptr[i]]);
\r
973 if (!(selCtr == nSelectors)) {
\r
978 void MoveToFrontCodeAndSend ()
\r
980 BsPutIntVS(24, origPtr);
\r
981 GenerateMTFValues();
\r
987 void SimpleSort(int lo, int hi, int d)
\r
989 int i, j, h, bigN, hp;
\r
992 bigN = hi - lo + 1;
\r
998 while (incs[hp] < bigN) {
\r
1003 for (; hp >= 0; hp--) {
\r
1013 while (FullGtU(zptr[j-h]+d, v+d)) {
\r
1014 zptr[j] = zptr[j-h];
\r
1016 if (j <= (lo + h - 1))
\r
1028 while (FullGtU ( zptr[j-h]+d, v+d )) {
\r
1029 zptr[j] = zptr[j-h];
\r
1031 if (j <= (lo + h - 1)) {
\r
1044 while (FullGtU ( zptr[j-h]+d, v+d)) {
\r
1045 zptr[j] = zptr[j-h];
\r
1047 if (j <= (lo + h - 1)) {
\r
1054 if (workDone > workLimit && firstAttempt) {
\r
1061 void Vswap(int p1, int p2, int n )
\r
1066 zptr[p1] = zptr[p2];
\r
1074 byte Med3(byte a, byte b, byte c )
\r
1100 void QSort3(int loSt, int hiSt, int dSt)
\r
1102 int unLo, unHi, ltLo, gtHi, med, n, m;
\r
1103 int sp, lo, hi, d;
\r
1104 StackElem[] stack = new StackElem[QSORT_STACK_SIZE];
\r
1105 for (int count = 0; count < QSORT_STACK_SIZE; count++) {
\r
1106 stack[count] = new StackElem();
\r
1111 stack[sp].ll = loSt;
\r
1112 stack[sp].hh = hiSt;
\r
1113 stack[sp].dd = dSt;
\r
1117 if (sp >= QSORT_STACK_SIZE) {
\r
1122 lo = stack[sp].ll;
\r
1123 hi = stack[sp].hh;
\r
1126 if (hi - lo < SMALL_THRESH || d > DEPTH_THRESH) {
\r
1127 SimpleSort(lo, hi, d);
\r
1128 if (workDone > workLimit && firstAttempt) {
\r
1134 med = Med3(block[zptr[lo] + d + 1],
\r
1135 block[zptr[hi ] + d + 1],
\r
1136 block[zptr[(lo + hi) >> 1] + d + 1]);
\r
1143 if (unLo > unHi) {
\r
1146 n = ((int)block[zptr[unLo]+d + 1]) - med;
\r
1149 temp = zptr[unLo];
\r
1150 zptr[unLo] = zptr[ltLo];
\r
1151 zptr[ltLo] = temp;
\r
1162 if (unLo > unHi) {
\r
1165 n = ((int)block[zptr[unHi]+d + 1]) - med;
\r
1168 temp = zptr[unHi];
\r
1169 zptr[unHi] = zptr[gtHi];
\r
1170 zptr[gtHi] = temp;
\r
1180 if (unLo > unHi) {
\r
1184 int temp = zptr[unLo];
\r
1185 zptr[unLo] = zptr[unHi];
\r
1186 zptr[unHi] = temp;
\r
1192 if (gtHi < ltLo) {
\r
1193 stack[sp].ll = lo;
\r
1194 stack[sp].hh = hi;
\r
1195 stack[sp].dd = d+1;
\r
1200 n = ((ltLo-lo) < (unLo-ltLo)) ? (ltLo-lo) : (unLo-ltLo);
\r
1201 Vswap(lo, unLo-n, n);
\r
1202 m = ((hi-gtHi) < (gtHi-unHi)) ? (hi-gtHi) : (gtHi-unHi);
\r
1203 Vswap(unLo, hi-m+1, m);
\r
1205 n = lo + unLo - ltLo - 1;
\r
1206 m = hi - (gtHi - unHi) + 1;
\r
1208 stack[sp].ll = lo;
\r
1213 stack[sp].ll = n + 1;
\r
1214 stack[sp].hh = m - 1;
\r
1215 stack[sp].dd = d+1;
\r
1219 stack[sp].hh = hi;
\r
1228 int[] runningOrder = new int[256];
\r
1229 int[] copy = new int[256];
\r
1230 bool[] bigDone = new bool[256];
\r
1235 In the various block-sized structures, live data runs
\r
1236 from 0 to last+NUM_OVERSHOOT_BYTES inclusive. First,
\r
1237 set up the overshoot area for block.
\r
1240 // if (verbosity >= 4) fprintf ( stderr, " sort initialise ...\n" );
\r
1241 for (i = 0; i < BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
\r
1242 block[last + i + 2] = block[(i % (last + 1)) + 1];
\r
1244 for (i = 0; i <= last + BZip2Constants.NUM_OVERSHOOT_BYTES; i++) {
\r
1248 block[0] = (byte)(block[last + 1]);
\r
1250 if (last < 4000) {
\r
1252 Use simpleSort(), since the full sorting mechanism
\r
1253 has quite a large constant overhead.
\r
1255 for (i = 0; i <= last; i++) {
\r
1258 firstAttempt = false;
\r
1259 workDone = workLimit = 0;
\r
1260 SimpleSort(0, last, 0);
\r
1263 for (i = 0; i <= 255; i++) {
\r
1264 bigDone[i] = false;
\r
1266 for (i = 0; i <= 65536; i++) {
\r
1271 for (i = 0; i <= last; i++) {
\r
1272 c2 = block[i + 1];
\r
1273 ftab[(c1 << 8) + c2]++;
\r
1277 for (i = 1; i <= 65536; i++) {
\r
1278 ftab[i] += ftab[i - 1];
\r
1282 for (i = 0; i < last; i++) {
\r
1283 c2 = block[i + 2];
\r
1284 j = (c1 << 8) + c2;
\r
1287 zptr[ftab[j]] = i;
\r
1290 j = ((block[last + 1]) << 8) + (block[1]);
\r
1292 zptr[ftab[j]] = last;
\r
1295 Now ftab contains the first loc of every small bucket.
\r
1296 Calculate the running order, from smallest to largest
\r
1300 for (i = 0; i <= 255; i++) {
\r
1301 runningOrder[i] = i;
\r
1308 } while (h <= 256);
\r
1311 for (i = h; i <= 255; i++) {
\r
1312 vv = runningOrder[i];
\r
1314 while ((ftab[((runningOrder[j-h])+1) << 8] - ftab[(runningOrder[j-h]) << 8]) > (ftab[((vv)+1) << 8] - ftab[(vv) << 8])) {
\r
1315 runningOrder[j] = runningOrder[j-h];
\r
1317 if (j <= (h - 1)) {
\r
1321 runningOrder[j] = vv;
\r
1326 The main sorting loop.
\r
1328 for (i = 0; i <= 255; i++) {
\r
1331 Process big buckets, starting with the least full.
\r
1333 ss = runningOrder[i];
\r
1336 Complete the big bucket [ss] by quicksorting
\r
1337 any unsorted small buckets [ss, j]. Hopefully
\r
1338 previous pointer-scanning phases have already
\r
1339 completed many of the small buckets [ss, j], so
\r
1340 we don't have to sort them at all.
\r
1342 for (j = 0; j <= 255; j++) {
\r
1343 sb = (ss << 8) + j;
\r
1344 if(!((ftab[sb] & SETMASK) == SETMASK)) {
\r
1345 int lo = ftab[sb] & CLEARMASK;
\r
1346 int hi = (ftab[sb+1] & CLEARMASK) - 1;
\r
1348 QSort3(lo, hi, 2);
\r
1349 numQSorted += (hi - lo + 1);
\r
1350 if (workDone > workLimit && firstAttempt) {
\r
1354 ftab[sb] |= SETMASK;
\r
1359 The ss big bucket is now done. Record this fact,
\r
1360 and update the quadrant descriptors. Remember to
\r
1361 update quadrants in the overshoot area too, if
\r
1362 necessary. The "if (i < 255)" test merely skips
\r
1363 this updating for the last bucket processed, since
\r
1364 updating for the last bucket is pointless.
\r
1366 bigDone[ss] = true;
\r
1369 int bbStart = ftab[ss << 8] & CLEARMASK;
\r
1370 int bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
\r
1373 while ((bbSize >> shifts) > 65534) {
\r
1377 for (j = 0; j < bbSize; j++) {
\r
1378 int a2update = zptr[bbStart + j];
\r
1379 int qVal = (j >> shifts);
\r
1380 quadrant[a2update] = qVal;
\r
1381 if (a2update < BZip2Constants.NUM_OVERSHOOT_BYTES) {
\r
1382 quadrant[a2update + last + 1] = qVal;
\r
1386 if (!(((bbSize-1) >> shifts) <= 65535)) {
\r
1392 Now scan this big bucket so as to synthesise the
\r
1393 sorted order for small buckets [t, ss] for all t != ss.
\r
1395 for (j = 0; j <= 255; j++) {
\r
1396 copy[j] = ftab[(j << 8) + ss] & CLEARMASK;
\r
1399 for (j = ftab[ss << 8] & CLEARMASK; j < (ftab[(ss+1) << 8] & CLEARMASK); j++) {
\r
1400 c1 = block[zptr[j]];
\r
1401 if (!bigDone[c1]) {
\r
1402 zptr[copy[c1]] = zptr[j] == 0 ? last : zptr[j] - 1;
\r
1407 for (j = 0; j <= 255; j++) {
\r
1408 ftab[(j << 8) + ss] |= SETMASK;
\r
1414 void RandomiseBlock()
\r
1419 for (i = 0; i < 256; i++) {
\r
1423 for (i = 0; i <= last; i++) {
\r
1424 if (rNToGo == 0) {
\r
1425 rNToGo = (int)BZip2Constants.rNums[rTPos];
\r
1427 if (rTPos == 512) {
\r
1432 block[i + 1] ^= (byte)((rNToGo == 1) ? 1 : 0);
\r
1433 // handle 16 bit signed numbers
\r
1434 block[i + 1] &= 0xFF;
\r
1436 inUse[block[i + 1]] = true;
\r
1440 void DoReversibleTransformation()
\r
1442 workLimit = workFactor * last;
\r
1444 blockRandomised = false;
\r
1445 firstAttempt = true;
\r
1449 if (workDone > workLimit && firstAttempt) {
\r
1451 workLimit = workDone = 0;
\r
1452 blockRandomised = true;
\r
1453 firstAttempt = false;
\r
1458 for (int i = 0; i <= last; i++) {
\r
1459 if (zptr[i] == 0) {
\r
1465 if (origPtr == -1) {
\r
1470 bool FullGtU(int i1, int i2)
\r
1476 c1 = block[i1 + 1];
\r
1477 c2 = block[i2 + 1];
\r
1484 c1 = block[i1 + 1];
\r
1485 c2 = block[i2 + 1];
\r
1492 c1 = block[i1 + 1];
\r
1493 c2 = block[i2 + 1];
\r
1500 c1 = block[i1 + 1];
\r
1501 c2 = block[i2 + 1];
\r
1508 c1 = block[i1 + 1];
\r
1509 c2 = block[i2 + 1];
\r
1516 c1 = block[i1 + 1];
\r
1517 c2 = block[i2 + 1];
\r
1527 c1 = block[i1 + 1];
\r
1528 c2 = block[i2 + 1];
\r
1532 s1 = quadrant[i1];
\r
1533 s2 = quadrant[i2];
\r
1540 c1 = block[i1 + 1];
\r
1541 c2 = block[i2 + 1];
\r
1545 s1 = quadrant[i1];
\r
1546 s2 = quadrant[i2];
\r
1553 c1 = block[i1 + 1];
\r
1554 c2 = block[i2 + 1];
\r
1558 s1 = quadrant[i1];
\r
1559 s2 = quadrant[i2];
\r
1566 c1 = block[i1 + 1];
\r
1567 c2 = block[i2 + 1];
\r
1571 s1 = quadrant[i1];
\r
1572 s2 = quadrant[i2];
\r
1596 Knuth's increments seem to work better
\r
1597 than Incerpi-Sedgewick here. Possibly
\r
1598 because the number of elems to sort is
\r
1599 usually small, typically <= 20.
\r
1601 readonly int[] incs = new int[] {
\r
1602 1, 4, 13, 40, 121, 364, 1093, 3280,
\r
1603 9841, 29524, 88573, 265720,
\r
1607 void AllocateCompressStructures()
\r
1609 int n = BZip2Constants.baseBlockSize * blockSize100k;
\r
1610 block = new byte[(n + 1 + BZip2Constants.NUM_OVERSHOOT_BYTES)];
\r
1611 quadrant = new int[(n + BZip2Constants.NUM_OVERSHOOT_BYTES)];
\r
1612 zptr = new int[n];
\r
1613 ftab = new int[65537];
\r
1615 if (block == null || quadrant == null || zptr == null || ftab == null) {
\r
1616 // int totalDraw = (n + 1 + NUM_OVERSHOOT_BYTES) + (n + NUM_OVERSHOOT_BYTES) + n + 65537;
\r
1617 // compressOutOfMemory ( totalDraw, n );
\r
1621 The back end needs a place to store the MTF values
\r
1622 whilst it calculates the coding tables. We could
\r
1623 put them in the zptr array. However, these values
\r
1624 will fit in a short, so we overlay szptr at the
\r
1625 start of zptr, in the hope of reducing the number
\r
1626 of cache misses induced by the multiple traversals
\r
1627 of the MTF values when calculating coding tables.
\r
1628 Seems to improve compression speed by about 1%.
\r
1633 szptr = new short[2 * n];
\r
1636 void GenerateMTFValues()
\r
1638 char[] yy = new char[256];
\r
1649 for (i = 0; i <= EOB; i++) {
\r
1655 for (i = 0; i < nInUse; i++) {
\r
1660 for (i = 0; i <= last; i++) {
\r
1663 ll_i = unseqToSeq[block[zptr[i]]];
\r
1667 while (ll_i != tmp) {
\r
1681 switch (zPend % 2) {
\r
1683 szptr[wr] = (short)BZip2Constants.RUNA;
\r
1685 mtfFreq[BZip2Constants.RUNA]++;
\r
1688 szptr[wr] = (short)BZip2Constants.RUNB;
\r
1690 mtfFreq[BZip2Constants.RUNB]++;
\r
1696 zPend = (zPend - 2) / 2;
\r
1700 szptr[wr] = (short)(j + 1);
\r
1709 switch (zPend % 2) {
\r
1711 szptr[wr] = (short)BZip2Constants.RUNA;
\r
1713 mtfFreq[BZip2Constants.RUNA]++;
\r
1716 szptr[wr] = (short)BZip2Constants.RUNB;
\r
1718 mtfFreq[BZip2Constants.RUNB]++;
\r
1724 zPend = (zPend - 2) / 2;
\r
1728 szptr[wr] = (short)EOB;
\r
1737 /* This file was derived from a file containing under this license:
\r
1739 * This file is a part of bzip2 and/or libbzip2, a program and
\r
1740 * library for lossless, block-sorting data compression.
\r
1742 * Copyright (C) 1996-1998 Julian R Seward. All rights reserved.
\r
1744 * Redistribution and use in source and binary forms, with or without
\r
1745 * modification, are permitted provided that the following conditions
\r
1748 * 1. Redistributions of source code must retain the above copyright
\r
1749 * notice, this list of conditions and the following disclaimer.
\r
1751 * 2. The origin of this software must not be misrepresented; you must
\r
1752 * not claim that you wrote the original software. If you use this
\r
1753 * software in a product, an acknowledgment in the product
\r
1754 * documentation would be appreciated but is not required.
\r
1756 * 3. Altered source versions must be plainly marked as such, and must
\r
1757 * not be misrepresented as being the original software.
\r
1759 * 4. The name of the author may not be used to endorse or promote
\r
1760 * products derived from this software without specific prior written
\r
1763 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
\r
1764 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
\r
1765 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
\r
1766 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
\r
1767 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
\r
1768 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
\r
1769 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
\r
1770 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
\r
1771 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
\r
1772 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
\r
1773 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\r
1775 * Java version ported by Keiron Liddle, Aftex Software <keiron@aftexsw.com> 1999-2001
\r