001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.compress.harmony.pack200; 018 019import java.io.EOFException; 020import java.io.IOException; 021import java.io.InputStream; 022import java.util.ArrayList; 023import java.util.List; 024 025/** 026 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" 027 * encoding mechanism. It uses a variable-length encoding and a modified sign representation such that small numbers are 028 * represented as a single byte, whilst larger numbers take more bytes to encode. The number may be signed or unsigned; 029 * if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's complement. The 030 * Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. 031 * So a delta encoding of the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute 032 * value of a coded integer to fall outside of the 'small number' range, whilst still being encoded as a single byte. 033 * 034 * A BHSD codec is configured with four parameters: 035 * <dl> 036 * <dt>B</dt> 037 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through 038 * coding (where each byte is encoded as itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd> 039 * <dt>H</dt> 040 * <dd>The radix of the integer. Values are defined as a sequence of values, where value <code>n</code> is multiplied by 041 * <code>H^<sup>n</sup></code>. So the number 1234 may be represented as the sequence 4 3 2 1 with a radix (H) of 10. 042 * Note that other permutations are also possible; 43 2 1 will also encode 1234. The co-parameter L is defined as 256-H. 043 * This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd> 044 * <dt>S</dt> 045 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's 046 * complement) or 2 (signed, two's complement)</dd> 047 * <dt>D</dt> 048 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding 049 * of 1 indicates that values are cumulative; a sequence of <code>1 1 1 1 1</code> will represent the sequence 050 * <code>1 2 3 4 5</code>. For this reason, the codec supports two variants of decode; one 051 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a <code>last</code> parameter. 052 * If the codec is a non-delta encoding, then the value is ignored if passed. If the codec is a delta encoding, it is a 053 * run-time error to call the value without the extra parameter, and the previous value should be returned. (It was 054 * designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for 055 * each use.) 056 * <dt> 057 * </dl> 058 * 059 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted 060 * (1,256,0,0) or (1,256). The {@link #toString()} method prints out the condensed form of the encoding. Often, the last 061 * character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B value. Those that start with U 062 * ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the 063 * word Delta ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used. 064 * 065 */ 066public final class BHSDCodec extends Codec { 067 068 /** 069 * The maximum number of bytes in each coding word 070 */ 071 private final int b; 072 073 /** 074 * Whether delta encoding is used (0=false,1=true) 075 */ 076 private final int d; 077 078 /** 079 * The radix of the encoding 080 */ 081 private final int h; 082 083 /** 084 * The co-parameter of h; 256-h 085 */ 086 private final int l; 087 088 /** 089 * Represents signed numbers or not (0=unsigned,1/2=signed) 090 */ 091 private final int s; 092 093 private long cardinality; 094 095 private final long smallest; 096 097 private final long largest; 098 099 /** 100 * radix^i powers 101 */ 102 private final long[] powers; 103 104 /** 105 * Constructs an unsigned, non-delta Codec with the given B and H values. 106 * 107 * @param b the maximum number of bytes that a value can be encoded as [1..5] 108 * @param h the radix of the encoding [1..256] 109 */ 110 public BHSDCodec(final int b, final int h) { 111 this(b, h, 0, 0); 112 } 113 114 /** 115 * Constructs a non-delta Codec with the given B, H and S values. 116 * 117 * @param b the maximum number of bytes that a value can be encoded as [1..5] 118 * @param h the radix of the encoding [1..256] 119 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 120 * is signed with ?) 121 */ 122 public BHSDCodec(final int b, final int h, final int s) { 123 this(b, h, s, 0); 124 } 125 126 /** 127 * Constructs a Codec with the given B, H, S and D values. 128 * 129 * @param b the maximum number of bytes that a value can be encoded as [1..5] 130 * @param h the radix of the encoding [1..256] 131 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 132 * is signed with ?) 133 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta) 134 */ 135 public BHSDCodec(final int b, final int h, final int s, final int d) { 136 if (b < 1 || b > 5) { 137 throw new IllegalArgumentException("1<=b<=5"); 138 } 139 if (h < 1 || h > 256) { 140 throw new IllegalArgumentException("1<=h<=256"); 141 } 142 if (s < 0 || s > 2) { 143 throw new IllegalArgumentException("0<=s<=2"); 144 } 145 if (d < 0 || d > 1) { 146 throw new IllegalArgumentException("0<=d<=1"); 147 } 148 if (b == 1 && h != 256) { 149 throw new IllegalArgumentException("b=1 -> h=256"); 150 } 151 if (h == 256 && b == 5) { 152 throw new IllegalArgumentException("h=256 -> b!=5"); 153 } 154 this.b = b; 155 this.h = h; 156 this.s = s; 157 this.d = d; 158 this.l = 256 - h; 159 if (h == 1) { 160 cardinality = b * 255 + 1; 161 } else { 162 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b)); 163 } 164 smallest = calculateSmallest(); 165 largest = calculateLargest(); 166 167 powers = new long[b]; 168 for (int c = 0; c < b; c++) { 169 powers[c] = (long) Math.pow(h, c); 170 } 171 } 172 173 /** 174 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain. 175 * 176 * @return the cardinality of this codec 177 */ 178 public long cardinality() { 179 return cardinality; 180 } 181 182 @Override 183 public int decode(final InputStream in) throws IOException, Pack200Exception { 184 if (d != 0) { 185 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error"); 186 } 187 return decode(in, 0); 188 } 189 190 @Override 191 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception { 192 int n = 0; 193 long z = 0; 194 long x = 0; 195 196 do { 197 x = in.read(); 198 lastBandLength++; 199 z += x * powers[n]; 200 n++; 201 } while (x >= l && n < b); 202 203 if (x == -1) { 204 throw new EOFException("End of stream reached whilst decoding"); 205 } 206 207 if (isSigned()) { 208 final int u = ((1 << s) - 1); 209 if ((z & u) == u) { 210 z = z >>> s ^ -1L; 211 } else { 212 z = z - (z >>> s); 213 } 214 } 215 // This algorithm does the same thing, but is probably slower. Leaving 216 // in for now for readability 217 // if(isSigned()) { 218 // long u = z; 219 // long twoPowS = (long)Math.pow(2, s); 220 // double twoPowSMinusOne = twoPowS-1; 221 // if(u % twoPowS < twoPowSMinusOne) { 222 // if(cardinality < Math.pow(2, 32)) { 223 // z = (long) (u - (Math.floor(u/ twoPowS))); 224 // } else { 225 // z = cast32((long) (u - (Math.floor(u/ twoPowS)))); 226 // } 227 // } else { 228 // z = (long) (-Math.floor(u/ twoPowS) - 1); 229 // } 230 // } 231 if (isDelta()) { 232 z += last; 233 } 234 return (int) z; 235 } 236 237 @Override 238 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception { 239 final int[] band = super.decodeInts(n, in); 240 if (isDelta()) { 241 for (int i = 0; i < band.length; i++) { 242 while (band[i] > largest) { 243 band[i] -= cardinality; 244 } 245 while (band[i] < smallest) { 246 band[i] += cardinality; 247 } 248 } 249 } 250 return band; 251 } 252 253 @Override 254 public int[] decodeInts(final int n, final InputStream in, final int firstValue) 255 throws IOException, Pack200Exception { 256 final int[] band = super.decodeInts(n, in, firstValue); 257 if (isDelta()) { 258 for (int i = 0; i < band.length; i++) { 259 while (band[i] > largest) { 260 band[i] -= cardinality; 261 } 262 while (band[i] < smallest) { 263 band[i] += cardinality; 264 } 265 } 266 } 267 return band; 268 } 269 270 // private long cast32(long u) { 271 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) - 272 // Math.pow(2, 31)); 273 // return u; 274 // } 275 276 /** 277 * True if this encoding can code the given value 278 * 279 * @param value the value to check 280 * @return <code>true</code> if the encoding can encode this value 281 */ 282 public boolean encodes(final long value) { 283 return value >= smallest && value <= largest; 284 } 285 286 @Override 287 public byte[] encode(final int value, final int last) throws Pack200Exception { 288 if (!encodes(value)) { 289 throw new Pack200Exception("The codec " + toString() + " does not encode the value " + value); 290 } 291 292 long z = value; 293 if (isDelta()) { 294 z -= last; 295 } 296 if (isSigned()) { 297 if (z < Integer.MIN_VALUE) { 298 z += 4294967296L; 299 } else if (z > Integer.MAX_VALUE) { 300 z -= 4294967296L; 301 } 302 if (z < 0) { 303 z = (-z << s) - 1; 304 } else if (s == 1) { 305 z = z << s; 306 } else { 307 z += (z - z % 3) / 3; 308 } 309 } else if (z < 0) { 310 // Need to use integer overflow here to represent negatives. 311 if (cardinality < 4294967296L) { 312 z += cardinality; 313 } else { 314 z += 4294967296L; // this value is equal to (1 << 32). 315 } 316 } 317 if (z < 0) { 318 throw new Pack200Exception("unable to encode"); 319 } 320 321 final List byteList = new ArrayList(); 322 for (int n = 0; n < b; n++) { 323 long byteN; 324 if (z < l) { 325 byteN = z; 326 } else { 327 byteN = z % h; 328 while (byteN < l) { 329 byteN += h; 330 } 331 } 332 byteList.add(Byte.valueOf((byte) byteN)); 333 if (byteN < l) { 334 break; 335 } 336 z -= byteN; 337 z /= h; 338 } 339 final byte[] bytes = new byte[byteList.size()]; 340 for (int i = 0; i < bytes.length; i++) { 341 bytes[i] = ((Byte) byteList.get(i)).byteValue(); 342 } 343 return bytes; 344 } 345 346 @Override 347 public byte[] encode(final int value) throws Pack200Exception { 348 return encode(value, 0); 349 } 350 351 /** 352 * Returns true if this codec is a delta codec 353 * 354 * @return true if this codec is a delta codec 355 */ 356 public boolean isDelta() { 357 return d != 0; 358 } 359 360 /** 361 * Returns true if this codec is a signed codec 362 * 363 * @return true if this codec is a signed codec 364 */ 365 public boolean isSigned() { 366 return s != 0; 367 } 368 369 /** 370 * Returns the largest value that this codec can represent. 371 * 372 * @return the largest value that this codec can represent. 373 */ 374 public long largest() { 375 return largest; 376 } 377 378 private long calculateLargest() { 379 long result; 380 // TODO This can probably be optimized into a better mathematical 381 // statement 382 if (d == 1) { 383 final BHSDCodec bh0 = new BHSDCodec(b, h); 384 return bh0.largest(); 385 } 386 if (s == 0) { 387 result = cardinality() - 1; 388 } else if (s == 1) { 389 result = cardinality() / 2 - 1; 390 } else if (s == 2) { 391 result = (3L * cardinality()) / 4 - 1; 392 } else { 393 throw new Error("Unknown s value"); 394 } 395 return Math.min((s == 0 ? ((long) Integer.MAX_VALUE) << 1 : Integer.MAX_VALUE) - 1, result); 396 } 397 398 /** 399 * Returns the smallest value that this codec can represent. 400 * 401 * @return the smallest value that this codec can represent. 402 */ 403 public long smallest() { 404 return smallest; 405 } 406 407 private long calculateSmallest() { 408 long result; 409 if (d == 1 || !isSigned()) { 410 if (cardinality >= 4294967296L) { // 2^32 411 result = Integer.MIN_VALUE; 412 } else { 413 result = 0; 414 } 415 } else { 416 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s)); 417 } 418 return result; 419 } 420 421 /** 422 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown. 423 */ 424 @Override 425 public String toString() { 426 final StringBuffer buffer = new StringBuffer(11); 427 buffer.append('('); 428 buffer.append(b); 429 buffer.append(','); 430 buffer.append(h); 431 if (s != 0 || d != 0) { 432 buffer.append(','); 433 buffer.append(s); 434 } 435 if (d != 0) { 436 buffer.append(','); 437 buffer.append(d); 438 } 439 buffer.append(')'); 440 return buffer.toString(); 441 } 442 443 /** 444 * @return the b 445 */ 446 public int getB() { 447 return b; 448 } 449 450 /** 451 * @return the h 452 */ 453 public int getH() { 454 return h; 455 } 456 457 /** 458 * @return the s 459 */ 460 public int getS() { 461 return s; 462 } 463 464 /** 465 * @return the l 466 */ 467 public int getL() { 468 return l; 469 } 470 471 @Override 472 public boolean equals(final Object o) { 473 if (o instanceof BHSDCodec) { 474 final BHSDCodec codec = (BHSDCodec) o; 475 return codec.b == b && codec.h == h && codec.s == s && codec.d == d; 476 } 477 return false; 478 } 479 480 @Override 481 public int hashCode() { 482 return ((b * 37 + h) * 37 + s) * 37 + d; 483 } 484}