--- /dev/null
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ * and Joseph Sutton <josephsutton@catalyst.net.nz>
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/byteorder.h"
+#include "lib/util/bytearray.h"
+
+
+#define LZXPRESS_ERROR -1LL
+
+
+struct bitstream {
+ const uint8_t *bytes;
+ size_t byte_pos;
+ size_t byte_size;
+ uint32_t bits;
+ int remaining_bits;
+ uint16_t *table;
+};
+
+
+/**
+ * Determines the sort order of one prefix_code_symbol relative to another
+ */
+static int compare_uint16(const uint16_t *a, const uint16_t *b)
+{
+ if (*a < *b) {
+ return -1;
+ }
+ if (*a > *b) {
+ return 1;
+ }
+ return 0;
+}
+
+
+static bool fill_decomp_table(struct bitstream *input)
+{
+ /*
+ * There are 512 symbols, each encoded in 4 bits, which indicates
+ * their depth in the Huffman tree. The even numbers get the lower
+ * nibble of each byte, so that the byte hex values look backwards
+ * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
+ * order of appearance, per depth.
+ *
+ * For example, if the first two bytes were:
+ *
+ * 0x23 0x53
+ *
+ * the first four codes have the lengths 3, 2, 3, 5.
+ * Let's call them A, B, C, D.
+ *
+ * Suppose there is no other codeword with length 1 (which is
+ * necessarily true in this example) or 2, but there might be others
+ * of length 3 or 4. Then we can say this about the codes:
+ *
+ * _ --*--_
+ * / \
+ * 0 1
+ * / \ / \
+ * 0 1 0 1
+ * B |\ / \ |\
+ * 0 1 0 1 0 1
+ * A C |\ /| | |\
+ *
+ * pos bits code
+ * A 3 010
+ * B 2 00
+ * C 3 011
+ * D 5 1????
+ *
+ * B has the shortest code, so takes the leftmost branch, 00. That
+ * ends the branch -- nothing else can start with 00. There are no
+ * more 2s, so we look at the 3s, starting as far left as possible. So
+ * A takes 010 and C takes 011. That means everything else has to
+ * start with 1xx. We don't know how many codewords of length 3 or 4
+ * there are; if there are none, D would end up with 10000, the
+ * leftmost available code of length 5. If the compressor is any good,
+ * there should be no unused leaf nodes left dangling at the end.
+ *
+ * (this is "Canonical Huffman Coding").
+ *
+ *
+ * But what symbols do these codes actually stand for?
+ * --------------------------------------------------
+ *
+ * Good question. The first 256 codes stand for the corresponding
+ * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
+ * which have a distance and a length, encoded in a strange way that
+ * isn't entirely the purview of this function.
+ *
+ * What does the value 0 mean?
+ * ---------------------------
+ *
+ * The code does not occur. For example, if the next byte in the
+ * example above was 0x07, that would give the byte 0x04 a 7-long
+ * code, and no code to the 0x05 byte, which means we there is no way
+ * we going to see a 5 in the decoded stream.
+ *
+ * Isn't LZ77 + Huffman what zip/gzip/zlib do?
+ * -------------------------------------------
+ *
+ * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
+ */
+ uint16_t symbols[512];
+ uint16_t sort_mem[512];
+ size_t i, n_symbols;
+ ssize_t code;
+ uint16_t len, prev_len;
+ const uint8_t *table_bytes = input->bytes + input->byte_pos;
+
+ if (input->byte_pos + 260 > input->byte_size) {
+ return false;
+ }
+
+ n_symbols = 0;
+ for (i = 0; i < 256; i++) {
+ uint16_t even = table_bytes[i] & 15;
+ uint16_t odd = table_bytes[i] >> 4;
+ if (even != 0) {
+ symbols[n_symbols] = (even << 9) + i * 2;
+ n_symbols++;
+ }
+ if (odd != 0) {
+ symbols[n_symbols] = (odd << 9) + i * 2 + 1;
+ n_symbols++;
+ }
+ }
+ input->byte_pos += 256;
+ if (n_symbols == 0) {
+ return false;
+ }
+
+ stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
+ (samba_compare_fn_t)compare_uint16);
+
+ /*
+ * we're using an implicit binary tree, as you'd see in a heap.
+ * table[0] = unused
+ * table[1] = '0'
+ * table[2] = '1'
+ * table[3] = '00' <-- '00' and '01' are children of '0'
+ * table[4] = '01' <-- '0' is [0], children are [0 * 2 + {1,2}]
+ * table[5] = '10'
+ * table[6] = '11'
+ * table[7] = '000'
+ * table[8] = '001'
+ * table[9] = '010'
+ * table[10]= '011'
+ * table[11]= '100
+ *'
+ * table[1 << n - 1] = '0' * n
+ * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
+ * table[1 << n - 2] = '1' * (n - 1)
+ *
+ * table[i]->left = table[i*2 + 1]
+ * table[i]->right = table[i*2 + 2]
+ * table[0xffff] = unused (16 '0's, max len is 15)
+ *
+ * therefore e.g. table[70] = table[64 - 1 + 7]
+ * = table[1 << 6 - 1 + 7]
+ * = '000111' (binary 7, widened to 6 bits)
+ *
+ * and if '000111' is a code,
+ * '00011', '0001', '000', '00', '0' are unavailable prefixes.
+ * 34 16 7 3 1 are their indices
+ * and (i - 1) >> 1 is the path back from 70 through these.
+ *
+ * the lookup is
+ *
+ * 1 start with i = 0
+ * 2 extract a symbol bit (i = (i << 1) + bit + 1)
+ * 3 is table[i] == 0xffff?
+ * 4 yes -- goto 2
+ * 4 table[i] & 511 is the symbol, stop
+ *
+ * and the construction (here) is sort of the reverse.
+ *
+ * Most of this table is free space that can never be reached, and
+ * most of the activity is at the beginning (since all codes start
+ * there, and by design the shortest codes are the most common).
+ */
+ for (i = 0; i < 32; i++) {
+ /* prefill the table head */
+ input->table[i] = 0xffff;
+ }
+ code = -1;
+ prev_len = 0;
+ for (i = 0; i < n_symbols; i++) {
+ uint16_t s = symbols[i];
+ uint16_t prefix;
+ len = (s >> 9) & 15;
+ s &= 511;
+ code++;
+ while (len != prev_len) {
+ code <<= 1;
+ code++;
+ prev_len++;
+ }
+
+ if (code >= 65535) {
+ return false;
+ }
+ input->table[code] = s;
+ for(prefix = (code - 1) >> 1;
+ prefix > 31;
+ prefix = (prefix - 1) >> 1) {
+ input->table[prefix] = 0xffff;
+ }
+ }
+
+ /*
+ * check that the last code encodes 11111..., with right number of
+ * ones, pointing to the right symbol -- otherwise we have a dangling
+ * uninitialised symbol.
+ */
+ if (code != (1 << (len + 1)) - 2) {
+ return false;
+ }
+ return true;
+}
+
+
+#define CHECK_READ_32(dest) \
+ do { \
+ if (input->byte_pos + 4 > input->byte_size) { \
+ return LZXPRESS_ERROR; \
+ } \
+ dest = PULL_LE_U32(input->bytes, input->byte_pos); \
+ input->byte_pos += 4; \
+ } while (0)
+
+#define CHECK_READ_16(dest) \
+ do { \
+ if (input->byte_pos + 2 > input->byte_size) { \
+ return LZXPRESS_ERROR; \
+ } \
+ dest = PULL_LE_U16(input->bytes, input->byte_pos); \
+ input->byte_pos += 2; \
+ } while (0)
+
+#define CHECK_READ_8(dest) \
+ do { \
+ if (input->byte_pos >= input->byte_size) { \
+ return LZXPRESS_ERROR; \
+ } \
+ dest = PULL_LE_U8(input->bytes, input->byte_pos); \
+ input->byte_pos++; \
+ } while(0)
+
+
+static inline ssize_t pull_bits(struct bitstream *input)
+{
+ if (input->byte_pos + 1 < input->byte_size) {
+ uint16_t tmp;
+ CHECK_READ_16(tmp);
+ input->remaining_bits += 16;
+ input->bits <<= 16;
+ input->bits |= tmp;
+ } else if (input->byte_pos < input->byte_size) {
+ uint8_t tmp;
+ CHECK_READ_8(tmp);
+ input->remaining_bits += 8;
+ input->bits <<= 8;
+ input->bits |= tmp;
+ } else {
+ return LZXPRESS_ERROR;
+ }
+ return 0;
+}
+
+
+/*
+ * Decompress a block. The actual decompressed size is returned (or -1 on
+ * error). The putative block length is 64k (or shorter, if the message ends
+ * first), but a match can run over the end, extending the block. That's why
+ * we need the overall output size as well as the block size. A match encoded
+ * in this block can point back to previous blocks, but not before the
+ * beginning of the message, so we also need the previously decoded size.
+ *
+ * The compressed block will have 256 bytes for the Huffman table, and at
+ * least 4 bytes of (possibly padded) encoded values.
+ */
+static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
+ uint8_t *output,
+ size_t block_size,
+ size_t output_size,
+ size_t previous_size)
+{
+ size_t output_pos = 0;
+ uint16_t symbol;
+ size_t index;
+ uint16_t distance_bits_wanted = 0;
+ size_t distance = 0;
+ size_t length = 0;
+ bool ok;
+ uint32_t tmp;
+ bool seen_eof_marker = false;
+
+ ok = fill_decomp_table(input);
+ if (! ok) {
+ return LZXPRESS_ERROR;
+ }
+
+ /*
+ * Always read 32 bits at the start, even if we don't need them.
+ */
+ CHECK_READ_16(tmp);
+ CHECK_READ_16(input->bits);
+ input->bits |= tmp << 16;
+ input->remaining_bits = 32;
+
+ /*
+ * This loop iterates over individual *bits*. These are read from
+ * little-endian 16 bit words, most significant bit first.
+ *
+ * At points in the bitstream, the following are possible:
+ *
+ * # the source word is empty and needs to be refilled from the input
+ * stream.
+ * # an incomplete codeword is being extended.
+ * # a codeword is resolved, either as a literal or a match.
+ * # a literal is written.
+ * # a match is collecting distance bits.
+ * # the output stream is copied, as specified by a match.
+ * # input bytes are read for match lengths.
+ *
+ * Note that we *don't* specifically check for the EOF marker (symbol
+ * 256) in this loop, because the a precondition for stopping for the
+ * EOF marker is that the output buffer is full (otherwise, you
+ * wouldn't know which 256 is EOF, rather than an actual symbol), and
+ * we *always* want to stop when the buffer is full. So we work out if
+ * there is an EOF in in another loop after we stop writing.
+ */
+
+ index = 0;
+ while (output_pos < block_size) {
+ uint16_t b;
+ if (input->remaining_bits == 16) {
+ ssize_t ret = pull_bits(input);
+ if (ret) {
+ return ret;
+ }
+ }
+ input->remaining_bits--;
+
+ b = (input->bits >> input->remaining_bits) & 1;
+ if (length == 0) {
+ /* not in a match; pulling a codeword */
+ index <<= 1;
+ index += b + 1;
+ if (input->table[index] == 0xffff) {
+ /* incomplete codeword, the common case */
+ continue;
+ }
+ /* found the symbol, reset the code string */
+ symbol = input->table[index] & 511;
+ index = 0;
+ if (symbol < 256) {
+ /* a literal, the easy case */
+ output[output_pos] = symbol;
+ output_pos++;
+ continue;
+ }
+
+ /* the beginning of a match */
+ distance_bits_wanted = (symbol >> 4) & 15;
+ distance = 1 << distance_bits_wanted;
+ length = symbol & 15;
+ if (length == 15) {
+ CHECK_READ_8(tmp);
+ length += tmp;
+ if (length == 255 + 15) {
+ /*
+ * note, we discard (don't add) the
+ * length so far.
+ */
+ CHECK_READ_16(length);
+ if (length == 0) {
+ CHECK_READ_32(length);
+ }
+ }
+ }
+ length += 3;
+ } else {
+ /* we are pulling extra distance bits */
+ distance_bits_wanted--;
+ distance |= b << distance_bits_wanted;
+ }
+
+ if (distance_bits_wanted == 0) {
+ /*
+ * We have a complete match, and it is time to do the
+ * copy (byte by byte, because the ranges can overlap,
+ * and we might need to copy bytes we just copied in).
+ *
+ * It is possible that this match will extend beyond
+ * the end of the expected block. That's fine, so long
+ * as it doesn't extend past the total output size.
+ */
+ size_t end = output_pos + length;
+ if (end > output_size ||
+ previous_size + output_pos < distance ||
+ unlikely(end < output_pos)) {
+ return LZXPRESS_ERROR;
+ }
+
+ for (; output_pos < end; output_pos++) {
+ output[output_pos] = \
+ output[output_pos - distance];
+ }
+ distance = 0;
+ length = 0;
+ }
+ }
+
+ if (length != 0 || index != 0) {
+ /* it seems like we've hit an early end, mid-code */
+ return LZXPRESS_ERROR;
+ }
+
+ if (input->byte_pos + 256 < input->byte_size) {
+ /*
+ * This block is over, but it clearly isn't the last block, so
+ * we don't want to look for the EOF.
+ */
+ return output_pos;
+ }
+ /*
+ * We won't write any more, but we try to read some more to make sure
+ * we're finishing in a good place. That means we want to see a 256
+ * symbol and then some number of zeroes, possibly zero, but as many
+ * as 32.
+ *
+ * In this we are perhaps a bit stricter than Windows, which
+ * apparently does not insist on the EOF marker, nor on a lack of
+ * trailing bytes.
+ */
+ while (true) {
+ uint16_t b;
+ if (input->remaining_bits == 16) {
+ ssize_t ret;
+ if (input->byte_pos == input->byte_size) {
+ /* FIN */
+ break;
+ }
+ ret = pull_bits(input);
+ if (ret) {
+ return ret;
+ }
+ }
+ input->remaining_bits--;
+ b = (input->bits >> input->remaining_bits) & 1;
+ if (seen_eof_marker) {
+ /*
+ * we have read an EOF symbols. Now we just want to
+ * see zeroes.
+ */
+ if (b != 0) {
+ return LZXPRESS_ERROR;
+ }
+ continue;
+ }
+
+ /* we're pulling in a symbol, which had better be 256 */
+ index <<= 1;
+ index += b + 1;
+ if (input->table[index] == 0xffff) {
+ continue;
+ }
+
+ symbol = input->table[index] & 511;
+ if (symbol != 256) {
+ return LZXPRESS_ERROR;
+ }
+ seen_eof_marker = true;
+ continue;
+ }
+
+ if (! seen_eof_marker) {
+ return LZXPRESS_ERROR;
+ }
+
+ return output_pos;
+}
+
+static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
+ uint8_t *output,
+ size_t output_size)
+{
+ size_t output_pos = 0;
+
+ if (input->byte_size < 260) {
+ return LZXPRESS_ERROR;
+ }
+
+ while (input->byte_pos < input->byte_size) {
+ ssize_t block_output_pos;
+ ssize_t block_output_size;
+ size_t remaining_output_size = output_size - output_pos;
+
+ block_output_size = MIN(65536, remaining_output_size);
+
+ block_output_pos = lzx_huffman_decompress_block(
+ input,
+ output + output_pos,
+ block_output_size,
+ remaining_output_size,
+ output_pos);
+
+ if (block_output_pos < block_output_size) {
+ return LZXPRESS_ERROR;
+ }
+ output_pos += block_output_pos;
+ if (output_pos > output_size) {
+ /* not expecting to get here. */
+ return LZXPRESS_ERROR;
+ }
+ }
+
+ if (input->byte_pos != input->byte_size) {
+ return LZXPRESS_ERROR;
+ }
+
+ return output_pos;
+}
+
+
+/*
+ * lzxpress_huffman_decompress()
+ *
+ * output_size must be the expected length of the decompressed data.
+ * input_size and output_size are limited to the minimum of UINT32_MAX and
+ * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
+ *
+ * @param input_bytes memory to be decompressed.
+ * @param input_size length of the compressed buffer.
+ * @param output destination for the decompressed data.
+ * @param output_size exact expected length of the decompressed data.
+ *
+ * @return the number of bytes written or -1 on error.
+ */
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
+ size_t input_size,
+ uint8_t *output,
+ size_t output_size)
+{
+ uint16_t table[65536];
+ struct bitstream input = {
+ .bytes = input_bytes,
+ .byte_size = input_size,
+ .byte_pos = 0,
+ .bits = 0,
+ .remaining_bits = 0,
+ .table = table
+ };
+
+ if (input_size > SSIZE_MAX ||
+ input_size > UINT32_MAX ||
+ output_size > SSIZE_MAX ||
+ output_size > UINT32_MAX ||
+ input_size == 0 ||
+ output_size == 0 ||
+ input_bytes == NULL ||
+ output == NULL) {
+ /*
+ * We use negative ssize_t to return errors, which is limiting
+ * on 32 bit machines, and the 4GB limit exists on Windows.
+ */
+ return LZXPRESS_ERROR;
+ }
+
+ return lzxpress_huffman_decompress_internal(&input,
+ output,
+ output_size);
+}
+
+
+/**
+ * lzxpress_huffman_decompress_talloc()
+ *
+ * The caller must provide the exact size of the expected output.
+ *
+ * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
+ * output_size is limited to 256MB due to a limit in talloc. This effectively
+ * limits input_size too, as non-crafted compressed data will not exceed the
+ * decompressed size by very much.
+ *
+ * @param mem_ctx TALLOC_CTX parent for the decompressed buffer.
+ * @param input_bytes memory to be decompressed.
+ * @param input_size length of the compressed buffer.
+ * @param output_size expected decompressed size.
+ *
+ * @return a talloc'ed buffer exactly output_size in length, or NULL.
+ */
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+ const uint8_t *input_bytes,
+ size_t input_size,
+ size_t output_size)
+{
+ ssize_t result;
+ uint8_t *output = NULL;
+ struct bitstream input = {
+ .bytes = input_bytes,
+ .byte_size = input_size
+ };
+
+ output = talloc_array(mem_ctx, uint8_t, output_size);
+ if (output == NULL) {
+ return NULL;
+ }
+
+ input.table = talloc_array(mem_ctx, uint16_t, 65536);
+ if (input.table == NULL) {
+ talloc_free(output);
+ return NULL;
+ }
+ result = lzxpress_huffman_decompress_internal(&input,
+ output,
+ output_size);
+ talloc_free(input.table);
+
+ if (result != output_size) {
+ talloc_free(output);
+ return NULL;
+ }
+ return output;
+}
--- /dev/null
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *
+ * ** NOTE! The following LGPL license applies to this file.
+ * ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+#include <stdbool.h>
+#include "replace.h"
+#include <talloc.h>
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/data_blob.h"
+
+/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZXHUFF_DEBUG_FILES false
+
+/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
+#define LZXHUFF_DEBUG_VERBOSE false
+
+
+#if LZXHUFF_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+#else
+#define debug_message(...) /* debug_message */
+#endif
+
+
+struct lzx_pair {
+ const char *name;
+ DATA_BLOB compressed;
+ DATA_BLOB decompressed;
+};
+
+struct lzx_file_pair {
+ const char *name;
+ const char *compressed_file;
+ const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/lzxpress-huffman/decompressed"
+#define COMP_DIR "testdata/compression/lzxpress-huffman/compressed"
+#define MORE_COMP_DIR "testdata/compression/lzxpress-huffman/more-compressed"
+
+
+#define VARRGH(...) __VA_ARGS__
+
+#define BLOB_FROM_ARRAY(...) \
+ { \
+ .data = (uint8_t[]){__VA_ARGS__}, \
+ .length = sizeof((uint8_t[]){__VA_ARGS__}) \
+ }
+
+#define BLOB_FROM_STRING(s) \
+ { \
+ .data = discard_const_p(uint8_t, s), \
+ .length = (sizeof(s) - 1) \
+ }
+
+
+const char * file_names[] = {
+ "27826-8.txt",
+ "5d049b4cb1bd933f5e8ex19",
+ "638e61e96d54279981c3x5",
+ "64k-minus-one-zeros",
+ "64k-plus-one-zeros",
+ "64k-zeros",
+ "96f696a4e5ce56c61a3dx10",
+ "9e0b6a12febf38e98f13",
+ "abc-times-101",
+ "abc-times-105",
+ "abc-times-200",
+ "and_rand",
+ "and_rand-128k+",
+ "b63289ccc7f218c0d56b",
+ "beta-variate1-128k+",
+ "beta-variate2-128k+",
+ "beta-variate3-128k+",
+ "decayed_alphabet_128k+",
+ "decayed_alphabet_64k",
+ "exp_shuffle",
+ "exp_shuffle-128k+",
+ "f00842317dc6d5695b02",
+ "fib_shuffle",
+ "fib_shuffle-128k+",
+ "fuzzing-0fc2d461b56cd8103c91",
+ "fuzzing-17c961778538cc10ab7c",
+ "fuzzing-3591f9dc02bb00a54b60",
+ "fuzzing-3ec3bca27bb9eb40c128",
+ "fuzzing-80b4fa18ff5f8dd04862",
+ "fuzzing-a3115a81d1ac500318f9",
+ "generate-windows-test-vectors.c",
+ "midsummer-nights-dream.txt",
+ "notes-on-the-underground.txt",
+ "pg22009.txt",
+ "repeating",
+ "repeating-exactly-64k",
+ "setup.log",
+ "skewed_choices",
+ "skewed_choices-128k+",
+ /* These ones were deathly slow in fuzzing at one point */
+ "slow-015ddc36a71412ccc50d",
+ "slow-100e9f966a7feb9ca40a",
+ "slow-2a671c3cff4f1574cbab",
+ "slow-33d90a24e70515b14cd0",
+ "slow-49d8c05261e3f412fc72",
+ "slow-50a249d2fe56873e56a0",
+ "slow-63e9f0b52235fb0129fa",
+ "slow-73b7f971d65908ac0095",
+ "slow-8b61e3dd267908544531",
+ "slow-9d1c5a079b0462986f1f",
+ "slow-aa7262a821dabdcf04a6",
+ "slow-b8a91d142b0d2af7f5ca",
+ "slow-c79142457734bbc8d575",
+ "slow-d736544545b90d83fe75",
+ "slow-e3b9bdfaed7d1a606fdb",
+ "slow-f3f1c02a9d006e5e1703",
+ "square_series",
+ "square_series-128k+",
+ "trigram_128k+",
+ "trigram_64k",
+ "trigram_sum_128k+",
+ "trigram_sum_64k",
+ NULL
+};
+
+struct lzx_pair bidirectional_pairs[] = {
+
+ {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
+ .decompressed = BLOB_FROM_STRING(
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+ ),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
+ * symbols occur. That means we need 48 0x00 bytes for the
+ * first 96 symbol-nybbles, then some short codes, then zeros
+ * again for the rest of the literals.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x30, 0x23, /* a_ cb */
+ 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
+ 0x02, /* 'EOF' symbol 256 (byte 128 low) */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 bytes */
+ 0,0,0,
+ 0x20, /* codepoint 287 (byte 143 high) */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0, /* 160 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+ /*
+ * So that's the tree.
+ *
+ * length 2 codes for 'c', EOF, 287
+ * length 3 for 'a', 'b'.
+ *
+ * c: 00
+ * EOF: 01
+ * 287: 10
+ * a: 110
+ * b: 111
+ *
+ * thus the literal string "abc" is 110-111-00.
+ *
+ * Now for the lz77 match definitions for EOF and 287.
+ *
+ * Why 287? It encodes the match distance and offset.
+ *
+ * 287 - 256 = 31
+ *
+ * _length = 31 % 16 = 15
+ * _distance = 31 / 16 = 1
+ *
+ * (it's easier to look at the hex, 0x11f:
+ * 1xx means a match; x1x is _distance; xxf is _length)
+ *
+ * _distance 1 means a two bit distance (10 or 11; 2 or 3).
+ * That means the next bit will be the least significant bit
+ * of distance (1 in this case, meaning distance 3).
+ *
+ * if _length is 15, real length is included inline.
+ *
+ * 'EOF' == 256 means _length = 0, _distance = 0.
+ *
+ * _distance 0 means 1, so no further bits needed.
+ * _length 0 means length 3.
+ *
+ * but when used as EOF, this doesn't matter.
+ */
+ 0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
+ /* These remaining bytes are:
+ *
+ * 10101000 11011100 00000000 00000000 11111111
+ * 00100110 00000001
+ *
+ * and we read them as 16 chunks (i.e. flipping odd/even order)
+ *
+ * 110-111-00 10-1-01-000
+ * a b c 287 | EOF
+ * |
+ * this is the 287 distance low bit.
+ *
+ * The last 3 bits are not used. The 287 length is sort of
+ * out of band, coming up soon (because 287 encodes length
+ * 15; most codes do not do this).
+ *
+ * 00000000 00000000
+ *
+ * This padding is there because the first 32 bits are read
+ * at the beginning of decoding. If there were more things to
+ * be encoded, they would be in here.
+ *
+ * 11111111
+ *
+ * This byte is pulled as the length for the 287 match.
+ * Because it is 0xff, we pull a further 2 bytes for the
+ * actual length, i.e. a 16 bit number greater than 270.
+ *
+ * 00000001 00100110
+ *
+ * that is 0x126 = 294 = the match length - 3 (because we're
+ * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
+ *
+ */
+ )
+ },
+ {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
+ .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
+ .compressed = BLOB_FROM_ARRAY(
+ /*
+ * In this case there are no matches encoded as there are no
+ * repeated symbols. Including the EOF, there are 27 symbols
+ * all occuring exactly as frequently as each other (once).
+ * From that we would expect the codes to be mostly 5 bits
+ * long, because 27 < 2^5 (32), but greater than 2^4. And
+ * that's what we see.
+ */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ /* 14 non-zero bytes for 26 letters/nibbles */
+ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+ 0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0, /* 80 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,
+ 0x04, /* 0x100 EOF */
+ /* no matches */
+ 0,0,0,0,0, 0,0,0,0,0, 0, /* 140 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
+ 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+
+ 0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
+ 0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
+ 0x00, 0x00, 0x00, 0x00)
+ },
+ {0}
+};
+
+
+static void test_lzxpress_huffman_decompress(void **state)
+{
+ size_t i;
+ ssize_t written;
+ uint8_t *dest = NULL;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+ struct lzx_pair p = bidirectional_pairs[i];
+ dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
+
+ debug_message("%s compressed %zu decomp %zu\n", p.name,
+ p.compressed.length,
+ p.decompressed.length);
+
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ assert_int_equal(written, p.decompressed.length);
+
+ assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
+ talloc_free(dest);
+ }
+}
+
+
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+ const char *filename)
+{
+ DATA_BLOB b = {0};
+ FILE *fh = fopen(filename, "rb");
+ int ret;
+ struct stat s;
+ size_t len;
+ if (fh == NULL) {
+ debug_message("could not open '%s'\n", filename);
+ return b;
+ }
+ ret = fstat(fileno(fh), &s);
+ if (ret != 0) {
+ fclose(fh);
+ return b;
+ }
+ b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+ if (b.data == NULL) {
+ fclose(fh);
+ return b;
+ }
+ len = fread(b.data, 1, s.st_size, fh);
+ if (ferror(fh) || len != s.st_size) {
+ TALLOC_FREE(b.data);
+ } else {
+ b.length = len;
+ }
+ fclose(fh);
+ return b;
+}
+
+
+static void test_lzxpress_huffman_decompress_files(void **state)
+{
+ size_t i;
+ int score = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.compressed.data);
+
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ if (written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%zu correct\n", score, i);
+ assert_int_equal(score, i);
+}
+
+
+static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
+{
+ /*
+ * This tests the decompression of files that have been compressed on
+ * Windows with the level turned up (to 1, default for MS-XCA is 0).
+ *
+ * The format is identical, but it will have tried harder to find
+ * matches.
+ */
+ size_t i;
+ int score = 0;
+ int found = 0;
+ TALLOC_CTX *mem_ctx = talloc_new(NULL);
+ for (i = 0; file_names[i] != NULL; i++) {
+ char filename[200];
+ uint8_t *dest = NULL;
+ ssize_t written;
+ TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+ struct lzx_pair p = {
+ .name = file_names[i]
+ };
+
+ debug_message("%s\n", p.name);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.decomp", DECOMP_DIR, p.name);
+
+ p.decompressed = datablob_from_file(tmp_ctx, filename);
+ assert_non_null(p.decompressed.data);
+
+ snprintf(filename, sizeof(filename),
+ "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
+
+ p.compressed = datablob_from_file(tmp_ctx, filename);
+ if (p.compressed.data == NULL) {
+ /*
+ * We don't have all the vectors in the
+ * more-compressed directory, which is OK, we skip
+ * them.
+ */
+ continue;
+ }
+ found++;
+ dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+ written = lzxpress_huffman_decompress(p.compressed.data,
+ p.compressed.length,
+ dest,
+ p.decompressed.length);
+ if (written == p.decompressed.length &&
+ memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+ debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+ score++;
+ } else {
+ debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+ p.name);
+ debug_message("size %zd vs reference %zu\n",
+ written, p.decompressed.length);
+ }
+ talloc_free(tmp_ctx);
+ }
+ debug_message("%d/%d correct\n", score, found);
+ assert_int_equal(score, found);
+}
+
+
+static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
+{
+ /*
+ * We expect these to fail with a -1, except the last one.
+ */
+ ssize_t ret;
+ const uint8_t *input = bidirectional_pairs[0].compressed.data;
+ size_t ilen = bidirectional_pairs[0].compressed.length;
+ size_t olen = bidirectional_pairs[0].decompressed.length;
+ uint8_t output[olen];
+
+ ret = lzxpress_huffman_decompress(input, 0, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_decompress(input, ilen, output, 0);
+ assert_int_equal(ret, -1LL);
+
+ ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
+ assert_int_equal(ret, -1LL);
+ ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
+ assert_int_equal(ret, -1LL);
+
+ ret = lzxpress_huffman_decompress(input, ilen, output, olen);
+ assert_int_equal(ret, olen);
+}
+
+
+int main(void) {
+ const struct CMUnitTest tests[] = {
+ cmocka_unit_test(test_lzxpress_huffman_decompress_files),
+ cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
+ cmocka_unit_test(test_lzxpress_huffman_decompress),
+ cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
+ };
+ if (!isatty(1)) {
+ cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+ }
+
+ return cmocka_run_group_tests(tests, NULL, NULL);
+}