lib/compression: add LZ77 + Huffman decompression

author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>

Thu, 17 Nov 2022 01:24:52 +0000 (14:24 +1300)

committer Joseph Sutton <jsutton@samba.org>

Thu, 1 Dec 2022 22:56:39 +0000 (22:56 +0000)
author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Thu, 17 Nov 2022 01:24:52 +0000 (14:24 +1300)
committer Joseph Sutton <jsutton@samba.org>
Thu, 1 Dec 2022 22:56:39 +0000 (22:56 +0000)
diff --git a/lib/compression/lzxpress_huffman.c b/lib/compression/lzxpress_huffman.c

new file mode 100644 (file)

index 0000000..105b1bd
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.c
@@ -0,0 +1,656 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *        and Joseph Sutton   <josephsutton@catalyst.net.nz>
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <talloc.h>
+
+#include "replace.h"
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/byteorder.h"
+#include "lib/util/bytearray.h"
+
+
+#define LZXPRESS_ERROR -1LL
+
+
+struct bitstream {
+       const uint8_t *bytes;
+       size_t byte_pos;
+       size_t byte_size;
+       uint32_t bits;
+       int remaining_bits;
+       uint16_t *table;
+};
+
+
+/**
+ * Determines the sort order of one prefix_code_symbol relative to another
+ */
+static int compare_uint16(const uint16_t *a, const uint16_t *b)
+{
+       if (*a < *b) {
+               return -1;
+       }
+       if (*a > *b) {
+               return 1;
+       }
+       return 0;
+}
+
+
+static bool fill_decomp_table(struct bitstream *input)
+{
+       /*
+        * There are 512 symbols, each encoded in 4 bits, which indicates
+        * their depth in the Huffman tree. The even numbers get the lower
+        * nibble of each byte, so that the byte hex values look backwards
+        * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
+        * order of appearance, per depth.
+        *
+        * For example, if the first two bytes were:
+        *
+        * 0x23 0x53
+        *
+        * the first four codes have the lengths 3, 2, 3, 5.
+        * Let's call them A, B, C, D.
+        *
+        * Suppose there is no other codeword with length 1 (which is
+        * necessarily true in this example) or 2, but there might be others
+        * of length 3 or 4. Then we can say this about the codes:
+        *
+        *        _ --*--_
+        *      /          \
+        *     0           1
+        *    / \         / \
+        *   0   1       0   1
+        *  B    |\     / \  |\
+        *       0 1   0   1 0 1
+        *       A C   |\ /| | |\
+        *
+        * pos bits  code
+        * A    3    010
+        * B    2    00
+        * C    3    011
+        * D    5    1????
+        *
+        * B has the shortest code, so takes the leftmost branch, 00. That
+        * ends the branch -- nothing else can start with 00. There are no
+        * more 2s, so we look at the 3s, starting as far left as possible. So
+        * A takes 010 and C takes 011. That means everything else has to
+        * start with 1xx. We don't know how many codewords of length 3 or 4
+        * there are; if there are none, D would end up with 10000, the
+        * leftmost available code of length 5. If the compressor is any good,
+        * there should be no unused leaf nodes left dangling at the end.
+        *
+        * (this is "Canonical Huffman Coding").
+        *
+        *
+        * But what symbols do these codes actually stand for?
+        * --------------------------------------------------
+        *
+        * Good question. The first 256 codes stand for the corresponding
+        * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
+        * which have a distance and a length, encoded in a strange way that
+        * isn't entirely the purview of this function.
+        *
+        * What does the value 0 mean?
+        * ---------------------------
+        *
+        * The code does not occur. For example, if the next byte in the
+        * example above was 0x07, that would give the byte 0x04 a 7-long
+        * code, and no code to the 0x05 byte, which means we there is no way
+        * we going to see a 5 in the decoded stream.
+        *
+        * Isn't LZ77 + Huffman what zip/gzip/zlib do?
+        * -------------------------------------------
+        *
+        * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
+        */
+       uint16_t symbols[512];
+       uint16_t sort_mem[512];
+       size_t i, n_symbols;
+       ssize_t code;
+       uint16_t len, prev_len;
+       const uint8_t *table_bytes = input->bytes + input->byte_pos;
+
+       if (input->byte_pos + 260 > input->byte_size) {
+               return false;
+       }
+
+       n_symbols = 0;
+       for (i = 0; i < 256; i++) {
+               uint16_t even = table_bytes[i] & 15;
+               uint16_t odd = table_bytes[i] >> 4;
+               if (even != 0) {
+                       symbols[n_symbols] = (even << 9) + i * 2;
+                       n_symbols++;
+               }
+               if (odd != 0) {
+                       symbols[n_symbols] = (odd << 9) + i * 2 + 1;
+                       n_symbols++;
+               }
+       }
+       input->byte_pos += 256;
+       if (n_symbols == 0) {
+               return false;
+       }
+
+       stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
+                   (samba_compare_fn_t)compare_uint16);
+
+       /*
+        * we're using an implicit binary tree, as you'd see in a heap.
+        * table[0] = unused
+        * table[1] = '0'
+        * table[2] = '1'
+        * table[3] = '00'     <-- '00' and '01' are children of '0'
+        * table[4] = '01'     <-- '0' is [0], children are [0 * 2 + {1,2}]
+        * table[5] = '10'
+        * table[6] = '11'
+        * table[7] = '000'
+        * table[8] = '001'
+        * table[9] = '010'
+        * table[10]= '011'
+        * table[11]= '100
+        *'
+        * table[1 << n - 1] = '0' * n
+        * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
+        * table[1 << n - 2] = '1' * (n - 1)
+        *
+        * table[i]->left =  table[i*2 + 1]
+        * table[i]->right = table[i*2 + 2]
+        * table[0xffff] = unused (16 '0's, max len is 15)
+        *
+        * therefore e.g. table[70] = table[64     - 1 + 7]
+        *                          = table[1 << 6 - 1 + 7]
+        *                          = '000111' (binary 7, widened to 6 bits)
+        *
+        *   and if '000111' is a code,
+        *   '00011', '0001', '000', '00', '0' are unavailable prefixes.
+        *       34      16      7     3    1  are their indices
+        *   and (i - 1) >> 1 is the path back from 70 through these.
+        *
+        * the lookup is
+        *
+        * 1 start with i = 0
+        * 2 extract a symbol bit (i = (i << 1) + bit + 1)
+        * 3 is table[i] == 0xffff?
+        * 4  yes -- goto 2
+        * 4  table[i] & 511 is the symbol, stop
+        *
+        * and the construction (here) is sort of the reverse.
+        *
+        * Most of this table is free space that can never be reached, and
+        * most of the activity is at the beginning (since all codes start
+        * there, and by design the shortest codes are the most common).
+        */
+       for (i = 0; i < 32; i++) {
+               /* prefill the table head */
+               input->table[i] = 0xffff;
+       }
+       code = -1;
+       prev_len = 0;
+       for (i = 0; i < n_symbols; i++) {
+               uint16_t s = symbols[i];
+               uint16_t prefix;
+               len = (s >> 9) & 15;
+               s &= 511;
+               code++;
+               while (len != prev_len) {
+                       code <<= 1;
+                       code++;
+                       prev_len++;
+               }
+
+               if (code >= 65535) {
+                       return false;
+               }
+               input->table[code] = s;
+               for(prefix = (code - 1) >> 1;
+                   prefix > 31;
+                   prefix = (prefix - 1) >> 1) {
+                       input->table[prefix] = 0xffff;
+               }
+       }
+
+       /*
+        * check that the last code encodes 11111..., with right number of
+        * ones, pointing to the right symbol -- otherwise we have a dangling
+        * uninitialised symbol.
+        */
+       if (code != (1 << (len + 1)) - 2) {
+               return false;
+       }
+       return true;
+}
+
+
+#define CHECK_READ_32(dest)                                      \
+       do {                                                      \
+               if (input->byte_pos + 4 > input->byte_size) {     \
+                       return LZXPRESS_ERROR;                     \
+               }                                                  \
+               dest = PULL_LE_U32(input->bytes, input->byte_pos); \
+               input->byte_pos += 4;                              \
+       } while (0)
+
+#define CHECK_READ_16(dest)                                      \
+       do {                                                      \
+               if (input->byte_pos + 2 > input->byte_size) {     \
+                       return LZXPRESS_ERROR;                     \
+               }                                                  \
+               dest = PULL_LE_U16(input->bytes, input->byte_pos); \
+               input->byte_pos += 2;                              \
+       } while (0)
+
+#define CHECK_READ_8(dest) \
+       do {                                                            \
+               if (input->byte_pos >= input->byte_size) {              \
+                       return LZXPRESS_ERROR;                          \
+               }                                                       \
+               dest = PULL_LE_U8(input->bytes, input->byte_pos);       \
+               input->byte_pos++;                                      \
+       } while(0)
+
+
+static inline ssize_t pull_bits(struct bitstream *input)
+{
+       if (input->byte_pos + 1 < input->byte_size) {
+               uint16_t tmp;
+               CHECK_READ_16(tmp);
+               input->remaining_bits += 16;
+               input->bits <<= 16;
+               input->bits |= tmp;
+       } else if (input->byte_pos < input->byte_size) {
+               uint8_t tmp;
+               CHECK_READ_8(tmp);
+               input->remaining_bits += 8;
+               input->bits <<= 8;
+               input->bits |= tmp;
+       } else {
+               return LZXPRESS_ERROR;
+       }
+       return 0;
+}
+
+
+/*
+ * Decompress a block. The actual decompressed size is returned (or -1 on
+ * error). The putative block length is 64k (or shorter, if the message ends
+ * first), but a match can run over the end, extending the block. That's why
+ * we need the overall output size as well as the block size. A match encoded
+ * in this block can point back to previous blocks, but not before the
+ * beginning of the message, so we also need the previously decoded size.
+ *
+ * The compressed block will have 256 bytes for the Huffman table, and at
+ * least 4 bytes of (possibly padded) encoded values.
+ */
+static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
+                                           uint8_t *output,
+                                           size_t block_size,
+                                           size_t output_size,
+                                           size_t previous_size)
+{
+       size_t output_pos = 0;
+       uint16_t symbol;
+       size_t index;
+       uint16_t distance_bits_wanted = 0;
+       size_t distance = 0;
+       size_t length = 0;
+       bool ok;
+       uint32_t tmp;
+       bool seen_eof_marker = false;
+
+       ok = fill_decomp_table(input);
+       if (! ok) {
+               return LZXPRESS_ERROR;
+       }
+
+       /*
+        * Always read 32 bits at the start, even if we don't need them.
+        */
+       CHECK_READ_16(tmp);
+       CHECK_READ_16(input->bits);
+       input->bits |= tmp << 16;
+       input->remaining_bits = 32;
+
+       /*
+        * This loop iterates over individual *bits*. These are read from
+        * little-endian 16 bit words, most significant bit first.
+        *
+        * At points in the bitstream, the following are possible:
+        *
+        * # the source word is empty and needs to be refilled from the input
+        *    stream.
+        * # an incomplete codeword is being extended.
+        * # a codeword is resolved, either as a literal or a match.
+        * # a literal is written.
+        * # a match is collecting distance bits.
+        * # the output stream is copied, as specified by a match.
+        * # input bytes are read for match lengths.
+        *
+        * Note that we *don't* specifically check for the EOF marker (symbol
+        * 256) in this loop, because the a precondition for stopping for the
+        * EOF marker is that the output buffer is full (otherwise, you
+        * wouldn't know which 256 is EOF, rather than an actual symbol), and
+        * we *always* want to stop when the buffer is full. So we work out if
+        * there is an EOF in in another loop after we stop writing.
+        */
+
+       index = 0;
+       while (output_pos < block_size) {
+               uint16_t b;
+               if (input->remaining_bits == 16) {
+                       ssize_t ret = pull_bits(input);
+                       if (ret) {
+                               return ret;
+                       }
+               }
+               input->remaining_bits--;
+
+               b = (input->bits >> input->remaining_bits) & 1;
+               if (length == 0) {
+                       /* not in a match; pulling a codeword */
+                       index <<= 1;
+                       index += b + 1;
+                       if (input->table[index] == 0xffff) {
+                               /* incomplete codeword, the common case */
+                               continue;
+                       }
+                       /* found the symbol, reset the code string */
+                       symbol = input->table[index] & 511;
+                       index = 0;
+                       if (symbol < 256) {
+                               /* a literal, the easy case */
+                               output[output_pos] = symbol;
+                               output_pos++;
+                               continue;
+                       }
+
+                       /* the beginning of a match */
+                       distance_bits_wanted = (symbol >> 4) & 15;
+                       distance = 1 << distance_bits_wanted;
+                       length = symbol & 15;
+                       if (length == 15) {
+                               CHECK_READ_8(tmp);
+                               length += tmp;
+                               if (length == 255 + 15) {
+                                       /*
+                                        * note, we discard (don't add) the
+                                        * length so far.
+                                        */
+                                       CHECK_READ_16(length);
+                                       if (length == 0) {
+                                               CHECK_READ_32(length);
+                                       }
+                               }
+                       }
+                       length += 3;
+               } else {
+                       /* we are pulling extra distance bits */
+                       distance_bits_wanted--;
+                       distance |= b << distance_bits_wanted;
+               }
+
+               if (distance_bits_wanted == 0) {
+                       /*
+                        * We have a complete match, and it is time to do the
+                        * copy (byte by byte, because the ranges can overlap,
+                        * and we might need to copy bytes we just copied in).
+                        *
+                        * It is possible that this match will extend beyond
+                        * the end of the expected block. That's fine, so long
+                        * as it doesn't extend past the total output size.
+                        */
+                       size_t end = output_pos + length;
+                       if (end > output_size ||
+                           previous_size + output_pos < distance ||
+                           unlikely(end < output_pos)) {
+                               return LZXPRESS_ERROR;
+                       }
+
+                       for (; output_pos < end; output_pos++) {
+                               output[output_pos] = \
+                                       output[output_pos - distance];
+                       }
+                       distance = 0;
+                       length = 0;
+               }
+       }
+
+       if (length != 0 || index != 0) {
+               /* it seems like we've hit an early end, mid-code */
+               return LZXPRESS_ERROR;
+       }
+
+       if (input->byte_pos + 256 < input->byte_size) {
+               /*
+                * This block is over, but it clearly isn't the last block, so
+                * we don't want to look for the EOF.
+                */
+               return output_pos;
+       }
+       /*
+        * We won't write any more, but we try to read some more to make sure
+        * we're finishing in a good place. That means we want to see a 256
+        * symbol and then some number of zeroes, possibly zero, but as many
+        * as 32.
+        *
+        * In this we are perhaps a bit stricter than Windows, which
+        * apparently does not insist on the EOF marker, nor on a lack of
+        * trailing bytes.
+        */
+       while (true) {
+               uint16_t b;
+               if (input->remaining_bits == 16) {
+                       ssize_t ret;
+                       if (input->byte_pos == input->byte_size) {
+                               /* FIN */
+                               break;
+                       }
+                       ret = pull_bits(input);
+                       if (ret) {
+                               return ret;
+                       }
+               }
+               input->remaining_bits--;
+               b = (input->bits >> input->remaining_bits) & 1;
+               if (seen_eof_marker) {
+                       /*
+                        * we have read an EOF symbols. Now we just want to
+                        * see zeroes.
+                        */
+                       if (b != 0) {
+                               return LZXPRESS_ERROR;
+                       }
+                       continue;
+               }
+
+               /* we're pulling in a symbol, which had better be 256 */
+               index <<= 1;
+               index += b + 1;
+               if (input->table[index] == 0xffff) {
+                       continue;
+               }
+
+               symbol = input->table[index] & 511;
+               if (symbol != 256) {
+                       return LZXPRESS_ERROR;
+               }
+               seen_eof_marker = true;
+               continue;
+       }
+
+       if (! seen_eof_marker) {
+               return LZXPRESS_ERROR;
+       }
+
+       return output_pos;
+}
+
+static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
+                                                   uint8_t *output,
+                                                   size_t output_size)
+{
+       size_t output_pos = 0;
+
+       if (input->byte_size < 260) {
+               return LZXPRESS_ERROR;
+       }
+
+       while (input->byte_pos < input->byte_size) {
+               ssize_t block_output_pos;
+               ssize_t block_output_size;
+               size_t remaining_output_size = output_size - output_pos;
+
+               block_output_size = MIN(65536, remaining_output_size);
+
+               block_output_pos = lzx_huffman_decompress_block(
+                       input,
+                       output + output_pos,
+                       block_output_size,
+                       remaining_output_size,
+                       output_pos);
+
+               if (block_output_pos < block_output_size) {
+                       return LZXPRESS_ERROR;
+               }
+               output_pos += block_output_pos;
+               if (output_pos > output_size) {
+                       /* not expecting to get here. */
+                       return LZXPRESS_ERROR;
+               }
+       }
+
+       if (input->byte_pos != input->byte_size) {
+               return LZXPRESS_ERROR;
+       }
+
+       return output_pos;
+}
+
+
+/*
+ * lzxpress_huffman_decompress()
+ *
+ * output_size must be the expected length of the decompressed data.
+ * input_size and output_size are limited to the minimum of UINT32_MAX and
+ * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
+ *
+ * @param input_bytes  memory to be decompressed.
+ * @param input_size   length of the compressed buffer.
+ * @param output       destination for the decompressed data.
+ * @param output_size  exact expected length of the decompressed data.
+ *
+ * @return the number of bytes written or -1 on error.
+ */
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
+                                   size_t input_size,
+                                   uint8_t *output,
+                                   size_t output_size)
+{
+       uint16_t table[65536];
+       struct bitstream input = {
+               .bytes = input_bytes,
+               .byte_size = input_size,
+               .byte_pos = 0,
+               .bits = 0,
+               .remaining_bits = 0,
+               .table = table
+       };
+
+       if (input_size > SSIZE_MAX ||
+           input_size > UINT32_MAX ||
+           output_size > SSIZE_MAX ||
+           output_size > UINT32_MAX ||
+           input_size == 0 ||
+           output_size == 0 ||
+           input_bytes == NULL ||
+           output == NULL) {
+               /*
+                * We use negative ssize_t to return errors, which is limiting
+                * on 32 bit machines, and the 4GB limit exists on Windows.
+                */
+               return  LZXPRESS_ERROR;
+       }
+
+       return lzxpress_huffman_decompress_internal(&input,
+                                                   output,
+                                                   output_size);
+}
+
+
+/**
+ * lzxpress_huffman_decompress_talloc()
+ *
+ * The caller must provide the exact size of the expected output.
+ *
+ * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
+ * output_size is limited to 256MB due to a limit in talloc. This effectively
+ * limits input_size too, as non-crafted compressed data will not exceed the
+ * decompressed size by very much.
+ *
+ * @param mem_ctx      TALLOC_CTX parent for the decompressed buffer.
+ * @param input_bytes  memory to be decompressed.
+ * @param input_size   length of the compressed buffer.
+ * @param output_size  expected decompressed size.
+ *
+ * @return a talloc'ed buffer exactly output_size in length, or NULL.
+ */
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+                                           const uint8_t *input_bytes,
+                                           size_t input_size,
+                                           size_t output_size)
+{
+       ssize_t result;
+       uint8_t *output = NULL;
+       struct bitstream input = {
+               .bytes = input_bytes,
+               .byte_size = input_size
+       };
+
+       output = talloc_array(mem_ctx, uint8_t, output_size);
+       if (output == NULL) {
+               return NULL;
+       }
+
+       input.table = talloc_array(mem_ctx, uint16_t, 65536);
+       if (input.table == NULL) {
+               talloc_free(output);
+               return NULL;
+       }
+       result = lzxpress_huffman_decompress_internal(&input,
+                                                     output,
+                                                     output_size);
+       talloc_free(input.table);
+
+       if (result != output_size) {
+               talloc_free(output);
+               return NULL;
+       }
+       return output;
+}
diff --git a/lib/compression/lzxpress_huffman.h b/lib/compression/lzxpress_huffman.h

new file mode 100644 (file)

index 0000000..c484181
--- /dev/null
+++ b/lib/compression/lzxpress_huffman.h
@@ -0,0 +1,38 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HAVE_LZXPRESS_HUFFMAN_H
+#define HAVE_LZXPRESS_HUFFMAN_H
+
+
+ssize_t lzxpress_huffman_decompress(const uint8_t *input,
+                                   size_t input_size,
+                                   uint8_t *output,
+                                   size_t max_output_size);
+
+uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
+                                           const uint8_t *input_bytes,
+                                           size_t input_size,
+                                           size_t output_size);
+
+
+#endif /* HAVE_LZXPRESS_HUFFMAN_H */
diff --git a/lib/compression/tests/test_lzx_huffman.c b/lib/compression/tests/test_lzx_huffman.c

new file mode 100644 (file)

index 0000000..43275d2
--- /dev/null
+++ b/lib/compression/tests/test_lzx_huffman.c
@@ -0,0 +1,514 @@
+/*
+ * Samba compression library - LGPLv3
+ *
+ * Copyright © Catalyst IT 2022
+ *
+ * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
+ *
+ *  ** NOTE! The following LGPL license applies to this file.
+ *  ** It does NOT imply that all of Samba is released under the LGPL
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 3 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <setjmp.h>
+#include <cmocka.h>
+#include <stdbool.h>
+#include "replace.h"
+#include <talloc.h>
+#include "lzxpress_huffman.h"
+#include "lib/util/stable_sort.h"
+#include "lib/util/data_blob.h"
+
+/* set LZXHUFF_DEBUG_FILES to true to save round-trip files in /tmp. */
+#define LZXHUFF_DEBUG_FILES false
+
+/* set LZXHUFF_DEBUG_VERBOSE to true to print more. */
+#define LZXHUFF_DEBUG_VERBOSE false
+
+
+#if LZXHUFF_DEBUG_VERBOSE
+#define debug_message(...) print_message(__VA_ARGS__)
+#else
+#define debug_message(...) /* debug_message */
+#endif
+
+
+struct lzx_pair {
+       const char *name;
+       DATA_BLOB compressed;
+       DATA_BLOB decompressed;
+};
+
+struct lzx_file_pair {
+       const char *name;
+       const char *compressed_file;
+       const char *decompressed_file;
+};
+
+
+#define DECOMP_DIR "testdata/compression/lzxpress-huffman/decompressed"
+#define COMP_DIR "testdata/compression/lzxpress-huffman/compressed"
+#define MORE_COMP_DIR "testdata/compression/lzxpress-huffman/more-compressed"
+
+
+#define VARRGH(...) __VA_ARGS__
+
+#define BLOB_FROM_ARRAY(...)                             \
+       {                                                \
+               .data = (uint8_t[]){__VA_ARGS__},          \
+               .length = sizeof((uint8_t[]){__VA_ARGS__}) \
+       }
+
+#define BLOB_FROM_STRING(s)                                      \
+       {                                                            \
+               .data = discard_const_p(uint8_t, s),                 \
+               .length = (sizeof(s) - 1)                    \
+       }
+
+
+const char * file_names[] = {
+       "27826-8.txt",
+       "5d049b4cb1bd933f5e8ex19",
+       "638e61e96d54279981c3x5",
+       "64k-minus-one-zeros",
+       "64k-plus-one-zeros",
+       "64k-zeros",
+       "96f696a4e5ce56c61a3dx10",
+       "9e0b6a12febf38e98f13",
+       "abc-times-101",
+       "abc-times-105",
+       "abc-times-200",
+       "and_rand",
+       "and_rand-128k+",
+       "b63289ccc7f218c0d56b",
+       "beta-variate1-128k+",
+       "beta-variate2-128k+",
+       "beta-variate3-128k+",
+       "decayed_alphabet_128k+",
+       "decayed_alphabet_64k",
+       "exp_shuffle",
+       "exp_shuffle-128k+",
+       "f00842317dc6d5695b02",
+       "fib_shuffle",
+       "fib_shuffle-128k+",
+       "fuzzing-0fc2d461b56cd8103c91",
+       "fuzzing-17c961778538cc10ab7c",
+       "fuzzing-3591f9dc02bb00a54b60",
+       "fuzzing-3ec3bca27bb9eb40c128",
+       "fuzzing-80b4fa18ff5f8dd04862",
+       "fuzzing-a3115a81d1ac500318f9",
+       "generate-windows-test-vectors.c",
+       "midsummer-nights-dream.txt",
+       "notes-on-the-underground.txt",
+       "pg22009.txt",
+       "repeating",
+       "repeating-exactly-64k",
+       "setup.log",
+       "skewed_choices",
+       "skewed_choices-128k+",
+       /* These ones were deathly slow in fuzzing at one point */
+       "slow-015ddc36a71412ccc50d",
+       "slow-100e9f966a7feb9ca40a",
+       "slow-2a671c3cff4f1574cbab",
+       "slow-33d90a24e70515b14cd0",
+       "slow-49d8c05261e3f412fc72",
+       "slow-50a249d2fe56873e56a0",
+       "slow-63e9f0b52235fb0129fa",
+       "slow-73b7f971d65908ac0095",
+       "slow-8b61e3dd267908544531",
+       "slow-9d1c5a079b0462986f1f",
+       "slow-aa7262a821dabdcf04a6",
+       "slow-b8a91d142b0d2af7f5ca",
+       "slow-c79142457734bbc8d575",
+       "slow-d736544545b90d83fe75",
+       "slow-e3b9bdfaed7d1a606fdb",
+       "slow-f3f1c02a9d006e5e1703",
+       "square_series",
+       "square_series-128k+",
+       "trigram_128k+",
+       "trigram_64k",
+       "trigram_sum_128k+",
+       "trigram_sum_64k",
+       NULL
+};
+
+struct lzx_pair bidirectional_pairs[] = {
+
+       {.name = "abc__100_repeats", /* [MS-XCA] 3.2 example 2. */
+        .decompressed = BLOB_FROM_STRING(
+                "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+                "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+                "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+                "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+                "abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc"
+                ),
+        .compressed = BLOB_FROM_ARRAY(
+                /*
+                 * The 'a', 'b', and 'c' bytes are 0x61, 0x62, 0x63. No other
+                 * symbols occur. That means we need 48 0x00 bytes for the
+                 * first 96 symbol-nybbles, then some short codes, then zeros
+                 * again for the rest of the literals.
+                 */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,
+                0x30, 0x23, /* a_ cb */
+                0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 100 bytes */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0, /* here end the 0-255 literals (128 bytes) */
+                0x02, /* 'EOF' symbol 256 (byte 128 low) */
+                0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 bytes */
+                0,0,0,
+                0x20, /* codepoint 287 (byte 143 high) */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,         /* 160 bytes */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 bytes */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+                /*
+                 * So that's the tree.
+                 *
+                 * length 2 codes for 'c', EOF, 287
+                 * length 3 for 'a', 'b'.
+                 *
+                 * c:   00
+                 * EOF: 01
+                 * 287: 10
+                 * a:   110
+                 * b:   111
+                 *
+                 * thus the literal string "abc" is 110-111-00.
+                 *
+                 * Now for the lz77 match definitions for EOF and 287.
+                 *
+                 * Why 287? It encodes the match distance and offset.
+                 *
+                 * 287 - 256 = 31
+                 *
+                 * _length = 31 % 16 = 15
+                 * _distance = 31 / 16 = 1
+                 *
+                 * (it's easier to look at the hex, 0x11f:
+                 * 1xx means a match; x1x is _distance; xxf is _length)
+                 *
+                 * _distance 1 means a two bit distance (10 or 11; 2 or 3).
+                 * That means the next bit will be the least significant bit
+                 * of distance (1 in this case, meaning distance 3).
+                 *
+                 * if _length is 15, real length is included inline.
+                 *
+                 * 'EOF' == 256 means _length = 0, _distance = 0.
+                 *
+                 * _distance 0 means 1, so no further bits needed.
+                 * _length 0 means length 3.
+                 *
+                 * but when used as EOF, this doesn't matter.
+                 */
+                0xa8, 0xdc, 0x00, 0x00, 0xff, 0x26, 0x01
+                 /* These remaining bytes are:
+                 *
+                 * 10101000 11011100 00000000 00000000 11111111
+                 * 00100110 00000001
+                 *
+                 * and we read them as 16 chunks (i.e. flipping odd/even order)
+                 *
+                 * 110-111-00  10-1-01-000
+                 *   a  b   c 287 | EOF
+                 *                |
+                 *                this is the 287 distance low bit.
+                 *
+                 * The last 3 bits are not used. The 287 length is sort of
+                 * out of band, coming up soon (because 287 encodes length
+                 * 15; most codes do not do this).
+                 *
+                 * 00000000 00000000
+                 *
+                 * This padding is there because the first 32 bits are read
+                 * at the beginning of decoding. If there were more things to
+                 * be encoded, they would be in here.
+                 *
+                 * 11111111
+                 *
+                 * This byte is pulled as the length for the 287 match.
+                 * Because it is 0xff, we pull a further 2 bytes for the
+                 * actual length, i.e. a 16 bit number greater than 270.
+                 *
+                 * 00000001 00100110
+                 *
+                 * that is 0x126 = 294 = the match length - 3 (because we're
+                 * encoding ["abc", <copy from 3 back, 297 chars>, EOF]).
+                 *
+                 */
+                )
+       },
+       {.name = "abcdefghijklmnopqrstuvwxyz", /* [MS-XCA] 3.2 example 1. */
+        .decompressed = BLOB_FROM_STRING("abcdefghijklmnopqrstuvwxyz"),
+        .compressed = BLOB_FROM_ARRAY(
+                /*
+                 * In this case there are no matches encoded as there are no
+                 * repeated symbols. Including the EOF, there are 27 symbols
+                 * all occuring exactly as frequently as each other (once).
+                 * From that we would expect the codes to be mostly 5 bits
+                 * long, because 27 < 2^5 (32), but greater than 2^4. And
+                 * that's what we see.
+                 */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,
+                /* 14 non-zero bytes for 26 letters/nibbles */
+                0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+                0x55, 0x55, 0x55, 0x45, 0x44, 0x04,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,   /* 80 */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,
+                0x04,                                 /* 0x100 EOF */
+                /* no matches */
+                0,0,0,0,0, 0,0,0,0,0, 0,                    /* 140 */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, /* 240 */
+                0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,
+
+                0xd8, 0x52, 0x3e, 0xd7, 0x94, 0x11, 0x5b, 0xe9,
+                0x19, 0x5f, 0xf9, 0xd6, 0x7c, 0xdf, 0x8d, 0x04,
+                0x00, 0x00, 0x00, 0x00)
+       },
+       {0}
+};
+
+
+static void test_lzxpress_huffman_decompress(void **state)
+{
+       size_t i;
+       ssize_t written;
+       uint8_t *dest = NULL;
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       for (i = 0; bidirectional_pairs[i].name != NULL; i++) {
+               struct lzx_pair p = bidirectional_pairs[i];
+               dest = talloc_array(mem_ctx, uint8_t, p.decompressed.length);
+
+               debug_message("%s compressed %zu decomp %zu\n", p.name,
+                             p.compressed.length,
+                             p.decompressed.length);
+
+               written = lzxpress_huffman_decompress(p.compressed.data,
+                                                     p.compressed.length,
+                                                     dest,
+                                                     p.decompressed.length);
+               assert_int_equal(written, p.decompressed.length);
+
+               assert_memory_equal(dest, p.decompressed.data, p.decompressed.length);
+               talloc_free(dest);
+       }
+}
+
+
+static DATA_BLOB datablob_from_file(TALLOC_CTX *mem_ctx,
+                                   const char *filename)
+{
+       DATA_BLOB b = {0};
+       FILE *fh = fopen(filename, "rb");
+       int ret;
+       struct stat s;
+       size_t len;
+       if (fh == NULL) {
+               debug_message("could not open '%s'\n", filename);
+               return b;
+       }
+       ret = fstat(fileno(fh), &s);
+       if (ret != 0) {
+               fclose(fh);
+               return b;
+       }
+       b.data = talloc_array(mem_ctx, uint8_t, s.st_size);
+       if (b.data == NULL) {
+               fclose(fh);
+               return b;
+       }
+       len = fread(b.data, 1, s.st_size, fh);
+       if (ferror(fh) || len != s.st_size) {
+               TALLOC_FREE(b.data);
+       } else {
+               b.length = len;
+       }
+       fclose(fh);
+       return b;
+}
+
+
+static void test_lzxpress_huffman_decompress_files(void **state)
+{
+       size_t i;
+       int score = 0;
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       for (i = 0; file_names[i] != NULL; i++) {
+               char filename[200];
+               uint8_t *dest = NULL;
+               ssize_t written;
+               TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+               struct lzx_pair p = {
+                       .name = file_names[i]
+               };
+
+               debug_message("%s\n", p.name);
+
+               snprintf(filename, sizeof(filename),
+                        "%s/%s.decomp", DECOMP_DIR, p.name);
+
+               p.decompressed = datablob_from_file(tmp_ctx, filename);
+               assert_non_null(p.decompressed.data);
+
+               snprintf(filename, sizeof(filename),
+                        "%s/%s.lzhuff", COMP_DIR, p.name);
+
+               p.compressed = datablob_from_file(tmp_ctx, filename);
+               assert_non_null(p.compressed.data);
+
+               dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+               written = lzxpress_huffman_decompress(p.compressed.data,
+                                                     p.compressed.length,
+                                                     dest,
+                                                     p.decompressed.length);
+               if (written == p.decompressed.length &&
+                   memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+                       debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+                       score++;
+               } else {
+                       debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+                                     p.name);
+                       debug_message("size %zd vs reference %zu\n",
+                                     written, p.decompressed.length);
+               }
+               talloc_free(tmp_ctx);
+       }
+       debug_message("%d/%zu correct\n", score, i);
+       assert_int_equal(score, i);
+}
+
+
+static void test_lzxpress_huffman_decompress_more_compressed_files(void **state)
+{
+       /*
+        * This tests the decompression of files that have been compressed on
+        * Windows with the level turned up (to 1, default for MS-XCA is 0).
+        *
+        * The format is identical, but it will have tried harder to find
+        * matches.
+        */
+       size_t i;
+       int score = 0;
+       int found = 0;
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       for (i = 0; file_names[i] != NULL; i++) {
+               char filename[200];
+               uint8_t *dest = NULL;
+               ssize_t written;
+               TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+               struct lzx_pair p = {
+                       .name = file_names[i]
+               };
+
+               debug_message("%s\n", p.name);
+
+               snprintf(filename, sizeof(filename),
+                        "%s/%s.decomp", DECOMP_DIR, p.name);
+
+               p.decompressed = datablob_from_file(tmp_ctx, filename);
+               assert_non_null(p.decompressed.data);
+
+               snprintf(filename, sizeof(filename),
+                        "%s/%s.lzhuff", MORE_COMP_DIR, p.name);
+
+               p.compressed = datablob_from_file(tmp_ctx, filename);
+               if (p.compressed.data == NULL) {
+                       /*
+                        * We don't have all the vectors in the
+                        * more-compressed directory, which is OK, we skip
+                        * them.
+                        */
+                       continue;
+               }
+               found++;
+               dest = talloc_array(tmp_ctx, uint8_t, p.decompressed.length);
+
+               written = lzxpress_huffman_decompress(p.compressed.data,
+                                                     p.compressed.length,
+                                                     dest,
+                                                     p.decompressed.length);
+               if (written == p.decompressed.length &&
+                   memcmp(dest, p.decompressed.data, p.decompressed.length) == 0) {
+                       debug_message("\033[1;32mdecompressed %s!\033[0m\n", p.name);
+                       score++;
+               } else {
+                       debug_message("\033[1;31mfailed to decompress %s!\033[0m\n",
+                                     p.name);
+                       debug_message("size %zd vs reference %zu\n",
+                                     written, p.decompressed.length);
+               }
+               talloc_free(tmp_ctx);
+       }
+       debug_message("%d/%d correct\n", score, found);
+       assert_int_equal(score, found);
+}
+
+
+static void test_lzxpress_huffman_decompress_empty_or_null(void **state)
+{
+       /*
+        * We expect these to fail with a -1, except the last one.
+        */
+       ssize_t ret;
+       const uint8_t *input = bidirectional_pairs[0].compressed.data;
+       size_t ilen = bidirectional_pairs[0].compressed.length;
+       size_t olen = bidirectional_pairs[0].decompressed.length;
+       uint8_t output[olen];
+
+       ret = lzxpress_huffman_decompress(input, 0, output, olen);
+       assert_int_equal(ret, -1LL);
+       ret = lzxpress_huffman_decompress(input, ilen, output, 0);
+       assert_int_equal(ret, -1LL);
+
+       ret = lzxpress_huffman_decompress(NULL, ilen, output, olen);
+       assert_int_equal(ret, -1LL);
+       ret = lzxpress_huffman_decompress(input, ilen, NULL, olen);
+       assert_int_equal(ret, -1LL);
+
+       ret = lzxpress_huffman_decompress(input, ilen, output, olen);
+       assert_int_equal(ret, olen);
+}
+
+
+int main(void) {
+       const struct CMUnitTest tests[] = {
+               cmocka_unit_test(test_lzxpress_huffman_decompress_files),
+               cmocka_unit_test(test_lzxpress_huffman_decompress_more_compressed_files),
+               cmocka_unit_test(test_lzxpress_huffman_decompress),
+               cmocka_unit_test(test_lzxpress_huffman_decompress_empty_or_null),
+       };
+       if (!isatty(1)) {
+               cmocka_set_message_output(CM_OUTPUT_SUBUNIT);
+       }
+
+       return cmocka_run_group_tests(tests, NULL, NULL);
+}
diff --git a/lib/compression/wscript_build b/lib/compression/wscript_build

index 7ad533340de5266d8d1b2c43dcae1a073fe44846..df931bef3653efda7549e63e512c3b0ee813fc60 100644 (file)
--- a/lib/compression/wscript_build
+++ b/lib/compression/wscript_build
@@ -1,6 +1,13 @@
  #!/usr/bin/env python
  
  bld.SAMBA_SUBSYSTEM('LZXPRESS',
-        deps='replace',
-       source='lzxpress.c'
-       )
+                    deps='replace talloc stable_sort samba-debug',
+                    source='lzxpress.c lzxpress_huffman.c'
+                    )
+
+bld.SAMBA_BINARY('test_lzx_huffman',
+                 source='tests/test_lzx_huffman.c',
+                 deps=('cmocka replace LZXPRESS'
+                       ' samba-util'),
+                 local_include=False,
+                 for_selftest=True)
diff --git a/selftest/tests.py b/selftest/tests.py

index 4cc2e7f8918adbabde352276314c4d4d3b879678..c774f18af3ee3f138e14d46784248178e90bff11 100644 (file)
--- a/selftest/tests.py
+++ b/selftest/tests.py
@@ -478,3 +478,6 @@ plantestsuite("samba.unittests.gnutls_aead_aes_256_cbc_hmac_sha512", "none",
                [os.path.join(bindir(), "test_gnutls_aead_aes_256_cbc_hmac_sha512")])
  plantestsuite("samba.unittests.encode_decode", "none",
                [os.path.join(bindir(), "test_encode_decode")])
+
+plantestsuite("samba.unittests.compression.lzxpress_huffman", "none",
+              [os.path.join(bindir(), "default/lib/compression/test_lzx_huffman")])
author	Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
	Thu, 17 Nov 2022 01:24:52 +0000 (14:24 +1300)
committer	Joseph Sutton <jsutton@samba.org>
	Thu, 1 Dec 2022 22:56:39 +0000 (22:56 +0000)
lib/compression/lzxpress_huffman.c	[new file with mode: 0644]	patch \| blob
lib/compression/lzxpress_huffman.h	[new file with mode: 0644]	patch \| blob
lib/compression/tests/test_lzx_huffman.c	[new file with mode: 0644]	patch \| blob
lib/compression/wscript_build		patch \| blob \| blame \| history
selftest/tests.py		patch \| blob \| blame \| history