Commit d645114f7e
Changed files (128)
lib/std/compress/deflate/compressor.zig
@@ -327,7 +327,7 @@ pub fn Compressor(comptime WriterType: anytype) type {
}
}
}
- const n = std.compress.deflate.copy(self.window[self.window_end..], b);
+ const n = std.compress.v1.deflate.copy(self.window[self.window_end..], b);
self.window_end += n;
return @as(u32, @intCast(n));
}
@@ -705,7 +705,7 @@ pub fn Compressor(comptime WriterType: anytype) type {
}
fn fillStore(self: *Self, b: []const u8) u32 {
- const n = std.compress.deflate.copy(self.window[self.window_end..], b);
+ const n = std.compress.v1.deflate.copy(self.window[self.window_end..], b);
self.window_end += n;
return @as(u32, @intCast(n));
}
lib/std/compress/deflate/decompressor.zig
@@ -450,7 +450,7 @@ pub fn Decompressor(comptime ReaderType: type) type {
pub fn read(self: *Self, output: []u8) Error!usize {
while (true) {
if (self.to_read.len > 0) {
- const n = std.compress.deflate.copy(output, self.to_read);
+ const n = std.compress.v1.deflate.copy(output, self.to_read);
self.to_read = self.to_read[n..];
if (self.to_read.len == 0 and
self.err != null)
lib/std/compress/flate/testdata/block_writer/huffman-null-max.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-null-max.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-null-max.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-null-max.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-null-max.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-null-max.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-pi.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-1k.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-limit.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-max.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-rand-max.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-shifts.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text-shift.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-text.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.dyn.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.huff.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.input
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.wb.expect
Binary file
lib/std/compress/flate/testdata/block_writer/huffman-zero.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/null-long-match.dyn.expect-noinput
Binary file
lib/std/compress/flate/testdata/block_writer/null-long-match.wb.expect-noinput
Binary file
lib/std/compress/flate/testdata/fuzz/deflate-stream.expect
Binary file
lib/std/compress/flate/testdata/fuzz/deflate-stream.input
Binary file
lib/std/compress/flate/testdata/fuzz/empty-distance-alphabet01.input
Binary file
lib/std/compress/flate/testdata/fuzz/empty-distance-alphabet02.input
Binary file
lib/std/compress/flate/testdata/fuzz/end-of-stream.input
Binary file
lib/std/compress/flate/testdata/fuzz/fuzz1.input
Binary file
lib/std/compress/flate/testdata/fuzz/fuzz2.input
Binary file
lib/std/compress/flate/testdata/fuzz/fuzz3.input
Binary file
lib/std/compress/flate/testdata/fuzz/fuzz4.input
Binary file
lib/std/compress/flate/testdata/fuzz/invalid-distance.input
Binary file
lib/std/compress/flate/testdata/fuzz/invalid-tree01.input
Binary file
lib/std/compress/flate/testdata/fuzz/invalid-tree02.input
Binary file
lib/std/compress/flate/testdata/fuzz/invalid-tree03.input
Binary file
lib/std/compress/flate/testdata/fuzz/lengths-overflow.input
Binary file
lib/std/compress/flate/testdata/fuzz/out-of-codes.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff01.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff02.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff03.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff04.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff05.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff06.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff07.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff08.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff09.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff10.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff11.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff12.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff13.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff14.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff15.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff16.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff17.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff18.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff19.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff20.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff21.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff22.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff23.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff24.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff25.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff26.input
Binary file
lib/std/compress/flate/testdata/fuzz/puff27.input
Binary file
lib/std/compress/flate/testdata/fuzz/roundtrip1.input
Binary file
lib/std/compress/flate/testdata/fuzz/roundtrip2.input
Binary file
lib/std/compress/flate/testdata/block_writer.zig
Binary file
lib/std/compress/flate/testdata/rfc1951.txt
Binary file
lib/std/compress/flate/bit_reader.zig
@@ -0,0 +1,333 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const testing = std.testing;
+
+pub fn bitReader(reader: anytype) BitReader(@TypeOf(reader)) {
+ return BitReader(@TypeOf(reader)).init(reader);
+}
+
+/// Bit reader used during inflate (decompression). Has an internal buffer of 64
+/// bits which shifts right as bits are consumed. Uses forward_reader to refill
+/// that internal buffer when needed.
+///
+/// readF is the core function. It supports a few different ways of getting bits,
+/// controlled by flags. On the hot path we avoid checking whether the buffer
+/// needs to be refilled from forward_reader by calling fill in advance and then
+/// readF with the buffered flag set.
+///
+pub fn BitReader(comptime ReaderType: type) type {
+ return struct {
+ // Underlying reader used for filling internal bits buffer
+ forward_reader: ReaderType = undefined,
+ // Internal buffer of 64 bits
+ bits: u64 = 0,
+ // Number of bits in the buffer
+ nbits: u32 = 0,
+
+ const Self = @This();
+
+ pub const Error = ReaderType.Error || error{EndOfStream};
+
+ pub fn init(rdr: ReaderType) Self {
+ var self = Self{ .forward_reader = rdr };
+ self.fill(1) catch {};
+ return self;
+ }
+
+ // Try to have `nice` bits available in the buffer. Reads from the
+ // forward reader if there are fewer than `nice` bits in the buffer.
+ // Returns an error only if the end of the forward stream is reached
+ // and the internal buffer is empty. It will not error when fewer than
+ // `nice` bits are buffered, only when all bits are exhausted. During
+ // inflate we usually know the maximum number of bits the next step can
+ // need, but that step will usually decode with fewer. So `nice` is not
+ // a hard limit; fill just tries to have that many bits available. If
+ // the end of the forward stream is reached, the buffer may contain
+ // some extra zero bits.
+ pub inline fn fill(self: *Self, nice: u6) !void {
+ if (self.nbits >= nice) {
+ return; // We have enough bits
+ }
+ // Read more bits from forward reader
+
+ // Number of whole free bytes in bits; nbits is rounded up to a byte boundary.
+ const empty_bytes =
+ @as(u8, if (self.nbits & 0x7 == 0) 8 else 7) - // 8 when nbits is a multiple of 8 (incl. 0), 7 otherwise
+ (self.nbits >> 3); // 0 for 0-7, 1 for 8-15, ...; same as / 8
+
+ var buf: [8]u8 = [_]u8{0} ** 8;
+ const bytes_read = self.forward_reader.read(buf[0..empty_bytes]) catch 0;
+ if (bytes_read > 0) {
+ const u: u64 = std.mem.readInt(u64, buf[0..8], .little);
+ self.bits |= u << @as(u6, @intCast(self.nbits));
+ self.nbits += 8 * @as(u8, @intCast(bytes_read));
+ return;
+ }
+
+ if (self.nbits == 0)
+ return error.EndOfStream;
+ }
+
+ // Read exactly buf.len bytes into buf.
+ pub fn readAll(self: *Self, buf: []u8) !void {
+ assert(self.alignBits() == 0); // internal bits must be at byte boundary
+
+ // First read from internal bits buffer.
+ var n: usize = 0;
+ while (self.nbits > 0 and n < buf.len) {
+ buf[n] = try self.readF(u8, flag.buffered);
+ n += 1;
+ }
+ // Then use forward reader for all other bytes.
+ try self.forward_reader.readNoEof(buf[n..]);
+ }
+
+ pub const flag = struct {
+ pub const peek: u3 = 0b001; // don't advance the internal buffer, just return the bits and leave them buffered
+ pub const buffered: u3 = 0b010; // assume there is no need to fill; fill must have been called beforehand
+ pub const reverse: u3 = 0b100; // bit-reverse the bits that are read
+ };
+
+ // Alias for readF(U, 0).
+ pub fn read(self: *Self, comptime U: type) !U {
+ return self.readF(U, 0);
+ }
+
+ // Alias for readF with flag.peek set.
+ pub inline fn peekF(self: *Self, comptime U: type, comptime how: u3) !U {
+ return self.readF(U, how | flag.peek);
+ }
+
+ // Read with flags provided.
+ pub fn readF(self: *Self, comptime U: type, comptime how: u3) !U {
+ const n: u6 = @bitSizeOf(U);
+ switch (how) {
+ 0 => { // `normal` read
+ try self.fill(n); // ensure that there are n bits in the buffer
+ const u: U = @truncate(self.bits); // get n bits
+ try self.shift(n); // advance buffer for n
+ return u;
+ },
+ (flag.peek) => { // no shift, leave bits in the buffer
+ try self.fill(n);
+ return @truncate(self.bits);
+ },
+ flag.buffered => { // no fill, assume the buffer has enough bits
+ const u: U = @truncate(self.bits);
+ try self.shift(n);
+ return u;
+ },
+ (flag.reverse) => { // same as 0 with bit reverse
+ try self.fill(n);
+ const u: U = @truncate(self.bits);
+ try self.shift(n);
+ return @bitReverse(u);
+ },
+ (flag.peek | flag.reverse) => {
+ try self.fill(n);
+ return @bitReverse(@as(U, @truncate(self.bits)));
+ },
+ (flag.buffered | flag.reverse) => {
+ const u: U = @truncate(self.bits);
+ try self.shift(n);
+ return @bitReverse(u);
+ },
+ (flag.peek | flag.buffered) => {
+ return @truncate(self.bits);
+ },
+ (flag.peek | flag.buffered | flag.reverse) => {
+ return @bitReverse(@as(U, @truncate(self.bits)));
+ },
+ }
+ }
+
+ // Read n bits.
+ // Only 0 or the buffered flag can be used in `how`.
+ pub fn readN(self: *Self, n: u4, comptime how: u3) !u16 {
+ switch (how) {
+ 0 => {
+ try self.fill(n);
+ },
+ flag.buffered => {},
+ else => unreachable,
+ }
+ const mask: u16 = (@as(u16, 1) << n) - 1;
+ const u: u16 = @as(u16, @truncate(self.bits)) & mask;
+ try self.shift(n);
+ return u;
+ }
+
+ // Advance the buffer by n bits.
+ pub fn shift(self: *Self, n: u6) !void {
+ if (n > self.nbits) return error.EndOfStream;
+ self.bits >>= n;
+ self.nbits -= n;
+ }
+
+ // Skip n bytes.
+ pub fn skipBytes(self: *Self, n: u16) !void {
+ for (0..n) |_| {
+ try self.fill(8);
+ try self.shift(8);
+ }
+ }
+
+ // Number of bits to align stream to the byte boundary.
+ fn alignBits(self: *Self) u3 {
+ return @intCast(self.nbits & 0x7);
+ }
+
+ // Align stream to the byte boundary.
+ pub fn alignToByte(self: *Self) void {
+ const ab = self.alignBits();
+ if (ab > 0) self.shift(ab) catch unreachable;
+ }
+
+ // Skip zero terminated string.
+ pub fn skipStringZ(self: *Self) !void {
+ while (true) {
+ if (try self.readF(u8, 0) == 0) break;
+ }
+ }
+
+ // Read a deflate fixed code.
+ // Reads the first 7 bits, and then maybe 1 or 2 more to get the full 7, 8 or 9 bit code.
+ // ref: https://datatracker.ietf.org/doc/html/rfc1951#page-12
+ // Lit Value Bits Codes
+ // --------- ---- -----
+ // 0 - 143 8 00110000 through
+ // 10111111
+ // 144 - 255 9 110010000 through
+ // 111111111
+ // 256 - 279 7 0000000 through
+ // 0010111
+ // 280 - 287 8 11000000 through
+ // 11000111
+ pub fn readFixedCode(self: *Self) !u16 {
+ try self.fill(7 + 2);
+ const code7 = try self.readF(u7, flag.buffered | flag.reverse);
+ if (code7 <= 0b0010_111) { // 7 bits, 256-279, codes 0000_000 - 0010_111
+ return @as(u16, code7) + 256;
+ } else if (code7 <= 0b1011_111) { // 8 bits, 0-143, codes 0011_0000 through 1011_1111
+ return (@as(u16, code7) << 1) + @as(u16, try self.readF(u1, flag.buffered)) - 0b0011_0000;
+ } else if (code7 <= 0b1100_011) { // 8 bit, 280-287, codes 1100_0000 - 1100_0111
+ return (@as(u16, code7 - 0b1100000) << 1) + try self.readF(u1, flag.buffered) + 280;
+ } else { // 9 bit, 144-255, codes 1_1001_0000 - 1_1111_1111
+ return (@as(u16, code7 - 0b1100_100) << 2) + @as(u16, try self.readF(u2, flag.buffered | flag.reverse)) + 144;
+ }
+ }
+ };
+}
+
+test "flate.BitReader" {
+ var fbs = std.io.fixedBufferStream(&[_]u8{ 0xf3, 0x48, 0xcd, 0xc9, 0x00, 0x00 });
+ var br = bitReader(fbs.reader());
+ const F = BitReader(@TypeOf(fbs.reader())).flag;
+
+ try testing.expectEqual(@as(u8, 48), br.nbits);
+ try testing.expectEqual(@as(u64, 0xc9cd48f3), br.bits);
+
+ try testing.expect(try br.readF(u1, 0) == 0b0000_0001);
+ try testing.expect(try br.readF(u2, 0) == 0b0000_0001);
+ try testing.expectEqual(@as(u8, 48 - 3), br.nbits);
+ try testing.expectEqual(@as(u3, 5), br.alignBits());
+
+ try testing.expect(try br.readF(u8, F.peek) == 0b0001_1110);
+ try testing.expect(try br.readF(u9, F.peek) == 0b1_0001_1110);
+ try br.shift(9);
+ try testing.expectEqual(@as(u8, 36), br.nbits);
+ try testing.expectEqual(@as(u3, 4), br.alignBits());
+
+ try testing.expect(try br.readF(u4, 0) == 0b0100);
+ try testing.expectEqual(@as(u8, 32), br.nbits);
+ try testing.expectEqual(@as(u3, 0), br.alignBits());
+
+ try br.shift(1);
+ try testing.expectEqual(@as(u3, 7), br.alignBits());
+ try br.shift(1);
+ try testing.expectEqual(@as(u3, 6), br.alignBits());
+ br.alignToByte();
+ try testing.expectEqual(@as(u3, 0), br.alignBits());
+
+ try testing.expectEqual(@as(u64, 0xc9), br.bits);
+ try testing.expectEqual(@as(u16, 0x9), try br.readN(4, 0));
+ try testing.expectEqual(@as(u16, 0xc), try br.readN(4, 0));
+}
+
+test "flate.BitReader read block type 1 data" {
+ const data = [_]u8{
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
+ 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
+ 0x0c, 0x01, 0x02, 0x03, //
+ 0xaa, 0xbb, 0xcc, 0xdd,
+ };
+ var fbs = std.io.fixedBufferStream(&data);
+ var br = bitReader(fbs.reader());
+ const F = BitReader(@TypeOf(fbs.reader())).flag;
+
+ try testing.expectEqual(@as(u1, 1), try br.readF(u1, 0)); // bfinal
+ try testing.expectEqual(@as(u2, 1), try br.readF(u2, 0)); // block_type
+
+ for ("Hello world\n") |c| {
+ try testing.expectEqual(@as(u8, c), try br.readF(u8, F.reverse) - 0x30);
+ }
+ try testing.expectEqual(@as(u7, 0), try br.readF(u7, 0)); // end of block
+ br.alignToByte();
+ try testing.expectEqual(@as(u32, 0x0302010c), try br.readF(u32, 0));
+ try testing.expectEqual(@as(u16, 0xbbaa), try br.readF(u16, 0));
+ try testing.expectEqual(@as(u16, 0xddcc), try br.readF(u16, 0));
+}
+
+test "flate.BitReader init" {
+ const data = [_]u8{
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ };
+ var fbs = std.io.fixedBufferStream(&data);
+ var br = bitReader(fbs.reader());
+
+ try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits);
+ try br.shift(8);
+ try testing.expectEqual(@as(u64, 0x00_08_07_06_05_04_03_02), br.bits);
+ try br.fill(60); // fill with 1 byte
+ try testing.expectEqual(@as(u64, 0x01_08_07_06_05_04_03_02), br.bits);
+ try br.shift(8 * 4 + 4);
+ try testing.expectEqual(@as(u64, 0x00_00_00_00_00_10_80_70), br.bits);
+
+ try br.fill(60); // fill with 4 bytes (shift by 4)
+ try testing.expectEqual(@as(u64, 0x00_50_40_30_20_10_80_70), br.bits);
+ try testing.expectEqual(@as(u8, 8 * 7 + 4), br.nbits);
+
+ try br.shift(@intCast(br.nbits)); // clear buffer
+ try br.fill(8); // refill with the rest of the bytes
+ try testing.expectEqual(@as(u64, 0x00_00_00_00_00_08_07_06), br.bits);
+}
+
+test "flate.BitReader readAll" {
+ const data = [_]u8{
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ };
+ var fbs = std.io.fixedBufferStream(&data);
+ var br = bitReader(fbs.reader());
+
+ try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits);
+
+ var out: [16]u8 = undefined;
+ try br.readAll(out[0..]);
+ try testing.expect(br.nbits == 0);
+ try testing.expect(br.bits == 0);
+
+ try testing.expectEqualSlices(u8, data[0..16], &out);
+}
+
+test "flate.BitReader readFixedCode" {
+ const fixed_codes = @import("huffman_encoder.zig").fixed_codes;
+
+ var fbs = std.io.fixedBufferStream(&fixed_codes);
+ var rdr = bitReader(fbs.reader());
+
+ for (0..286) |c| {
+ try testing.expectEqual(c, try rdr.readFixedCode());
+ }
+ try testing.expect(rdr.nbits == 0);
+}
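As a quick orientation to the API above, here is a minimal, hedged usage sketch (an editorial illustration, not part of this commit) of the documented hot-path pattern: call fill once for a decode step, then do buffered reads. It reuses the same bytes as the tests above.

const std = @import("std");
const testing = std.testing;
const bitReader = @import("bit_reader.zig").bitReader;
const BitReader = @import("bit_reader.zig").BitReader;

test "BitReader usage sketch" {
    var fbs = std.io.fixedBufferStream(&[_]u8{ 0xf3, 0x48 });
    var br = bitReader(fbs.reader());
    const F = BitReader(@TypeOf(fbs.reader())).flag;

    // One fill covers this whole decode step; the reads below skip the fill check.
    try br.fill(3);
    const bfinal = try br.readF(u1, F.buffered); // 0xf3 = 0b1111_0011, LSB first -> 1
    const block_type = try br.readF(u2, F.buffered); // next two bits -> 0b01
    try testing.expectEqual(@as(u1, 1), bfinal);
    try testing.expectEqual(@as(u2, 1), block_type);
}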
lib/std/compress/flate/bit_writer.zig
@@ -0,0 +1,99 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+/// Bit writer for use in deflate (compression).
+///
+/// Has an internal bits buffer of 64 bits and an internal bytes buffer of 248 bytes.
+/// When 48 bits accumulate, 6 bytes are moved to the bytes buffer. When 240 bytes
+/// accumulate, they are flushed to the underlying inner_writer.
+///
+pub fn BitWriter(comptime WriterType: type) type {
+ // buffer_flush_size indicates the buffer size
+ // after which bytes are flushed to the writer.
+ // Should preferably be a multiple of 6, since
+ // we accumulate 6 bytes between writes to the buffer.
+ const buffer_flush_size = 240;
+
+ // buffer_size is the actual output byte buffer size.
+ // It must have additional headroom for a flush
+ // which can contain up to 8 bytes.
+ const buffer_size = buffer_flush_size + 8;
+
+ return struct {
+ inner_writer: WriterType,
+
+ // Data waiting to be written is bytes[0 .. nbytes]
+ // and then the low nbits of bits. Data is always written
+ // sequentially into the bytes array.
+ bits: u64 = 0,
+ nbits: u32 = 0, // number of bits
+ bytes: [buffer_size]u8 = undefined,
+ nbytes: u32 = 0, // number of bytes
+
+ const Self = @This();
+
+ pub const Error = WriterType.Error || error{UnfinishedBits};
+
+ pub fn init(writer: WriterType) Self {
+ return .{ .inner_writer = writer };
+ }
+
+ pub fn setWriter(self: *Self, new_writer: WriterType) void {
+ //assert(self.bits == 0 and self.nbits == 0 and self.nbytes == 0);
+ self.inner_writer = new_writer;
+ }
+
+ pub fn flush(self: *Self) Error!void {
+ var n = self.nbytes;
+ while (self.nbits != 0) {
+ self.bytes[n] = @as(u8, @truncate(self.bits));
+ self.bits >>= 8;
+ if (self.nbits > 8) { // Avoid underflow
+ self.nbits -= 8;
+ } else {
+ self.nbits = 0;
+ }
+ n += 1;
+ }
+ self.bits = 0;
+ _ = try self.inner_writer.write(self.bytes[0..n]);
+ self.nbytes = 0;
+ }
+
+ pub fn writeBits(self: *Self, b: u32, nb: u32) Error!void {
+ self.bits |= @as(u64, @intCast(b)) << @as(u6, @intCast(self.nbits));
+ self.nbits += nb;
+ if (self.nbits < 48)
+ return;
+
+ var n = self.nbytes;
+ std.mem.writeInt(u64, self.bytes[n..][0..8], self.bits, .little);
+ n += 6;
+ if (n >= buffer_flush_size) {
+ _ = try self.inner_writer.write(self.bytes[0..n]);
+ n = 0;
+ }
+ self.nbytes = n;
+ self.bits >>= 48;
+ self.nbits -= 48;
+ }
+
+ pub fn writeBytes(self: *Self, bytes: []const u8) Error!void {
+ var n = self.nbytes;
+ if (self.nbits & 7 != 0) {
+ return error.UnfinishedBits;
+ }
+ while (self.nbits != 0) {
+ self.bytes[n] = @as(u8, @truncate(self.bits));
+ self.bits >>= 8;
+ self.nbits -= 8;
+ n += 1;
+ }
+ if (n != 0) {
+ _ = try self.inner_writer.write(self.bytes[0..n]);
+ }
+ self.nbytes = 0;
+ _ = try self.inner_writer.write(bytes);
+ }
+ };
+}
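bit_writer.zig lands without tests in this commit, so here is a hedged usage sketch (editorial, not part of the commit) of the behaviour described above: bits accumulate LSB first, and flush zero-pads the last partial byte before writing it out.

const std = @import("std");
const testing = std.testing;
const BitWriter = @import("bit_writer.zig").BitWriter;

test "BitWriter usage sketch" {
    var buf = std.ArrayList(u8).init(testing.allocator);
    defer buf.deinit();

    var bw = BitWriter(@TypeOf(buf.writer())).init(buf.writer());
    try bw.writeBits(0b101, 3); // becomes the low three bits of the first byte
    try bw.writeBits(0b1, 1); // next bit up
    try bw.flush(); // zero-pads the partial byte and writes it out

    try testing.expectEqualSlices(u8, &[_]u8{0b0000_1101}, buf.items);
}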
lib/std/compress/flate/block_writer.zig
@@ -0,0 +1,706 @@
+const std = @import("std");
+const io = std.io;
+const assert = std.debug.assert;
+
+const hc = @import("huffman_encoder.zig");
+const consts = @import("consts.zig").huffman;
+const Token = @import("Token.zig");
+const BitWriter = @import("bit_writer.zig").BitWriter;
+
+pub fn blockWriter(writer: anytype) BlockWriter(@TypeOf(writer)) {
+ return BlockWriter(@TypeOf(writer)).init(writer);
+}
+
+/// Accepts a list of tokens and decides which block type will provide the best
+/// compression, then writes the header and body of that block.
+///
+pub fn BlockWriter(comptime WriterType: type) type {
+ const BitWriterType = BitWriter(WriterType);
+ return struct {
+ const codegen_order = consts.codegen_order;
+ const end_code_mark = 255;
+ const Self = @This();
+
+ pub const Error = BitWriterType.Error;
+ bit_writer: BitWriterType,
+
+ codegen_freq: [consts.codegen_code_count]u16 = undefined,
+ literal_freq: [consts.max_num_lit]u16 = undefined,
+ distance_freq: [consts.distance_code_count]u16 = undefined,
+ codegen: [consts.max_num_lit + consts.distance_code_count + 1]u8 = undefined,
+ literal_encoding: hc.LiteralEncoder = .{},
+ distance_encoding: hc.DistanceEncoder = .{},
+ codegen_encoding: hc.CodegenEncoder = .{},
+ fixed_literal_encoding: hc.LiteralEncoder,
+ fixed_distance_encoding: hc.DistanceEncoder,
+ huff_distance: hc.DistanceEncoder,
+
+ pub fn init(writer: WriterType) Self {
+ return .{
+ .bit_writer = BitWriterType.init(writer),
+ .fixed_literal_encoding = hc.fixedLiteralEncoder(),
+ .fixed_distance_encoding = hc.fixedDistanceEncoder(),
+ .huff_distance = hc.huffmanDistanceEncoder(),
+ };
+ }
+
+ /// Flush the internal bit buffer to the writer.
+ /// Should be called only when the bit stream is at a byte boundary.
+ ///
+ /// That is, after the final block, when the last byte may be incomplete, or
+ /// after a stored block, which is aligned to the byte boundary (it has some
+ /// padding bits after the first 3 bits).
+ pub fn flush(self: *Self) Error!void {
+ try self.bit_writer.flush();
+ }
+
+ pub fn setWriter(self: *Self, new_writer: WriterType) void {
+ self.bit_writer.setWriter(new_writer);
+ }
+
+ fn writeCode(self: *Self, c: hc.HuffCode) Error!void {
+ try self.bit_writer.writeBits(c.code, c.len);
+ }
+
+ // RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+ // the literal and distance lengths arrays (which are concatenated into a single
+ // array). This method generates that run-length encoding.
+ //
+ // The result is written into the codegen array, and the frequencies
+ // of each code is written into the codegen_freq array.
+ // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+ // information. Code end_code_mark (255) is an end marker.
+ //
+ // num_literals: The number of literals in literal_encoding
+ // num_distances: The number of distances in distance_encoding
+ // lit_enc: The literal encoder to use
+ // dist_enc: The distance encoder to use
+ fn generateCodegen(
+ self: *Self,
+ num_literals: u32,
+ num_distances: u32,
+ lit_enc: *hc.LiteralEncoder,
+ dist_enc: *hc.DistanceEncoder,
+ ) void {
+ for (self.codegen_freq, 0..) |_, i| {
+ self.codegen_freq[i] = 0;
+ }
+
+ // Note that we are using codegen both as a temporary variable for holding
+ // a copy of the frequencies, and as the place where we put the result.
+ // This is fine because the output is always shorter than the input used
+ // so far.
+ var codegen = &self.codegen; // cache
+ // Copy the concatenated code sizes to codegen. Put a marker at the end.
+ var cgnl = codegen[0..num_literals];
+ for (cgnl, 0..) |_, i| {
+ cgnl[i] = @as(u8, @intCast(lit_enc.codes[i].len));
+ }
+
+ cgnl = codegen[num_literals .. num_literals + num_distances];
+ for (cgnl, 0..) |_, i| {
+ cgnl[i] = @as(u8, @intCast(dist_enc.codes[i].len));
+ }
+ codegen[num_literals + num_distances] = end_code_mark;
+
+ var size = codegen[0];
+ var count: i32 = 1;
+ var out_index: u32 = 0;
+ var in_index: u32 = 1;
+ while (size != end_code_mark) : (in_index += 1) {
+ // INVARIANT: We have seen "count" copies of size that have not yet
+ // had output generated for them.
+ const next_size = codegen[in_index];
+ if (next_size == size) {
+ count += 1;
+ continue;
+ }
+ // We need to generate codegen indicating "count" of size.
+ if (size != 0) {
+ codegen[out_index] = size;
+ out_index += 1;
+ self.codegen_freq[size] += 1;
+ count -= 1;
+ while (count >= 3) {
+ var n: i32 = 6;
+ if (n > count) {
+ n = count;
+ }
+ codegen[out_index] = 16;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(n - 3));
+ out_index += 1;
+ self.codegen_freq[16] += 1;
+ count -= n;
+ }
+ } else {
+ while (count >= 11) {
+ var n: i32 = 138;
+ if (n > count) {
+ n = count;
+ }
+ codegen[out_index] = 18;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(n - 11));
+ out_index += 1;
+ self.codegen_freq[18] += 1;
+ count -= n;
+ }
+ if (count >= 3) {
+ // 3 <= count <= 10
+ codegen[out_index] = 17;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(count - 3));
+ out_index += 1;
+ self.codegen_freq[17] += 1;
+ count = 0;
+ }
+ }
+ count -= 1;
+ while (count >= 0) : (count -= 1) {
+ codegen[out_index] = size;
+ out_index += 1;
+ self.codegen_freq[size] += 1;
+ }
+ // Set up invariant for next time through the loop.
+ size = next_size;
+ count = 1;
+ }
+ // Marker indicating the end of the codegen.
+ codegen[out_index] = end_code_mark;
+ }
+
+ const DynamicSize = struct {
+ size: u32,
+ num_codegens: u32,
+ };
+
+ // dynamicSize returns the size of dynamically encoded data in bits.
+ fn dynamicSize(
+ self: *Self,
+ lit_enc: *hc.LiteralEncoder, // literal encoder
+ dist_enc: *hc.DistanceEncoder, // distance encoder
+ extra_bits: u32,
+ ) DynamicSize {
+ var num_codegens = self.codegen_freq.len;
+ while (num_codegens > 4 and self.codegen_freq[codegen_order[num_codegens - 1]] == 0) {
+ num_codegens -= 1;
+ }
+ const header = 3 + 5 + 5 + 4 + (3 * num_codegens) +
+ self.codegen_encoding.bitLength(self.codegen_freq[0..]) +
+ self.codegen_freq[16] * 2 +
+ self.codegen_freq[17] * 3 +
+ self.codegen_freq[18] * 7;
+ const size = header +
+ lit_enc.bitLength(&self.literal_freq) +
+ dist_enc.bitLength(&self.distance_freq) +
+ extra_bits;
+
+ return DynamicSize{
+ .size = @as(u32, @intCast(size)),
+ .num_codegens = @as(u32, @intCast(num_codegens)),
+ };
+ }
+
+ // fixedSize returns the size of fixed Huffman encoded data in bits.
+ fn fixedSize(self: *Self, extra_bits: u32) u32 {
+ return 3 +
+ self.fixed_literal_encoding.bitLength(&self.literal_freq) +
+ self.fixed_distance_encoding.bitLength(&self.distance_freq) +
+ extra_bits;
+ }
+
+ const StoredSize = struct {
+ size: u32,
+ storable: bool,
+ };
+
+ // storedSizeFits calculates the stored size, including header.
+ // The function returns the size in bits and whether the data
+ // fits inside a single stored block.
+ fn storedSizeFits(in: ?[]const u8) StoredSize {
+ if (in == null) {
+ return .{ .size = 0, .storable = false };
+ }
+ if (in.?.len <= consts.max_store_block_size) {
+ return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true };
+ }
+ return .{ .size = 0, .storable = false };
+ }
+
+ // Write the header of a dynamic Huffman block to the output stream.
+ //
+ // num_literals: The number of literals specified in codegen
+ // num_distances: The number of distances specified in codegen
+ // num_codegens: The number of codegens used in codegen
+ // eof: Is it the end-of-file? (end of stream)
+ fn dynamicHeader(
+ self: *Self,
+ num_literals: u32,
+ num_distances: u32,
+ num_codegens: u32,
+ eof: bool,
+ ) Error!void {
+ const first_bits: u32 = if (eof) 5 else 4;
+ try self.bit_writer.writeBits(first_bits, 3);
+ try self.bit_writer.writeBits(num_literals - 257, 5);
+ try self.bit_writer.writeBits(num_distances - 1, 5);
+ try self.bit_writer.writeBits(num_codegens - 4, 4);
+
+ var i: u32 = 0;
+ while (i < num_codegens) : (i += 1) {
+ const value = self.codegen_encoding.codes[codegen_order[i]].len;
+ try self.bit_writer.writeBits(value, 3);
+ }
+
+ i = 0;
+ while (true) {
+ const code_word: u32 = @as(u32, @intCast(self.codegen[i]));
+ i += 1;
+ if (code_word == end_code_mark) {
+ break;
+ }
+ try self.writeCode(self.codegen_encoding.codes[@as(u32, @intCast(code_word))]);
+
+ switch (code_word) {
+ 16 => {
+ try self.bit_writer.writeBits(self.codegen[i], 2);
+ i += 1;
+ },
+ 17 => {
+ try self.bit_writer.writeBits(self.codegen[i], 3);
+ i += 1;
+ },
+ 18 => {
+ try self.bit_writer.writeBits(self.codegen[i], 7);
+ i += 1;
+ },
+ else => {},
+ }
+ }
+ }
+
+ fn storedHeader(self: *Self, length: usize, eof: bool) Error!void {
+ assert(length <= 65535);
+ const flag: u32 = if (eof) 1 else 0;
+ try self.bit_writer.writeBits(flag, 3);
+ try self.flush();
+ const l: u16 = @intCast(length);
+ try self.bit_writer.writeBits(l, 16);
+ try self.bit_writer.writeBits(~l, 16);
+ }
+
+ fn fixedHeader(self: *Self, eof: bool) Error!void {
+ // Indicate that we are a fixed Huffman block
+ var value: u32 = 2;
+ if (eof) {
+ value = 3;
+ }
+ try self.bit_writer.writeBits(value, 3);
+ }
+
+ // Write a block of tokens with the smallest encoding. Will choose block type.
+ // The original input can be supplied, and if the huffman encoded data
+ // is larger than the original bytes, the data will be written as a
+ // stored block.
+ // If the input is null, the tokens will always be Huffman encoded.
+ pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void {
+ const lit_and_dist = self.indexTokens(tokens);
+ const num_literals = lit_and_dist.num_literals;
+ const num_distances = lit_and_dist.num_distances;
+
+ var extra_bits: u32 = 0;
+ const ret = storedSizeFits(input);
+ const stored_size = ret.size;
+ const storable = ret.storable;
+
+ if (storable) {
+ // We only bother calculating the costs of the extra bits required by
+ // the length of distance fields (which will be the same for both fixed
+ // and dynamic encoding), if we need to compare those two encodings
+ // against stored encoding.
+ var length_code: u16 = Token.length_codes_start + 8;
+ while (length_code < num_literals) : (length_code += 1) {
+ // First eight length codes have extra size = 0.
+ extra_bits += @as(u32, @intCast(self.literal_freq[length_code])) *
+ @as(u32, @intCast(Token.lengthExtraBits(length_code)));
+ }
+ var distance_code: u16 = 4;
+ while (distance_code < num_distances) : (distance_code += 1) {
+ // First four distance codes have extra size = 0.
+ extra_bits += @as(u32, @intCast(self.distance_freq[distance_code])) *
+ @as(u32, @intCast(Token.distanceExtraBits(distance_code)));
+ }
+ }
+
+ // Figure out smallest code.
+ // Fixed Huffman baseline.
+ var literal_encoding = &self.fixed_literal_encoding;
+ var distance_encoding = &self.fixed_distance_encoding;
+ var size = self.fixedSize(extra_bits);
+
+ // Dynamic Huffman?
+ var num_codegens: u32 = 0;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.distance_encoding,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(
+ &self.literal_encoding,
+ &self.distance_encoding,
+ extra_bits,
+ );
+ const dyn_size = dynamic_size.size;
+ num_codegens = dynamic_size.num_codegens;
+
+ if (dyn_size < size) {
+ size = dyn_size;
+ literal_encoding = &self.literal_encoding;
+ distance_encoding = &self.distance_encoding;
+ }
+
+ // Stored bytes?
+ if (storable and stored_size < size) {
+ try self.storedBlock(input.?, eof);
+ return;
+ }
+
+ // Huffman.
+ if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) {
+ try self.fixedHeader(eof);
+ } else {
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+ }
+
+ // Write the tokens.
+ try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes);
+ }
+
+ pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void {
+ try self.storedHeader(input.len, eof);
+ try self.bit_writer.writeBytes(input);
+ }
+
+ // writeBlockDynamic encodes a block using a dynamic Huffman table.
+ // This should be used if the symbols used have a disproportionate
+ // histogram distribution.
+ // If input is supplied and the compression savings are below 1/16th of the
+ // input size the block is stored.
+ fn dynamicBlock(
+ self: *Self,
+ tokens: []const Token,
+ eof: bool,
+ input: ?[]const u8,
+ ) Error!void {
+ const total_tokens = self.indexTokens(tokens);
+ const num_literals = total_tokens.num_literals;
+ const num_distances = total_tokens.num_distances;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.distance_encoding,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.distance_encoding, 0);
+ const size = dynamic_size.size;
+ const num_codegens = dynamic_size.num_codegens;
+
+ // Store bytes, if we don't get a reasonable improvement.
+
+ const stored_size = storedSizeFits(input);
+ const ssize = stored_size.size;
+ const storable = stored_size.storable;
+ if (storable and ssize < (size + (size >> 4))) {
+ try self.storedBlock(input.?, eof);
+ return;
+ }
+
+ // Write Huffman table.
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+
+ // Write the tokens.
+ try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes);
+ }
+
+ const TotalIndexedTokens = struct {
+ num_literals: u32,
+ num_distances: u32,
+ };
+
+ // Indexes a slice of tokens followed by an end_block_marker, and updates
+ // literal_freq and distance_freq, and generates literal_encoding
+ // and distance_encoding.
+ // The number of literal and distance tokens is returned.
+ fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens {
+ var num_literals: u32 = 0;
+ var num_distances: u32 = 0;
+
+ for (self.literal_freq, 0..) |_, i| {
+ self.literal_freq[i] = 0;
+ }
+ for (self.distance_freq, 0..) |_, i| {
+ self.distance_freq[i] = 0;
+ }
+
+ for (tokens) |t| {
+ if (t.kind == Token.Kind.literal) {
+ self.literal_freq[t.literal()] += 1;
+ continue;
+ }
+ self.literal_freq[t.lengthCode()] += 1;
+ self.distance_freq[t.distanceCode()] += 1;
+ }
+ // add end_block_marker token at the end
+ self.literal_freq[consts.end_block_marker] += 1;
+
+ // get the number of literals
+ num_literals = @as(u32, @intCast(self.literal_freq.len));
+ while (self.literal_freq[num_literals - 1] == 0) {
+ num_literals -= 1;
+ }
+ // get the number of distances
+ num_distances = @as(u32, @intCast(self.distance_freq.len));
+ while (num_distances > 0 and self.distance_freq[num_distances - 1] == 0) {
+ num_distances -= 1;
+ }
+ if (num_distances == 0) {
+ // We haven't found a single match. If we want to go with the dynamic encoding,
+ // we should count at least one distance to be sure that the distance huffman tree could be encoded.
+ self.distance_freq[0] = 1;
+ num_distances = 1;
+ }
+ self.literal_encoding.generate(&self.literal_freq, 15);
+ self.distance_encoding.generate(&self.distance_freq, 15);
+ return TotalIndexedTokens{
+ .num_literals = num_literals,
+ .num_distances = num_distances,
+ };
+ }
+
+ // Writes a slice of tokens to the output, followed by an end_block_marker.
+ // Codes for literal and distance encoding must be supplied.
+ fn writeTokens(
+ self: *Self,
+ tokens: []const Token,
+ le_codes: []hc.HuffCode,
+ oe_codes: []hc.HuffCode,
+ ) Error!void {
+ for (tokens) |t| {
+ if (t.kind == Token.Kind.literal) {
+ try self.writeCode(le_codes[t.literal()]);
+ continue;
+ }
+
+ // Write the length
+ const le = t.lengthEncoding();
+ try self.writeCode(le_codes[le.code]);
+ if (le.extra_bits > 0) {
+ try self.bit_writer.writeBits(le.extra_length, le.extra_bits);
+ }
+
+ // Write the distance
+ const oe = t.distanceEncoding();
+ try self.writeCode(oe_codes[oe.code]);
+ if (oe.extra_bits > 0) {
+ try self.bit_writer.writeBits(oe.extra_distance, oe.extra_bits);
+ }
+ }
+ // add end_block_marker at the end
+ try self.writeCode(le_codes[consts.end_block_marker]);
+ }
+
+ // Encodes a block of bytes as either Huffman encoded literals or as uncompressed
+ // bytes if the result gains very little from compression.
+ pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void {
+ // Add everything as literals
+ histogram(input, &self.literal_freq);
+
+ self.literal_freq[consts.end_block_marker] = 1;
+
+ const num_literals = consts.end_block_marker + 1;
+ self.distance_freq[0] = 1;
+ const num_distances = 1;
+
+ self.literal_encoding.generate(&self.literal_freq, 15);
+
+ // Figure out smallest code.
+ // Always use dynamic Huffman or Store
+ var num_codegens: u32 = 0;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.huff_distance,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.huff_distance, 0);
+ const size = dynamic_size.size;
+ num_codegens = dynamic_size.num_codegens;
+
+ // Store bytes, if we don't get a reasonable improvement.
+ const stored_size_ret = storedSizeFits(input);
+ const ssize = stored_size_ret.size;
+ const storable = stored_size_ret.storable;
+
+ if (storable and ssize < (size + (size >> 4))) {
+ try self.storedBlock(input, eof);
+ return;
+ }
+
+ // Huffman.
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+ const encoding = self.literal_encoding.codes[0..257];
+
+ for (input) |t| {
+ const c = encoding[t];
+ try self.bit_writer.writeBits(c.code, c.len);
+ }
+ try self.writeCode(encoding[consts.end_block_marker]);
+ }
+
+ // histogram accumulates a histogram of b in h.
+ fn histogram(b: []const u8, h: *[286]u16) void {
+ // Clear histogram
+ for (h, 0..) |_, i| {
+ h[i] = 0;
+ }
+
+ var lh = h.*[0..256];
+ for (b) |t| {
+ lh[t] += 1;
+ }
+ }
+ };
+}
+
+// tests
+const expect = std.testing.expect;
+const fmt = std.fmt;
+const testing = std.testing;
+const ArrayList = std.ArrayList;
+
+const TestCase = @import("testdata/block_writer.zig").TestCase;
+const testCases = @import("testdata/block_writer.zig").testCases;
+
+// tests if the writeBlock encoding has changed.
+test "flate.BlockWriter write" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_block);
+ }
+}
+
+// tests if the writeBlockDynamic encoding has changed.
+test "flate.BlockWriter dynamicBlock" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_dyn_block);
+ }
+}
+
+test "flate.BlockWriter huffmanBlock" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_huffman_block);
+ }
+ try testBlock(.{
+ .tokens = &[_]Token{},
+ .input = "huffman-rand-max.input",
+ .want = "huffman-rand-max.{s}.expect",
+ }, .write_huffman_block);
+}
+
+const TestFn = enum {
+ write_block,
+ write_dyn_block, // write dynamic block
+ write_huffman_block,
+
+ fn to_s(self: TestFn) []const u8 {
+ return switch (self) {
+ .write_block => "wb",
+ .write_dyn_block => "dyn",
+ .write_huffman_block => "huff",
+ };
+ }
+
+ fn write(
+ comptime self: TestFn,
+ bw: anytype,
+ tok: []const Token,
+ input: ?[]const u8,
+ final: bool,
+ ) !void {
+ switch (self) {
+ .write_block => try bw.write(tok, final, input),
+ .write_dyn_block => try bw.dynamicBlock(tok, final, input),
+ .write_huffman_block => try bw.huffmanBlock(input.?, final),
+ }
+ try bw.flush();
+ }
+};
+
+// testBlock tests a block against its references
+//
+// size
+// 64K [file-name].input - input non compressed file
+// 8.1K [file-name].golden -
+// 78 [file-name].dyn.expect - output with writeBlockDynamic
+// 78 [file-name].wb.expect - output with writeBlock
+// 8.1K [file-name].huff.expect - output with writeBlockHuff
+// 78 [file-name].dyn.expect-noinput - output with writeBlockDynamic when input is null
+// 78 [file-name].wb.expect-noinput - output with writeBlock when input is null
+//
+// wb - writeBlock
+// dyn - writeBlockDynamic
+// huff - writeBlockHuff
+//
+fn testBlock(comptime tc: TestCase, comptime tfn: TestFn) !void {
+ if (tc.input.len != 0 and tc.want.len != 0) {
+ const want_name = comptime fmt.comptimePrint(tc.want, .{tfn.to_s()});
+ const input = @embedFile("testdata/block_writer/" ++ tc.input);
+ const want = @embedFile("testdata/block_writer/" ++ want_name);
+ try testWriteBlock(tfn, input, want, tc.tokens);
+ }
+
+ if (tfn == .write_huffman_block) {
+ return;
+ }
+
+ const want_name_no_input = comptime fmt.comptimePrint(tc.want_no_input, .{tfn.to_s()});
+ const want = @embedFile("testdata/block_writer/" ++ want_name_no_input);
+ try testWriteBlock(tfn, null, want, tc.tokens);
+}
+
+// Uses writer function `tfn` to write `tokens`, tests that we got `want` as output.
+fn testWriteBlock(comptime tfn: TestFn, input: ?[]const u8, want: []const u8, tokens: []const Token) !void {
+ var buf = ArrayList(u8).init(testing.allocator);
+ var bw = blockWriter(buf.writer());
+ try tfn.write(&bw, tokens, input, false);
+ var got = buf.items;
+ try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
+ try expect(got[0] & 0b0000_0001 == 0); // bfinal is not set
+ //
+ // Test if the writer produces the same output after reset.
+ buf.deinit();
+ buf = ArrayList(u8).init(testing.allocator);
+ defer buf.deinit();
+ bw.setWriter(buf.writer());
+
+ try tfn.write(&bw, tokens, input, true);
+ try bw.flush();
+ got = buf.items;
+
+ try expect(got[0] & 1 == 1); // bfinal is set
+ buf.items[0] &= 0b1111_1110; // remove bfinal bit so we can compare the slices
+ try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
+}
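To tie the pieces above together, a hedged end-to-end sketch (editorial, not part of the commit): encode a short byte string as a single final block via huffmanBlock, which itself falls back to a stored block when Huffman coding would not pay off.

const std = @import("std");
const testing = std.testing;
const blockWriter = @import("block_writer.zig").blockWriter;

test "BlockWriter usage sketch" {
    var buf = std.ArrayList(u8).init(testing.allocator);
    defer buf.deinit();

    var bw = blockWriter(buf.writer());
    try bw.huffmanBlock("Hello world\n", true); // eof = true marks the final block
    try bw.flush(); // bit stream is at a byte boundary after the final block

    try testing.expect(buf.items.len > 0);
    try testing.expect(buf.items[0] & 1 == 1); // bfinal bit is set in the first byte
}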
lib/std/compress/flate/CircularBuffer.zig
@@ -0,0 +1,234 @@
+/// 64K buffer of uncompressed data created during inflate (decompression). Has
+/// enough history to support writing a match<length, distance>; copying `length`
+/// bytes from the position `distance` bytes back from the current one.
+///
+/// Reads can return fewer bytes than are available if the data is spread across
+/// the wrap-around. So reads should be repeated until the required number of
+/// bytes is obtained or the returned slice has zero length.
+///
+/// Note on deflate limits:
+/// * a non-compressible block is limited to 65,535 bytes,
+/// * a backward pointer is limited to a distance of 32K bytes and a length of 258 bytes.
+///
+/// A whole non-compressed block can be written without overlap. We always have a
+/// history of up to 64K, more than the 32K needed.
+///
+const std = @import("std");
+const assert = std.debug.assert;
+const testing = std.testing;
+
+const consts = @import("consts.zig").match;
+
+const mask = 0xffff; // 64K - 1
+const buffer_len = mask + 1; // 64K buffer
+
+const Self = @This();
+
+buffer: [buffer_len]u8 = undefined,
+wp: usize = 0, // write position
+rp: usize = 0, // read position
+
+fn writeAll(self: *Self, buf: []const u8) void {
+ for (buf) |c| self.write(c);
+}
+
+// Write literal.
+pub fn write(self: *Self, b: u8) void {
+ assert(self.wp - self.rp < mask);
+ self.buffer[self.wp & mask] = b;
+ self.wp += 1;
+}
+
+// Write match (back-reference to the same data slice) starting at `distance`
+// back from current write position, and `length` of bytes.
+pub fn writeMatch(self: *Self, length: u16, distance: u16) !void {
+ if (self.wp < distance or
+ length < consts.base_length or length > consts.max_length or
+ distance < consts.min_distance or distance > consts.max_distance)
+ {
+ return error.InvalidMatch;
+ }
+ assert(self.wp - self.rp < mask);
+
+ var from: usize = self.wp - distance;
+ const from_end: usize = from + length;
+ var to: usize = self.wp;
+ const to_end: usize = to + length;
+
+ self.wp += length;
+
+ // Fast path using memcpy
+ if (length <= distance and // no overlapping buffers
+ (from >> 16 == from_end >> 16) and // start and end in the same wrap-around cycle
+ (to >> 16 == to_end >> 16))
+ {
+ @memcpy(self.buffer[to & mask .. to_end & mask], self.buffer[from & mask .. from_end & mask]);
+ return;
+ }
+
+ // Slow byte by byte
+ while (to < to_end) {
+ self.buffer[to & mask] = self.buffer[from & mask];
+ to += 1;
+ from += 1;
+ }
+}
+
+ // Returns a writable part of the internal buffer of at most `n` bytes. Advances
+ // the write pointer, assuming the returned buffer will be filled with data.
+pub fn getWritable(self: *Self, n: usize) []u8 {
+ const wp = self.wp & mask;
+ const len = @min(n, buffer_len - wp);
+ self.wp += len;
+ return self.buffer[wp .. wp + len];
+}
+
+ // Read available data. Can return only part of the available data if it is
+ // spread across the wrap-around. So read until this returns a zero-length slice.
+pub fn read(self: *Self) []const u8 {
+ return self.readAtMost(buffer_len);
+}
+
+ // Read part of the available data. Can return fewer than `limit` bytes even
+ // if more decoded data is available.
+pub fn readAtMost(self: *Self, limit: usize) []const u8 {
+ const rb = self.readBlock(if (limit == 0) buffer_len else limit);
+ defer self.rp += rb.len;
+ return self.buffer[rb.head..rb.tail];
+}
+
+const ReadBlock = struct {
+ head: usize,
+ tail: usize,
+ len: usize,
+};
+
+ // Returns the position of a contiguous block of readable data.
+fn readBlock(self: *Self, max: usize) ReadBlock {
+ const r = self.rp & mask;
+ const w = self.wp & mask;
+ const n = @min(
+ max,
+ if (w >= r) w - r else buffer_len - r,
+ );
+ return .{
+ .head = r,
+ .tail = r + n,
+ .len = n,
+ };
+}
+
+// Number of free bytes for write.
+pub fn free(self: *Self) usize {
+ return buffer_len - (self.wp - self.rp);
+}
+
+ // Full if the largest match can't fit. 258 is the largest match length; that
+ // many bytes can be produced in a single decode step.
+pub fn full(self: *Self) bool {
+ return self.free() < 258 + 1;
+}
+
+// example from: https://youtu.be/SJPvNi4HrWQ?t=3558
+test "flate.CircularBuffer writeMatch" {
+ var cb: Self = .{};
+
+ cb.writeAll("a salad; ");
+ try cb.writeMatch(5, 9);
+ try cb.writeMatch(3, 3);
+
+ try testing.expectEqualStrings("a salad; a salsal", cb.read());
+}
+
+test "flate.CircularBuffer writeMatch overlap" {
+ var cb: Self = .{};
+
+ cb.writeAll("a b c ");
+ try cb.writeMatch(8, 4);
+ cb.write('d');
+
+ try testing.expectEqualStrings("a b c b c b c d", cb.read());
+}
+
+test "flate.CircularBuffer readAtMost" {
+ var cb: Self = .{};
+
+ cb.writeAll("0123456789");
+ try cb.writeMatch(50, 10);
+
+ try testing.expectEqualStrings("0123456789" ** 6, cb.buffer[cb.rp..cb.wp]);
+ for (0..6) |i| {
+ try testing.expectEqual(i * 10, cb.rp);
+ try testing.expectEqualStrings("0123456789", cb.readAtMost(10));
+ }
+ try testing.expectEqualStrings("", cb.readAtMost(10));
+ try testing.expectEqualStrings("", cb.read());
+}
+
+test "flate.CircularBuffer" {
+ var cb: Self = .{};
+
+ const data = "0123456789abcdef" ** (1024 / 16);
+ cb.writeAll(data);
+ try testing.expectEqual(@as(usize, 0), cb.rp);
+ try testing.expectEqual(@as(usize, 1024), cb.wp);
+ try testing.expectEqual(@as(usize, 1024 * 63), cb.free());
+
+ for (0..62 * 4) |_|
+ try cb.writeMatch(256, 1024); // write 62K
+
+ try testing.expectEqual(@as(usize, 0), cb.rp);
+ try testing.expectEqual(@as(usize, 63 * 1024), cb.wp);
+ try testing.expectEqual(@as(usize, 1024), cb.free());
+
+ cb.writeAll(data[0..200]);
+ _ = cb.readAtMost(1024); // make some space
+ cb.writeAll(data); // overflows write position
+ try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
+ try testing.expectEqual(@as(usize, 1024), cb.rp);
+ try testing.expectEqual(@as(usize, 1024 - 200), cb.free());
+
+ const rb = cb.readBlock(Self.buffer_len);
+ try testing.expectEqual(@as(usize, 65536 - 1024), rb.len);
+ try testing.expectEqual(@as(usize, 1024), rb.head);
+ try testing.expectEqual(@as(usize, 65536), rb.tail);
+
+ try testing.expectEqual(@as(usize, 65536 - 1024), cb.read().len); // read to the end of the buffer
+ try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
+ try testing.expectEqual(@as(usize, 65536), cb.rp);
+ try testing.expectEqual(@as(usize, 65536 - 200), cb.free());
+
+ try testing.expectEqual(@as(usize, 200), cb.read().len); // read the rest
+}
+
+test "flate.CircularBuffer write overlap" {
+ var cb: Self = .{};
+ cb.wp = cb.buffer.len - 15;
+ cb.rp = cb.wp;
+
+ cb.writeAll("0123456789");
+ cb.writeAll("abcdefghij");
+
+ try testing.expectEqual(cb.buffer.len + 5, cb.wp);
+ try testing.expectEqual(cb.buffer.len - 15, cb.rp);
+
+ try testing.expectEqualStrings("0123456789abcde", cb.read());
+ try testing.expectEqualStrings("fghij", cb.read());
+
+ try testing.expect(cb.wp == cb.rp);
+}
+
+test "flate.CircularBuffer writeMatch/read overlap" {
+ var cb: Self = .{};
+ cb.wp = cb.buffer.len - 15;
+ cb.rp = cb.wp;
+
+ cb.writeAll("0123456789");
+ try cb.writeMatch(15, 5);
+
+ try testing.expectEqualStrings("012345678956789", cb.read());
+ try testing.expectEqualStrings("5678956789", cb.read());
+
+ try cb.writeMatch(20, 25);
+ try testing.expectEqualStrings("01234567895678956789", cb.read());
+}
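A hedged sketch (editorial, not part of the commit) of the drain loop the doc comment asks for: read() may return the data in pieces, so consumers loop until it returns an empty slice.

const std = @import("std");
const testing = std.testing;
const CircularBuffer = @import("CircularBuffer.zig");

test "CircularBuffer drain sketch" {
    var cb: CircularBuffer = .{};
    cb.write('a');
    try cb.writeMatch(3, 1); // copy 'a' three more times from one byte back

    var out = std.ArrayList(u8).init(testing.allocator);
    defer out.deinit();
    while (true) {
        const chunk = cb.read();
        if (chunk.len == 0) break;
        try out.appendSlice(chunk);
    }
    try testing.expectEqualStrings("aaaa", out.items);
}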
lib/std/compress/flate/consts.zig
@@ -0,0 +1,49 @@
+pub const deflate = struct {
+ // Number of tokens to accumulate in deflate before starting block encoding.
+ //
+ // In zlib this depends on memlevel: 6 + memlevel, where the default memlevel
+ // is 8 and the max is 9, giving 14 or 15 bits.
+ pub const tokens = 1 << 15;
+};
+
+pub const match = struct {
+ pub const base_length = 3; // smallest match length per the RFC section 3.2.5
+ pub const min_length = 4; // min length used in this algorithm
+ pub const max_length = 258;
+
+ pub const min_distance = 1;
+ pub const max_distance = 32768;
+};
+
+pub const history = struct {
+ pub const len = match.max_distance;
+};
+
+pub const lookup = struct {
+ pub const bits = 15;
+ pub const len = 1 << bits;
+ pub const shift = 32 - bits;
+};
+
+pub const huffman = struct {
+ // The odd order in which the codegen code sizes are written.
+ pub const codegen_order = [_]u32{ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+ // The number of codegen codes.
+ pub const codegen_code_count = 19;
+
+ // The largest distance code.
+ pub const distance_code_count = 30;
+
+ // Maximum number of literals.
+ pub const max_num_lit = 286;
+
+ // Max number of frequencies used for a Huffman Code
+ // Possible lengths are codegen_code_count (19), distance_code_count (30) and max_num_lit (286).
+ // The largest of these is max_num_lit.
+ pub const max_num_frequencies = max_num_lit;
+
+ // Biggest block size for uncompressed block.
+ pub const max_store_block_size = 65535;
+ // The special code used to mark the end of a block.
+ pub const end_block_marker = 256;
+};
lib/std/compress/flate/container.zig
@@ -0,0 +1,205 @@
+const std = @import("std");
+
+/// Container of the deflate bit stream body. The container adds a header before
+/// the deflate bit stream and a footer after it. It can be gzip, zlib or raw (no
+/// header, no footer, just the raw bit stream).
+///
+/// The zlib format is defined in RFC 1950. The header has 2 bytes and the footer
+/// is a 4-byte Adler-32 checksum.
+///
+/// The gzip format is defined in RFC 1952. The header has 10+ bytes and the footer
+/// is a 4-byte CRC-32 checksum plus 4 bytes of uncompressed data length.
+///
+///
+/// rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4
+/// rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5
+///
+pub const Container = enum {
+ raw, // no header or footer
+ gzip, // gzip header and footer
+ zlib, // zlib header and footer
+
+ pub fn size(w: Container) usize {
+ return headerSize(w) + footerSize(w);
+ }
+
+ pub fn headerSize(w: Container) usize {
+ return switch (w) {
+ .gzip => 10,
+ .zlib => 2,
+ .raw => 0,
+ };
+ }
+
+ pub fn footerSize(w: Container) usize {
+ return switch (w) {
+ .gzip => 8,
+ .zlib => 4,
+ .raw => 0,
+ };
+ }
+
+ pub const list = [_]Container{ .raw, .gzip, .zlib };
+
+ pub const Error = error{
+ BadGzipHeader,
+ BadZlibHeader,
+ WrongGzipChecksum,
+ WrongGzipSize,
+ WrongZlibChecksum,
+ };
+
+ pub fn writeHeader(comptime wrap: Container, writer: anytype) !void {
+ switch (wrap) {
+ .gzip => {
+ // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5):
+ // - ID1 (IDentification 1), always 0x1f
+ // - ID2 (IDentification 2), always 0x8b
+ // - CM (Compression Method), always 8 = deflate
+ // - FLG (Flags), all set to 0
+ // - 4 bytes, MTIME (Modification time), not used, all set to zero
+ // - XFL (eXtra FLags), all set to zero
+ // - OS (Operating System), 03 = Unix
+ const gzipHeader = [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 };
+ try writer.writeAll(&gzipHeader);
+ },
+ .zlib => {
+ // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4):
+ // 1st byte:
+ // - The first four bits are the CINFO (compression info), which is 7 for the default deflate window size.
+ // - The next four bits are the CM (compression method), which is 8 for deflate.
+ // 2nd byte:
+ // - The first two bits are the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best.
+ // - The next bit, FDICT, is set if a dictionary is given.
+ // - The final five FCHECK bits form a mod-31 checksum.
+ //
+ // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100
+ const zlibHeader = [_]u8{ 0x78, 0b10_0_11100 };
+ try writer.writeAll(&zlibHeader);
+ },
+ .raw => {},
+ }
+ }
+
+ pub fn writeFooter(comptime wrap: Container, hasher: *Hasher(wrap), writer: anytype) !void {
+ var bits: [4]u8 = undefined;
+ switch (wrap) {
+ .gzip => {
+ // GZIP 8 bytes footer
+ // - 4 bytes, CRC32 (CRC-32)
+ // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
+ std.mem.writeInt(u32, &bits, hasher.chksum(), .little);
+ try writer.writeAll(&bits);
+
+ std.mem.writeInt(u32, &bits, hasher.bytesRead(), .little);
+ try writer.writeAll(&bits);
+ },
+ .zlib => {
+ // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
+ // 4 bytes of ADLER32 (Adler-32 checksum)
+ // Checksum value of the uncompressed data (excluding any
+ // dictionary data) computed according to Adler-32
+ // algorithm.
+ std.mem.writeInt(u32, &bits, hasher.chksum(), .big);
+ try writer.writeAll(&bits);
+ },
+ .raw => {},
+ }
+ }
+
+ pub fn parseHeader(comptime wrap: Container, reader: anytype) !void {
+ switch (wrap) {
+ .gzip => try parseGzipHeader(reader),
+ .zlib => try parseZlibHeader(reader),
+ .raw => {},
+ }
+ }
+
+ fn parseGzipHeader(reader: anytype) !void {
+ const magic1 = try reader.read(u8);
+ const magic2 = try reader.read(u8);
+ const method = try reader.read(u8);
+ const flags = try reader.read(u8);
+ try reader.skipBytes(6); // mtime(4), xflags, os
+ if (magic1 != 0x1f or magic2 != 0x8b or method != 0x08)
+ return error.BadGzipHeader;
+ // Flags description: https://www.rfc-editor.org/rfc/rfc1952.html#page-5
+ if (flags != 0) {
+ if (flags & 0b0000_0100 != 0) { // FEXTRA
+ const extra_len = try reader.read(u16);
+ try reader.skipBytes(extra_len);
+ }
+ if (flags & 0b0000_1000 != 0) { // FNAME
+ try reader.skipStringZ();
+ }
+ if (flags & 0b0001_0000 != 0) { // FCOMMENT
+ try reader.skipStringZ();
+ }
+ if (flags & 0b0000_0010 != 0) { // FHCRC
+ try reader.skipBytes(2);
+ }
+ }
+ }
+
+ fn parseZlibHeader(reader: anytype) !void {
+ const cinfo_cm = try reader.read(u8);
+ _ = try reader.read(u8);
+ if (cinfo_cm != 0x78) {
+ return error.BadZlibHeader;
+ }
+ }
+
+ pub fn parseFooter(comptime wrap: Container, hasher: *Hasher(wrap), reader: anytype) !void {
+ switch (wrap) {
+ .gzip => {
+ if (try reader.read(u32) != hasher.chksum()) return error.WrongGzipChecksum;
+ if (try reader.read(u32) != hasher.bytesRead()) return error.WrongGzipSize;
+ },
+ .zlib => {
+ const chksum: u32 = @byteSwap(hasher.chksum());
+ if (try reader.read(u32) != chksum) return error.WrongZlibChecksum;
+ },
+ .raw => {},
+ }
+ }
+
+ pub fn Hasher(comptime wrap: Container) type {
+ const HasherType = switch (wrap) {
+ .gzip => std.hash.Crc32,
+ .zlib => std.hash.Adler32,
+ .raw => struct {
+ pub fn init() @This() {
+ return .{};
+ }
+ },
+ };
+
+ return struct {
+ hasher: HasherType = HasherType.init(),
+ bytes: usize = 0,
+
+ const Self = @This();
+
+ pub fn update(self: *Self, buf: []const u8) void {
+ switch (wrap) {
+ .raw => {},
+ else => {
+ self.hasher.update(buf);
+ self.bytes += buf.len;
+ },
+ }
+ }
+
+ pub fn chksum(self: *Self) u32 {
+ switch (wrap) {
+ .raw => return 0,
+ else => return self.hasher.final(),
+ }
+ }
+
+ pub fn bytesRead(self: *Self) u32 {
+ return @truncate(self.bytes);
+ }
+ };
+ }
+};
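+
+// The zlib header bytes written above must satisfy the rfc 1950 FCHECK rule:
+// the two header bytes, read as a big-endian u16, are divisible by 31.
+// Minimal illustrative check, using only the constants from writeHeader.
+test "flate.Container zlib header FCHECK" {
+ const header: u16 = (@as(u16, 0x78) << 8) | 0b10_0_11100;
+ try std.testing.expect(header % 31 == 0);
+}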
lib/std/compress/flate/deflate.zig
@@ -0,0 +1,783 @@
+const std = @import("std");
+const io = std.io;
+const assert = std.debug.assert;
+const testing = std.testing;
+const expect = testing.expect;
+const print = std.debug.print;
+
+const Token = @import("Token.zig");
+const consts = @import("consts.zig");
+const BlockWriter = @import("block_writer.zig").BlockWriter;
+const Container = @import("container.zig").Container;
+const SlidingWindow = @import("SlidingWindow.zig");
+const Lookup = @import("Lookup.zig");
+
+pub const Options = struct {
+ level: Level = .default,
+};
+
+/// Trades between speed and compression size.
+/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
+/// levels 1-3 use a different, faster algorithm with less compression.
+/// That algorithm is not implemented here.
+pub const Level = enum(u4) {
+ // zig fmt: off
+ fast = 0xb, level_4 = 4,
+ level_5 = 5,
+ default = 0xc, level_6 = 6,
+ level_7 = 7,
+ level_8 = 8,
+ best = 0xd, level_9 = 9,
+ // zig fmt: on
+};
+
+/// Algorithm knobs for each level.
+const LevelArgs = struct {
+ good: u16, // Do fewer lookups if we already have a match of this length.
+ nice: u16, // Stop looking for a better match once we have one of at least this length.
+ lazy: u16, // Don't do a lazy match search if we already have a match of at least this length.
+ chain: u16, // How many previous-match lookups to perform.
+
+ pub fn get(level: Level) LevelArgs {
+ // zig fmt: off
+ return switch (level) {
+ .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
+ .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
+ .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
+ .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
+ .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
+ .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
+ };
+ // zig fmt: on
+ }
+};
+
+/// Compress plain data from reader into compressed stream written to writer.
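+///
+/// Example (a minimal sketch; `plain` and `allocator` are assumed to be
+/// provided by the caller):
+///
+///   var in = std.io.fixedBufferStream(plain);
+///   var out = std.ArrayList(u8).init(allocator);
+///   defer out.deinit();
+///   try compress(.gzip, in.reader(), out.writer(), .{ .level = .default });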
+pub fn compress(comptime container: Container, reader: anytype, writer: anytype, options: Options) !void {
+ var c = try compressor(container, writer, options);
+ try c.compress(reader);
+ try c.finish();
+}
+
+/// Create compressor for writer type.
+pub fn compressor(comptime container: Container, writer: anytype, options: Options) !Compressor(
+ container,
+ @TypeOf(writer),
+) {
+ return try Compressor(container, @TypeOf(writer)).init(writer, options);
+}
+
+/// Compressor type.
+pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
+ const TokenWriterType = BlockWriter(WriterType);
+ return Deflate(container, WriterType, TokenWriterType);
+}
+
+/// Default compression algorithm. Has two steps: tokenization and token
+/// encoding.
+///
+/// Tokenization takes the uncompressed input stream and produces a list of
+/// tokens. Each token is either a literal (a byte of data) or a match (a
+/// back-reference to previous data, given as length and distance). Tokenization
+/// accumulates up to 32K tokens; when the buffer is full or `flush` is called,
+/// the tokens are passed to the `block_writer`. Level defines how hard (and how
+/// slowly) it tries to find a match.
+///
+/// Block writer will decide which type of deflate block to write (stored, fixed,
+/// dynamic) and encode tokens to the output byte stream. Client has to call
+/// `finish` to write block with the final bit set.
+///
+/// Container defines the type of header and footer, which can be gzip, zlib or
+/// raw. They all share the same deflate body. Raw has no header or footer, just
+/// the deflate body.
+///
+/// Compression algorithm explained in rfc-1951 (slightly edited for this case):
+///
+/// The compressor uses a chained hash table `lookup` to find duplicated
+/// strings, using a hash function that operates on 4-byte sequences. At any
+/// given point during compression, let XYZW be the next 4 input bytes
+/// (lookahead) to be examined (not necessarily all different, of course).
+/// First, the compressor examines the hash chain for XYZW. If the chain is
+/// empty, the compressor simply writes out X as a literal byte and advances
+/// one byte in the input. If the hash chain is not empty, indicating that the
+/// sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
+/// hash function value) has occurred recently, the compressor compares all
+/// strings on the XYZW hash chain with the actual input data sequence
+/// starting at the current point, and selects the longest match.
+///
+/// To improve overall compression, the compressor defers the selection of
+/// matches ("lazy matching"): after a match of length N has been found, the
+/// compressor searches for a longer match starting at the next input byte. If
+/// it finds a longer match, it truncates the previous match to a length of
+/// one (thus producing a single literal byte) and then emits the longer
+/// match. Otherwise, it emits the original match, and, as described above,
+/// advances N bytes before continuing.
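+///
+/// As a concrete example (taken from the tokenization test below), the input
+/// "Blah blah blah blah blah!" tokenizes to the literals 'B', 'l', 'a', 'h',
+/// ' ', 'b', then a single match with distance 5 and length 18, and a final
+/// literal '!'.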
+///
+///
+/// Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
+///
+/// The Deflate function accepts a BlockWriterType so that tests can swap it out
+/// and exercise just the tokenization part.
+///
+fn Deflate(comptime container: Container, comptime WriterType: type, comptime BlockWriterType: type) type {
+ return struct {
+ lookup: Lookup = .{},
+ win: SlidingWindow = .{},
+ tokens: Tokens = .{},
+ wrt: WriterType,
+ block_writer: BlockWriterType,
+ level: LevelArgs,
+ hasher: container.Hasher() = .{},
+
+ // Match and literal at the previous position.
+ // Used for lazy match finding in tokenize.
+ prev_match: ?Token = null,
+ prev_literal: ?u8 = null,
+
+ const Self = @This();
+
+ pub fn init(wrt: WriterType, options: Options) !Self {
+ const self = Self{
+ .wrt = wrt,
+ .block_writer = BlockWriterType.init(wrt),
+ .level = LevelArgs.get(options.level),
+ };
+ try container.writeHeader(self.wrt);
+ return self;
+ }
+
+ const FlushOption = enum { none, flush, final };
+
+ // Process data in window and create tokens. If token buffer is full
+ // flush tokens to the token writer. In the case of `flush` or `final`
+ // option it will process all data from the window. In the `none` case
+ // it will preserve some data for the next match.
+ fn tokenize(self: *Self, flush_opt: FlushOption) !void {
+ // flush - process all data from window
+ const should_flush = (flush_opt != .none);
+
+ // While there is data in active lookahead buffer.
+ while (self.win.activeLookahead(should_flush)) |lh| {
+ var step: u16 = 1; // 1 in the case of literal, match length otherwise
+ const pos: u16 = self.win.pos();
+ const literal = lh[0]; // literal at current position
+ const min_len: u16 = if (self.prev_match) |m| m.length() else 0;
+
+ // Try to find match at least min_len long.
+ if (self.findMatch(pos, lh, min_len)) |match| {
+ // Found better match than previous.
+ try self.addPrevLiteral();
+
+ // Is found match length good enough?
+ if (match.length() >= self.level.lazy) {
+ // Don't try to lazy find better match, use this.
+ step = try self.addMatch(match);
+ } else {
+ // Store this match.
+ self.prev_literal = literal;
+ self.prev_match = match;
+ }
+ } else {
+ // There is no better match at the current position than at the previous one.
+ // Write previous match or literal.
+ if (self.prev_match) |m| {
+ // Write match from previous position.
+ step = try self.addMatch(m) - 1; // we already advanced 1 from previous position
+ } else {
+ // No match at the previous position.
+ // Write previous literal if any, and remember this literal.
+ try self.addPrevLiteral();
+ self.prev_literal = literal;
+ }
+ }
+ // Advance window and add hashes.
+ self.windowAdvance(step, lh, pos);
+ }
+
+ if (should_flush) {
+ // In the case of flushing, the last few lookahead buffers were smaller than the minimum match length.
+ // So only the last literal can be unwritten.
+ assert(self.prev_match == null);
+ try self.addPrevLiteral();
+ self.prev_literal = null;
+
+ try self.flushTokens(flush_opt);
+ }
+ }
+
+ fn windowAdvance(self: *Self, step: u16, lh: []const u8, pos: u16) void {
+ // current position is already added in findMatch
+ self.lookup.bulkAdd(lh[1..], step - 1, pos + 1);
+ self.win.advance(step);
+ }
+
+ // Add previous literal (if any) to the tokens list.
+ fn addPrevLiteral(self: *Self) !void {
+ if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
+ }
+
+ // Add match to the tokens list, reset prev pointers.
+ // Returns length of the added match.
+ fn addMatch(self: *Self, m: Token) !u16 {
+ try self.addToken(m);
+ self.prev_literal = null;
+ self.prev_match = null;
+ return m.length();
+ }
+
+ fn addToken(self: *Self, token: Token) !void {
+ self.tokens.add(token);
+ if (self.tokens.full()) try self.flushTokens(.none);
+ }
+
+ // Finds largest match in the history window with the data at current pos.
+ fn findMatch(self: *Self, pos: u16, lh: []const u8, min_len: u16) ?Token {
+ var len: u16 = min_len;
+ // Previous location with the same hash (same 4 bytes).
+ var prev_pos = self.lookup.add(lh, pos);
+ // Last found match.
+ var match: ?Token = null;
+
+ // How many back-references to try; performance knob.
+ var chain: usize = self.level.chain;
+ if (len >= self.level.good) {
+ // If we've got a match that's good enough, only look in 1/4 the chain.
+ chain >>= 2;
+ }
+
+ // Hot path loop!
+ while (prev_pos > 0 and chain > 0) : (chain -= 1) {
+ const distance = pos - prev_pos;
+ if (distance > consts.match.max_distance)
+ break;
+
+ const new_len = self.win.match(prev_pos, pos, len);
+ if (new_len > len) {
+ match = Token.initMatch(@intCast(distance), new_len);
+ if (new_len >= self.level.nice) {
+ // The match is good enough that we don't try to find a better one.
+ return match;
+ }
+ len = new_len;
+ }
+ prev_pos = self.lookup.prev(prev_pos);
+ }
+
+ return match;
+ }
+
+ fn flushTokens(self: *Self, flush_opt: FlushOption) !void {
+ // Pass tokens to the token writer
+ try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer());
+ // A stored block ensures byte alignment.
+ // It has 3 bits (final, block_type) and then padding up to the byte boundary.
+ // After that, everything in the stored block is aligned to the byte boundary.
+ // An empty stored block is 0b000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF.
+ // Last 4 bytes are byte aligned.
+ if (flush_opt == .flush) {
+ try self.block_writer.storedBlock("", false);
+ }
+ if (flush_opt != .none) {
+ // Safe to call only when byte aligned or it is OK to add
+ // padding bits (on last byte of the final block).
+ try self.block_writer.flush();
+ }
+ // Reset internal tokens store.
+ self.tokens.reset();
+ // Notify win that tokens are flushed.
+ self.win.flush();
+ }
+
+ // Slide win and if needed lookup tables.
+ fn slide(self: *Self) void {
+ const n = self.win.slide();
+ self.lookup.slide(n);
+ }
+
+ /// Compresses as much data as possible and stops when the reader becomes
+ /// empty. It will introduce some output latency (reading input without
+ /// producing all output) because some data is still held in internal
+ /// buffers.
+ ///
+ /// It is up to the caller to call flush (if needed) or finish (required)
+ /// when it needs to output any pending data or complete the stream.
+ ///
+ pub fn compress(self: *Self, reader: anytype) !void {
+ while (true) {
+ // Fill window from reader
+ const buf = self.win.writable();
+ if (buf.len == 0) {
+ try self.tokenize(.none);
+ self.slide();
+ continue;
+ }
+ const n = try reader.readAll(buf);
+ self.hasher.update(buf[0..n]);
+ self.win.written(n);
+ // Process window
+ try self.tokenize(.none);
+ // Exit when no more data in reader
+ if (n < buf.len) break;
+ }
+ }
+
+ /// Flushes internal buffers to the output writer. Outputs empty stored
+ /// block to sync bit stream to the byte boundary, so that the
+ /// decompressor can get all input data available so far.
+ ///
+ /// It is useful mainly in compressed network protocols, to ensure that
+ /// deflate bit stream can be used as byte stream. May degrade
+ /// compression so it should be used only when necessary.
+ ///
+ /// Completes the current deflate block and follows it with an empty
+ /// stored block that is three zero bits plus filler bits to the next
+ /// byte, followed by four bytes (00 00 ff ff).
+ ///
+ pub fn flush(self: *Self) !void {
+ try self.tokenize(.flush);
+ }
+
+ /// Completes the deflate bit stream by writing any pending data as the
+ /// final deflate block. HAS to be called once all data is written to
+ /// the compressor, as a signal that the last block must have the final
+ /// bit set.
+ ///
+ pub fn finish(self: *Self) !void {
+ try self.tokenize(.final);
+ try container.writeFooter(&self.hasher, self.wrt);
+ }
+
+ /// Use another writer while preserving history. Flush should most likely
+ /// be called on the old writer before setting the new one.
+ pub fn setWriter(self: *Self, new_writer: WriterType) void {
+ self.block_writer.setWriter(new_writer);
+ self.wrt = new_writer;
+ }
+
+ // Writer interface
+
+ pub const Writer = io.Writer(*Self, Error, write);
+ pub const Error = BlockWriterType.Error;
+
+ /// Write `input` of uncompressed data.
+ /// See compress.
+ pub fn write(self: *Self, input: []const u8) !usize {
+ var fbs = io.fixedBufferStream(input);
+ try self.compress(fbs.reader());
+ return input.len;
+ }
+
+ pub fn writer(self: *Self) Writer {
+ return .{ .context = self };
+ }
+ };
+}
+
+// Tokens store
+const Tokens = struct {
+ list: [consts.deflate.tokens]Token = undefined,
+ pos: usize = 0,
+
+ fn add(self: *Tokens, t: Token) void {
+ self.list[self.pos] = t;
+ self.pos += 1;
+ }
+
+ fn full(self: *Tokens) bool {
+ return self.pos == self.list.len;
+ }
+
+ fn reset(self: *Tokens) void {
+ self.pos = 0;
+ }
+
+ fn tokens(self: *Tokens) []const Token {
+ return self.list[0..self.pos];
+ }
+};
+
+/// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and
+/// only performs Huffman entropy encoding. Results in faster compression and much
+/// lower memory requirements during compression, but bigger compressed sizes.
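+///
+/// Example (a minimal sketch; `reader` and `writer` are any caller supplied
+/// stream pair):
+///
+///   var c = try huffman.compressor(.gzip, writer);
+///   try c.compress(reader);
+///   try c.finish();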
+pub const huffman = struct {
+ pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
+ var c = try huffman.compressor(container, writer);
+ try c.compress(reader);
+ try c.finish();
+ }
+
+ pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
+ return SimpleCompressor(.huffman, container, WriterType);
+ }
+
+ pub fn compressor(comptime container: Container, writer: anytype) !huffman.Compressor(container, @TypeOf(writer)) {
+ return try huffman.Compressor(container, @TypeOf(writer)).init(writer);
+ }
+};
+
+/// Creates store blocks only. Data is not compressed, only packed into deflate
+/// store blocks. That adds 9 bytes of header for each block. Max stored block
+/// size is 64K. A block is emitted when flush is called or on finish.
+pub const store = struct {
+ pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
+ var c = try store.compressor(container, writer);
+ try c.compress(reader);
+ try c.finish();
+ }
+
+ pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
+ return SimpleCompressor(.store, container, WriterType);
+ }
+
+ pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) {
+ return try store.Compressor(container, @TypeOf(writer)).init(writer);
+ }
+};
+
+const SimpleCompressorKind = enum {
+ huffman,
+ store,
+};
+
+fn simpleCompressor(
+ comptime kind: SimpleCompressorKind,
+ comptime container: Container,
+ writer: anytype,
+) !SimpleCompressor(kind, container, @TypeOf(writer)) {
+ return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
+}
+
+fn SimpleCompressor(
+ comptime kind: SimpleCompressorKind,
+ comptime container: Container,
+ comptime WriterType: type,
+) type {
+ const BlockWriterType = BlockWriter(WriterType);
+ return struct {
+ buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
+ wp: usize = 0,
+
+ wrt: WriterType,
+ block_writer: BlockWriterType,
+ hasher: container.Hasher() = .{},
+
+ const Self = @This();
+
+ pub fn init(wrt: WriterType) !Self {
+ const self = Self{
+ .wrt = wrt,
+ .block_writer = BlockWriterType.init(wrt),
+ };
+ try container.writeHeader(self.wrt);
+ return self;
+ }
+
+ pub fn flush(self: *Self) !void {
+ try self.flushBuffer(false);
+ try self.block_writer.storedBlock("", false);
+ try self.block_writer.flush();
+ }
+
+ pub fn finish(self: *Self) !void {
+ try self.flushBuffer(true);
+ try self.block_writer.flush();
+ try container.writeFooter(&self.hasher, self.wrt);
+ }
+
+ fn flushBuffer(self: *Self, final: bool) !void {
+ const buf = self.buffer[0..self.wp];
+ switch (kind) {
+ .huffman => try self.block_writer.huffmanBlock(buf, final),
+ .store => try self.block_writer.storedBlock(buf, final),
+ }
+ self.wp = 0;
+ }
+
+ // Writes all data from the input reader of uncompressed data.
+ // It is up to the caller to call flush or finish if there is a need to
+ // output compressed blocks.
+ pub fn compress(self: *Self, reader: anytype) !void {
+ while (true) {
+ // read from rdr into buffer
+ const buf = self.buffer[self.wp..];
+ if (buf.len == 0) {
+ try self.flushBuffer(false);
+ continue;
+ }
+ const n = try reader.readAll(buf);
+ self.hasher.update(buf[0..n]);
+ self.wp += n;
+ if (n < buf.len) break; // no more data in reader
+ }
+ }
+
+ // Writer interface
+
+ pub const Writer = io.Writer(*Self, Error, write);
+ pub const Error = BlockWriterType.Error;
+
+ // Write `input` of uncompressed data.
+ pub fn write(self: *Self, input: []const u8) !usize {
+ var fbs = io.fixedBufferStream(input);
+ try self.compress(fbs.reader());
+ return input.len;
+ }
+
+ pub fn writer(self: *Self) Writer {
+ return .{ .context = self };
+ }
+ };
+}
+
+test "flate.Deflate tokenization" {
+ const L = Token.initLiteral;
+ const M = Token.initMatch;
+
+ const cases = [_]struct {
+ data: []const u8,
+ tokens: []const Token,
+ }{
+ .{
+ .data = "Blah blah blah blah blah!",
+ .tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
+ },
+ .{
+ .data = "ABCDEABCD ABCDEABCD",
+ .tokens = &[_]Token{
+ L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
+ L('A'), M(10, 8),
+ },
+ },
+ };
+
+ for (cases) |c| {
+ inline for (Container.list) |container| { // for each wrapping
+ var cw = io.countingWriter(io.null_writer);
+ const cww = cw.writer();
+ var df = try Deflate(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
+
+ _ = try df.write(c.data);
+ try df.flush();
+
+ // df.token_writer.show();
+ try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
+ try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
+
+ try testing.expectEqual(container.headerSize(), cw.bytes_written);
+ try df.finish();
+ try testing.expectEqual(container.size(), cw.bytes_written);
+ }
+ }
+}
+
+// Tests that the tokens written are equal to the expected token list.
+const TestTokenWriter = struct {
+ const Self = @This();
+ //expected: []const Token,
+ pos: usize = 0,
+ actual: [1024]Token = undefined,
+
+ pub fn init(_: anytype) Self {
+ return .{};
+ }
+ pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
+ for (tokens) |t| {
+ self.actual[self.pos] = t;
+ self.pos += 1;
+ }
+ }
+
+ pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
+
+ pub fn get(self: *Self) []Token {
+ return self.actual[0..self.pos];
+ }
+
+ pub fn show(self: *Self) void {
+ print("\n", .{});
+ for (self.get()) |t| {
+ t.show();
+ }
+ }
+
+ pub fn flush(_: *Self) !void {}
+};
+
+test "flate.Deflate struct sizes" {
+ try expect(@sizeOf(Token) == 4);
+
+ // list: (1 << 15) * 4 = 128k + pos: 8
+ const tokens_size = 128 * 1024 + 8;
+ try expect(@sizeOf(Tokens) == tokens_size);
+
+ // head: (1 << 15) * 2 = 64k, chain: (32768 * 2) * 2 = 128k = 192k
+ const lookup_size = 192 * 1024;
+ try expect(@sizeOf(Lookup) == lookup_size);
+
+ // buffer: (32k * 2), wp: 8, rp: 8, fp: 8
+ const window_size = 64 * 1024 + 8 + 8 + 8;
+ try expect(@sizeOf(SlidingWindow) == window_size);
+
+ const Bw = BlockWriter(@TypeOf(io.null_writer));
+ // huffman bit writer internal: 11480
+ const hbw_size = 11472; // 11.2k
+ try expect(@sizeOf(Bw) == hbw_size);
+
+ const D = Deflate(.raw, @TypeOf(io.null_writer), Bw);
+ // 404744, 395.26K
+ // ?Token: 6, ?u8: 2, level: 8
+ try expect(@sizeOf(D) == tokens_size + lookup_size + window_size + hbw_size + 24);
+ //print("Delfate size: {d} {d}\n", .{ @sizeOf(D), tokens_size + lookup_size + hbw_size + window_size });
+
+ // current std lib deflate allocation:
+ // 797_901, 779.2k
+ // measured with:
+ // var la = std.heap.logToWriterAllocator(testing.allocator, io.getStdOut().writer());
+ // const allocator = la.allocator();
+ // var cmp = try std.compress.deflate.compressor(allocator, io.null_writer, .{});
+ // defer cmp.deinit();
+
+ const HC = huffman.Compressor(.raw, @TypeOf(io.null_writer));
+ //print("size of HOC {d}\n", .{@sizeOf(HOC)});
+ try expect(@sizeOf(HC) == 77024);
+ // 64K buffer
+ // 11480 huffman_encoded
+ // 8 buffer write pointer
+}
+
+test "flate deflate file tokenization" {
+ const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
+ const cases = [_]struct {
+ data: []const u8, // uncompressed content
+ // expected number of tokens produced in deflate tokenization
+ tokens_count: [levels.len]usize = .{0} ** levels.len,
+ }{
+ .{
+ .data = @embedFile("testdata/rfc1951.txt"),
+ .tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
+ },
+
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-null-max.input"),
+ .tokens_count = .{ 257, 257, 257, 257, 257, 257 },
+ },
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-pi.input"),
+ .tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
+ },
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-text.input"),
+ .tokens_count = .{ 235, 234, 234, 234, 234, 234 },
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip1.input"),
+ .tokens_count = .{ 333, 331, 331, 331, 331, 331 },
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip2.input"),
+ .tokens_count = .{ 334, 334, 334, 334, 334, 334 },
+ },
+ };
+
+ for (cases) |case| { // for each case
+ const data = case.data;
+
+ for (levels, 0..) |level, i| { // for each compression level
+ var original = io.fixedBufferStream(data);
+
+ // buffer for decompressed data
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+ const writer = al.writer();
+
+ // create compressor
+ const WriterType = @TypeOf(writer);
+ const TokenWriter = TokenDecoder(@TypeOf(writer));
+ var cmp = try Deflate(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
+
+ // Stream uncompressed `original` data to the compressor. It will
+ // produce a token list and pass that list to the TokenDecoder. The
+ // TokenDecoder uses the CircularBuffer from inflate to convert the list
+ // of tokens back to the uncompressed stream.
+ try cmp.compress(original.reader());
+ try cmp.flush();
+ const expected_count = case.tokens_count[i];
+ const actual = cmp.block_writer.tokens_count;
+ if (expected_count == 0) {
+ print("actual token count {d}\n", .{actual});
+ } else {
+ try testing.expectEqual(expected_count, actual);
+ }
+
+ try testing.expectEqual(data.len, al.items.len);
+ try testing.expectEqualSlices(u8, data, al.items);
+ }
+ }
+}
+
+fn TokenDecoder(comptime WriterType: type) type {
+ return struct {
+ const CircularBuffer = @import("CircularBuffer.zig");
+ hist: CircularBuffer = .{},
+ wrt: WriterType,
+ tokens_count: usize = 0,
+
+ const Self = @This();
+
+ pub fn init(wrt: WriterType) Self {
+ return .{ .wrt = wrt };
+ }
+
+ pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
+ self.tokens_count += tokens.len;
+ for (tokens) |t| {
+ switch (t.kind) {
+ .literal => self.hist.write(t.literal()),
+ .match => try self.hist.writeMatch(t.length(), t.distance()),
+ }
+ if (self.hist.free() < 285) try self.flushWin();
+ }
+ try self.flushWin();
+ }
+
+ pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
+
+ fn flushWin(self: *Self) !void {
+ while (true) {
+ const buf = self.hist.read();
+ if (buf.len == 0) break;
+ try self.wrt.writeAll(buf);
+ }
+ }
+
+ pub fn flush(_: *Self) !void {}
+ };
+}
+
+test "flate.Deflate store simple compressor" {
+ const data = "Hello world!";
+ const expected = [_]u8{
+ 0x1, // block type 0, final bit set
+ 0xc, 0x0, // len = 12
+ 0xf3, 0xff, // ~len
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
+ //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
+ };
+
+ var fbs = std.io.fixedBufferStream(data);
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+
+ var cmp = try store.compressor(.raw, al.writer());
+ try cmp.compress(fbs.reader());
+ try cmp.finish();
+ try testing.expectEqualSlices(u8, &expected, al.items);
+
+ fbs.reset();
+ try al.resize(0);
+
+ // The huffman only compressor will also emit a store block for this small sample.
+ var hc = try huffman.compressor(.raw, al.writer());
+ try hc.compress(fbs.reader());
+ try hc.finish();
+ try testing.expectEqualSlices(u8, &expected, al.items);
+}
lib/std/compress/flate/flate.zig
@@ -0,0 +1,236 @@
+/// Deflate is a lossless data compression file format that uses a combination
+/// of LZ77 and Huffman coding.
+pub const deflate = @import("deflate.zig");
+
+/// Inflate is the decoding process that takes a Deflate bitstream for
+/// decompression and correctly produces the original full-size data or file.
+pub const inflate = @import("inflate.zig");
+
+/// Decompress compressed data from reader and write plain data to the writer.
+pub fn decompress(reader: anytype, writer: anytype) !void {
+ try inflate.decompress(.raw, reader, writer);
+}
+
+/// Decompressor type
+pub fn Decompressor(comptime ReaderType: type) type {
+ return inflate.Inflate(.raw, ReaderType);
+}
+
+/// Create Decompressor which will read compressed data from reader.
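+///
+/// Example (a minimal sketch; `compressed_reader` and a destination buffer
+/// `buf` are assumed):
+///
+///   var dcp = decompressor(compressed_reader);
+///   const n = try dcp.reader().readAll(&buf);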
+pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
+ return inflate.decompressor(.raw, reader);
+}
+
+/// Compression level, trades between speed and compression size.
+pub const Options = deflate.Options;
+
+/// Compress plain data from reader and write compressed data to the writer.
+pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
+ try deflate.compress(.raw, reader, writer, options);
+}
+
+/// Compressor type
+pub fn Compressor(comptime WriterType: type) type {
+ return deflate.Compressor(.raw, WriterType);
+}
+
+/// Create Compressor which outputs compressed data to the writer.
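+///
+/// Example (a minimal sketch; `underlying_writer` and `data` are assumed):
+///
+///   var cmp = try compressor(underlying_writer, .{});
+///   try cmp.writer().writeAll(data);
+///   try cmp.finish();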
+pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
+ return try deflate.compressor(.raw, writer, options);
+}
+
+/// Huffman only compression, without Lempel-Ziv match searching. Faster
+/// compression and lower memory requirements, but bigger compressed sizes.
+pub const huffman = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.huffman.compress(.raw, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.huffman.Compressor(.raw, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
+ return deflate.huffman.compressor(.raw, writer);
+ }
+};
+
+// No compression, store only. Compressed size is slightly bigger than plain.
+pub const store = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.store.compress(.raw, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.store.Compressor(.raw, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
+ return deflate.store.compressor(.raw, writer);
+ }
+};
+
+/// Container defines the header/footer around the deflate bit stream. The gzip
+/// and zlib formats are containers around the deflate bit stream body.
+const Container = @import("container.zig").Container;
+const std = @import("std");
+const testing = std.testing;
+const fixedBufferStream = std.io.fixedBufferStream;
+const print = std.debug.print;
+
+test "flate compress/decompress" {
+ var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer
+ var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer
+
+ const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
+ const cases = [_]struct {
+ data: []const u8, // uncompressed content
+ // compressed data sizes per level 4-9
+ gzip_sizes: [levels.len]usize = [_]usize{0} ** levels.len,
+ huffman_only_size: usize = 0,
+ store_size: usize = 0,
+ }{
+ .{
+ .data = @embedFile("testdata/rfc1951.txt"),
+ .gzip_sizes = [_]usize{ 11513, 11217, 11139, 11126, 11122, 11119 },
+ .huffman_only_size = 20287,
+ .store_size = 36967,
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip1.input"),
+ .gzip_sizes = [_]usize{ 373, 370, 370, 370, 370, 370 },
+ .huffman_only_size = 393,
+ .store_size = 393,
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip2.input"),
+ .gzip_sizes = [_]usize{ 373, 373, 373, 373, 373, 373 },
+ .huffman_only_size = 394,
+ .store_size = 394,
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/deflate-stream.expect"),
+ .gzip_sizes = [_]usize{ 351, 347, 347, 347, 347, 347 },
+ .huffman_only_size = 498,
+ .store_size = 747,
+ },
+ };
+
+ for (cases, 0..) |case, case_no| { // for each case
+ const data = case.data;
+
+ for (levels, 0..) |level, i| { // for each compression level
+
+ inline for (Container.list) |container| { // for each wrapping
+ var compressed_size: usize = if (case.gzip_sizes[i] > 0)
+ case.gzip_sizes[i] - Container.gzip.size() + container.size()
+ else
+ 0;
+
+ // compress original stream to compressed stream
+ {
+ var original = fixedBufferStream(data);
+ var compressed = fixedBufferStream(&cmp_buf);
+ try deflate.compress(container, original.reader(), compressed.writer(), .{ .level = level });
+ if (compressed_size == 0) {
+ if (container == .gzip)
+ print("case {d} gzip level {} compressed size: {d}\n", .{ case_no, level, compressed.pos });
+ compressed_size = compressed.pos;
+ }
+ try testing.expectEqual(compressed_size, compressed.pos);
+ }
+ // decompress compressed stream to decompressed stream
+ {
+ var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
+ var decompressed = fixedBufferStream(&dcm_buf);
+ try inflate.decompress(container, compressed.reader(), decompressed.writer());
+ try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ }
+
+ // compressor writer interface
+ {
+ var compressed = fixedBufferStream(&cmp_buf);
+ var cmp = try deflate.compressor(container, compressed.writer(), .{ .level = level });
+ var cmp_wrt = cmp.writer();
+ try cmp_wrt.writeAll(data);
+ try cmp.finish();
+
+ try testing.expectEqual(compressed_size, compressed.pos);
+ }
+ // decompressor reader interface
+ {
+ var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
+ var dcm = inflate.decompressor(container, compressed.reader());
+ var dcm_rdr = dcm.reader();
+ const n = try dcm_rdr.readAll(&dcm_buf);
+ try testing.expectEqual(data.len, n);
+ try testing.expectEqualSlices(u8, data, dcm_buf[0..n]);
+ }
+ }
+ }
+ // huffman only compression
+ {
+ inline for (Container.list) |container| { // for each wrapping
+ var compressed_size: usize = if (case.huffman_only_size > 0)
+ case.huffman_only_size - Container.gzip.size() + container.size()
+ else
+ 0;
+
+ // compress original stream to compressed stream
+ {
+ var original = fixedBufferStream(data);
+ var compressed = fixedBufferStream(&cmp_buf);
+ var cmp = try deflate.huffman.compressor(container, compressed.writer());
+ try cmp.compress(original.reader());
+ try cmp.finish();
+ if (compressed_size == 0) {
+ if (container == .gzip)
+ print("case {d} huffman only compressed size: {d}\n", .{ case_no, compressed.pos });
+ compressed_size = compressed.pos;
+ }
+ try testing.expectEqual(compressed_size, compressed.pos);
+ }
+ // decompress compressed stream to decompressed stream
+ {
+ var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
+ var decompressed = fixedBufferStream(&dcm_buf);
+ try inflate.decompress(container, compressed.reader(), decompressed.writer());
+ try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ }
+ }
+ }
+
+ // store only
+ {
+ inline for (Container.list) |container| { // for each wrapping
+ var compressed_size: usize = if (case.store_size > 0)
+ case.store_size - Container.gzip.size() + container.size()
+ else
+ 0;
+
+ // compress original stream to compressed stream
+ {
+ var original = fixedBufferStream(data);
+ var compressed = fixedBufferStream(&cmp_buf);
+ var cmp = try deflate.store.compressor(container, compressed.writer());
+ try cmp.compress(original.reader());
+ try cmp.finish();
+ if (compressed_size == 0) {
+ if (container == .gzip)
+ print("case {d} store only compressed size: {d}\n", .{ case_no, compressed.pos });
+ compressed_size = compressed.pos;
+ }
+
+ try testing.expectEqual(compressed_size, compressed.pos);
+ }
+ // decompress compressed stream to decompressed stream
+ {
+ var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
+ var decompressed = fixedBufferStream(&dcm_buf);
+ try inflate.decompress(container, compressed.reader(), decompressed.writer());
+ try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ }
+ }
+ }
+ }
+}
lib/std/compress/flate/gzip.zig
@@ -0,0 +1,66 @@
+const deflate = @import("deflate.zig");
+const inflate = @import("inflate.zig");
+
+/// Decompress compressed data from reader and write plain data to the writer.
+pub fn decompress(reader: anytype, writer: anytype) !void {
+ try inflate.decompress(.gzip, reader, writer);
+}
+
+/// Decompressor type
+pub fn Decompressor(comptime ReaderType: type) type {
+ return inflate.Inflate(.gzip, ReaderType);
+}
+
+/// Create Decompressor which will read compressed data from reader.
+pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
+ return inflate.decompressor(.gzip, reader);
+}
+
+/// Compression level, trades between speed and compression size.
+pub const Options = deflate.Options;
+
+/// Compress plain data from reader and write compressed data to the writer.
+pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
+ try deflate.compress(.gzip, reader, writer, options);
+}
+
+/// Compressor type
+pub fn Compressor(comptime WriterType: type) type {
+ return deflate.Compressor(.gzip, WriterType);
+}
+
+/// Create Compressor which outputs compressed data to the writer.
+pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
+ return try deflate.compressor(.gzip, writer, options);
+}
+
+/// Huffman only compression, without Lempel-Ziv match searching. Faster
+/// compression and lower memory requirements, but bigger compressed sizes.
+pub const huffman = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.huffman.compress(.gzip, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.huffman.Compressor(.gzip, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
+ return deflate.huffman.compressor(.gzip, writer);
+ }
+};
+
+// No compression, store only. Compressed size is slightly bigger than plain.
+pub const store = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.store.compress(.gzip, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.store.Compressor(.gzip, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
+ return deflate.store.compressor(.gzip, writer);
+ }
+};
lib/std/compress/flate/huffman_decoder.zig
@@ -0,0 +1,308 @@
+const std = @import("std");
+const testing = std.testing;
+
+pub const Symbol = packed struct {
+ pub const Kind = enum(u2) {
+ literal,
+ end_of_block,
+ match,
+ };
+
+ symbol: u8 = 0, // symbol from alphabet
+ code_bits: u4 = 0, // number of bits in code 0-15
+ kind: Kind = .literal,
+
+ code: u16 = 0, // huffman code of the symbol
+ next: u16 = 0, // pointer to the next symbol in linked list
+ // it is safe to use 0 as a null pointer; when sorted, 0 has the shortest code and fits into the lookup table
+
+ // Sorting less than function.
+ pub fn asc(_: void, a: Symbol, b: Symbol) bool {
+ if (a.code_bits == b.code_bits) {
+ if (a.kind == b.kind) {
+ return a.symbol < b.symbol;
+ }
+ return @intFromEnum(a.kind) < @intFromEnum(b.kind);
+ }
+ return a.code_bits < b.code_bits;
+ }
+};
+
+pub const LiteralDecoder = HuffmanDecoder(286, 15, 9);
+pub const DistanceDecoder = HuffmanDecoder(30, 15, 9);
+pub const CodegenDecoder = HuffmanDecoder(19, 7, 7);
+
+pub const Error = error{
+ InvalidCode,
+ OversubscribedHuffmanTree,
+ IncompleteHuffmanTree,
+ MissingEndOfBlockCode,
+};
+
+/// Creates huffman tree codes from a list of code lengths (in `generate`).
+///
+/// `find` then finds the symbol for the given code bits. A code can be any
+/// length between 1 and 15 bits. When calling `find` we don't know how many
+/// bits will be used to find the symbol. The returned symbol has a code_bits
+/// field which defines how far we should advance in the bit stream.
+///
+/// A full lookup table mapping every 15 bit code to a symbol would need 32K
+/// entries for at most 286 symbols, with the same symbol repeated many times.
+/// The small lookup table is an optimization for faster search.
+/// It is a variation of the algorithm explained in [zlib](https://github.com/madler/zlib/blob/643e17b7498d12ab8d15565662880579692f769d/doc/algorithm.txt#L92)
+/// with the difference that here we use statically allocated arrays.
+///
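+/// Example usage (a sketch; `code_lens` is a caller supplied []const u4 of code
+/// lengths and `code` holds the next max_code_bits bits from the stream):
+///
+///   var d: CodegenDecoder = .{};
+///   try d.generate(code_lens);
+///   const sym = try d.find(code);
+///   // then advance the bit stream by sym.code_bits
+///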
+fn HuffmanDecoder(
+ comptime alphabet_size: u16,
+ comptime max_code_bits: u4,
+ comptime lookup_bits: u4,
+) type {
+ const lookup_shift = max_code_bits - lookup_bits;
+
+ return struct {
+ // all symbols in the alphabet, sorted by code_bits, kind, symbol
+ symbols: [alphabet_size]Symbol = undefined,
+ // lookup table code -> symbol
+ lookup: [1 << lookup_bits]Symbol = undefined,
+
+ const Self = @This();
+
+ /// Generates symbols and lookup tables from list of code lens for each symbol.
+ pub fn generate(self: *Self, lens: []const u4) !void {
+ try checkCompletnes(lens);
+
+ // init alphabet with code_bits
+ for (self.symbols, 0..) |_, i| {
+ const cb: u4 = if (i < lens.len) lens[i] else 0;
+ self.symbols[i] = if (i < 256)
+ .{ .kind = .literal, .symbol = @intCast(i), .code_bits = cb }
+ else if (i == 256)
+ .{ .kind = .end_of_block, .symbol = 0xff, .code_bits = cb }
+ else
+ .{ .kind = .match, .symbol = @intCast(i - 257), .code_bits = cb };
+ }
+ std.sort.heap(Symbol, &self.symbols, {}, Symbol.asc);
+
+ // reset lookup table
+ for (0..self.lookup.len) |i| {
+ self.lookup[i] = .{};
+ }
+
+ // assign code to symbols
+ // reference: https://youtu.be/9_YEGLe33NA?list=PLU4IQLU9e_OrY8oASHx0u3IXAL9TOdidm&t=2639
+ var code: u16 = 0;
+ var idx: u16 = 0;
+ for (&self.symbols, 0..) |*sym, pos| {
+ //print("sym: {}\n", .{sym});
+ if (sym.code_bits == 0) continue; // skip unused
+ sym.code = code;
+
+ const next_code = code + (@as(u16, 1) << (max_code_bits - sym.code_bits));
+ const next_idx = next_code >> lookup_shift;
+
+ if (next_idx > self.lookup.len or idx >= self.lookup.len) break;
+ if (sym.code_bits <= lookup_bits) {
+ // fill small lookup table
+ for (idx..next_idx) |j|
+ self.lookup[j] = sym.*;
+ } else {
+ // insert into linked table starting at root
+ const root = &self.lookup[idx];
+ const root_next = root.next;
+ root.next = @intCast(pos);
+ sym.next = root_next;
+ }
+
+ idx = next_idx;
+ code = next_code;
+ }
+ //print("decoder generate, code: {d}, idx: {d}\n", .{ code, idx });
+ }
+
+ /// Given the list of code lengths, check that it represents a canonical
+ /// Huffman code for n symbols.
+ ///
+ /// Reference: https://github.com/madler/zlib/blob/5c42a230b7b468dff011f444161c0145b5efae59/contrib/puff/puff.c#L340
+ fn checkCompletnes(lens: []const u4) !void {
+ if (alphabet_size == 286)
+ if (lens[256] == 0) return error.MissingEndOfBlockCode;
+
+ var count = [_]u16{0} ** (@as(usize, max_code_bits) + 1);
+ var max: usize = 0;
+ for (lens) |n| {
+ if (n == 0) continue;
+ if (n > max) max = n;
+ count[n] += 1;
+ }
+ if (max == 0) // empty tree
+ return;
+
+ // check for an over-subscribed or incomplete set of lengths
+ var left: usize = 1; // one possible code of zero length
+ for (1..count.len) |len| {
+ left <<= 1; // one more bit, double codes left
+ if (count[len] > left)
+ return error.OversubscribedHuffmanTree;
+ left -= count[len]; // deduct count from possible codes
+ }
+ if (left > 0) { // left > 0 means incomplete
+ // incomplete code ok only for single length 1 code
+ if (max_code_bits > 7 and max == count[0] + count[1]) return;
+ return error.IncompleteHuffmanTree;
+ }
+ }
+
+ /// Finds symbol for lookup table code.
+ pub fn find(self: *Self, code: u16) !Symbol {
+ // try to find in lookup table
+ const idx = code >> lookup_shift;
+ const sym = self.lookup[idx];
+ if (sym.code_bits != 0) return sym;
+ // if not use linked list of symbols with same prefix
+ return self.findLinked(code, sym.next);
+ }
+
+ inline fn findLinked(self: *Self, code: u16, start: u16) !Symbol {
+ var pos = start;
+ while (pos > 0) {
+ const sym = self.symbols[pos];
+ const shift = max_code_bits - sym.code_bits;
+ // compare code_bits number of upper bits
+ if ((code ^ sym.code) >> shift == 0) return sym;
+ pos = sym.next;
+ }
+ return error.InvalidCode;
+ }
+ };
+}
+
+test "flate.HuffmanDecoder init/find" {
+ // example data from: https://youtu.be/SJPvNi4HrWQ?t=8423
+ const code_lens = [_]u4{ 4, 3, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2 };
+ var h: CodegenDecoder = .{};
+ try h.generate(&code_lens);
+
+ const expected = [_]struct {
+ sym: Symbol,
+ code: u16,
+ }{
+ .{
+ .code = 0b00_00000,
+ .sym = .{ .symbol = 3, .code_bits = 2 },
+ },
+ .{
+ .code = 0b01_00000,
+ .sym = .{ .symbol = 18, .code_bits = 2 },
+ },
+ .{
+ .code = 0b100_0000,
+ .sym = .{ .symbol = 1, .code_bits = 3 },
+ },
+ .{
+ .code = 0b101_0000,
+ .sym = .{ .symbol = 4, .code_bits = 3 },
+ },
+ .{
+ .code = 0b110_0000,
+ .sym = .{ .symbol = 17, .code_bits = 3 },
+ },
+ .{
+ .code = 0b1110_000,
+ .sym = .{ .symbol = 0, .code_bits = 4 },
+ },
+ .{
+ .code = 0b1111_000,
+ .sym = .{ .symbol = 16, .code_bits = 4 },
+ },
+ };
+
+ // unused symbols
+ for (0..12) |i| {
+ try testing.expectEqual(0, h.symbols[i].code_bits);
+ }
+ // used, from index 12
+ for (expected, 12..) |e, i| {
+ try testing.expectEqual(e.sym.symbol, h.symbols[i].symbol);
+ try testing.expectEqual(e.sym.code_bits, h.symbols[i].code_bits);
+ const sym_from_code = try h.find(e.code);
+ try testing.expectEqual(e.sym.symbol, sym_from_code.symbol);
+ }
+
+ // All possible codes for each symbol.
+ // The lookup table has 128 elements, covering all possible 7 bit codes.
+ for (0b0000_000..0b0100_000) |c| // 0..32 (32)
+ try testing.expectEqual(3, (try h.find(@intCast(c))).symbol);
+
+ for (0b0100_000..0b1000_000) |c| // 32..64 (32)
+ try testing.expectEqual(18, (try h.find(@intCast(c))).symbol);
+
+ for (0b1000_000..0b1010_000) |c| // 64..80 (16)
+ try testing.expectEqual(1, (try h.find(@intCast(c))).symbol);
+
+ for (0b1010_000..0b1100_000) |c| // 80..96 (16)
+ try testing.expectEqual(4, (try h.find(@intCast(c))).symbol);
+
+ for (0b1100_000..0b1110_000) |c| // 96..112 (16)
+ try testing.expectEqual(17, (try h.find(@intCast(c))).symbol);
+
+ for (0b1110_000..0b1111_000) |c| // 112..120 (8)
+ try testing.expectEqual(0, (try h.find(@intCast(c))).symbol);
+
+ for (0b1111_000..0b1_0000_000) |c| // 120...128 (8)
+ try testing.expectEqual(16, (try h.find(@intCast(c))).symbol);
+}
+
+const print = std.debug.print;
+const assert = std.debug.assert;
+const expect = std.testing.expect;
+
+test "flate.HuffmanDecoder encode/decode literals" {
+ const LiteralEncoder = @import("huffman_encoder.zig").LiteralEncoder;
+
+ for (1..286) |j| { // for all different number of codes
+ var enc: LiteralEncoder = .{};
+ // create frequencies
+ var freq = [_]u16{0} ** 286;
+ freq[256] = 1; // ensure we have end of block code
+ for (&freq, 1..) |*f, i| {
+ if (i % j == 0)
+ f.* = @intCast(i);
+ }
+
+ // encoder from frequencies
+ enc.generate(&freq, 15);
+
+ // get code_lens from encoder
+ var code_lens = [_]u4{0} ** 286;
+ for (code_lens, 0..) |_, i| {
+ code_lens[i] = @intCast(enc.codes[i].len);
+ }
+ // generate decoder from code lens
+ var dec: LiteralDecoder = .{};
+ try dec.generate(&code_lens);
+
+ // expect decoder code to match original encoder code
+ for (dec.symbols) |s| {
+ if (s.code_bits == 0) continue;
+ const c_code: u16 = @bitReverse(@as(u15, @intCast(s.code)));
+ const symbol: u16 = switch (s.kind) {
+ .literal => s.symbol,
+ .end_of_block => 256,
+ .match => @as(u16, s.symbol) + 257,
+ };
+
+ const c = enc.codes[symbol];
+ try expect(c.code == c_code);
+ }
+
+ // find each symbol by code
+ for (enc.codes) |c| {
+ if (c.len == 0) continue;
+
+ const s_code: u15 = @bitReverse(@as(u15, @intCast(c.code)));
+ const s = try dec.find(s_code);
+ try expect(s.code == s_code);
+ try expect(s.code_bits == c.len);
+ }
+ }
+}
lib/std/compress/flate/huffman_encoder.zig
@@ -0,0 +1,536 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const math = std.math;
+const mem = std.mem;
+const sort = std.sort;
+const testing = std.testing;
+
+const consts = @import("consts.zig").huffman;
+
+const LiteralNode = struct {
+ literal: u16,
+ freq: u16,
+};
+
+// Describes the state of the constructed tree for a given depth.
+const LevelInfo = struct {
+ // Our level, for better printing
+ level: u32,
+
+ // The frequency of the last node at this level
+ last_freq: u32,
+
+ // The frequency of the next character to add to this level
+ next_char_freq: u32,
+
+ // The frequency of the next pair (from level below) to add to this level.
+ // Only valid if the "needed" value of the next lower level is 0.
+ next_pair_freq: u32,
+
+ // The number of chains remaining to generate for this level before moving
+ // up to the next level
+ needed: u32,
+};
+
+// hcode is a huffman code with a bit code and bit length.
+pub const HuffCode = struct {
+ code: u16 = 0,
+ len: u16 = 0,
+
+ // set sets the code and length of an hcode.
+ fn set(self: *HuffCode, code: u16, length: u16) void {
+ self.len = length;
+ self.code = code;
+ }
+};
+
+pub fn HuffmanEncoder(comptime size: usize) type {
+ return struct {
+ codes: [size]HuffCode = undefined,
+ // Reusable buffer with the longest possible frequency table.
+ freq_cache: [consts.max_num_frequencies + 1]LiteralNode = undefined,
+ bit_count: [17]u32 = undefined,
+ lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
+ lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate
+
+ const Self = @This();
+
+ // Update this Huffman Code object to be the minimum code for the specified frequency count.
+ //
+ // freq An array of frequencies, in which frequency[i] gives the frequency of literal i.
+ // max_bits The maximum number of bits to use for any literal.
+ pub fn generate(self: *Self, freq: []u16, max_bits: u32) void {
+ var list = self.freq_cache[0 .. freq.len + 1];
+ // Number of non-zero literals
+ var count: u32 = 0;
+ // Set list to be the set of all non-zero literals and their frequencies
+ for (freq, 0..) |f, i| {
+ if (f != 0) {
+ list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f };
+ count += 1;
+ } else {
+ list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
+ self.codes[i].len = 0;
+ }
+ }
+ list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };
+
+ list = list[0..count];
+ if (count <= 2) {
+ // Handle the small cases here, because they are awkward for the general case code. With
+ // two or fewer literals, everything has bit length 1.
+ for (list, 0..) |node, i| {
+ // "list" is in order of increasing literal value.
+ self.codes[node.literal].set(@as(u16, @intCast(i)), 1);
+ }
+ return;
+ }
+ self.lfs = list;
+ mem.sort(LiteralNode, self.lfs, {}, byFreq);
+
+ // Get the number of literals for each bit count
+ const bit_count = self.bitCounts(list, max_bits);
+ // And do the assignment
+ self.assignEncodingAndSize(bit_count, list);
+ }
+
+ pub fn bitLength(self: *Self, freq: []u16) u32 {
+ var total: u32 = 0;
+ for (freq, 0..) |f, i| {
+ if (f != 0) {
+ total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len));
+ }
+ }
+ return total;
+ }
+
+ // Return the number of literals assigned to each bit size in the Huffman encoding
+ //
+ // This method is only called when list.len >= 3
+ // The cases of 0, 1, and 2 literals are handled by special case code.
+ //
+ // list: An array of the literals with non-zero frequencies
+ // and their associated frequencies. The array is in order of increasing
+ // frequency, and has as its last element a special element with frequency
+ // std.math.maxInt(i32)
+ //
+ // max_bits: The maximum number of bits that should be used to encode any literal.
+ // Must be less than 16.
+ //
+ // Returns an integer array in which array[i] indicates the number of literals
+ // that should be encoded in i bits.
+ fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 {
+ var max_bits = max_bits_to_use;
+ const n = list.len;
+ const max_bits_limit = 16;
+
+ assert(max_bits < max_bits_limit);
+
+ // The tree can't have greater depth than n - 1, no matter what. This
+ // saves a little bit of work in some small cases
+ max_bits = @min(max_bits, n - 1);
+
+ // Create information about each of the levels.
+ // A bogus "Level 0" whose sole purpose is so that
+ // level1.prev.needed == 0. This makes level1.next_pair_freq
+ // be a legitimate value that never gets chosen.
+ var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
+ // leaf_counts[i] counts the number of literals at the left
+ // of ancestors of the rightmost node at level i.
+ // leaf_counts[i][j] is the number of literals at the left
+ // of the level j ancestor.
+ var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);
+
+ {
+ var level = @as(u32, 1);
+ while (level <= max_bits) : (level += 1) {
+ // For every level, the first two items are the first two characters.
+ // We initialize the levels as if we had already figured this out.
+ levels[level] = LevelInfo{
+ .level = level,
+ .last_freq = list[1].freq,
+ .next_char_freq = list[2].freq,
+ .next_pair_freq = list[0].freq + list[1].freq,
+ .needed = 0,
+ };
+ leaf_counts[level][level] = 2;
+ if (level == 1) {
+ levels[level].next_pair_freq = math.maxInt(i32);
+ }
+ }
+ }
+
+ // We need a total of 2*n - 2 items at top level and have already generated 2.
+ levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4;
+
+ {
+ var level = max_bits;
+ while (true) {
+ var l = &levels[level];
+ if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
+ // We've run out of both leafs and pairs.
+ // End all calculations for this level.
+ // To make sure we never come back to this level or any lower level,
+ // set next_pair_freq impossibly large.
+ l.needed = 0;
+ levels[level + 1].next_pair_freq = math.maxInt(i32);
+ level += 1;
+ continue;
+ }
+
+ const prev_freq = l.last_freq;
+ if (l.next_char_freq < l.next_pair_freq) {
+ // The next item on this row is a leaf node.
+ const next = leaf_counts[level][level] + 1;
+ l.last_freq = l.next_char_freq;
+ // Lower leaf_counts are the same as for the previous node.
+ leaf_counts[level][level] = next;
+ if (next >= list.len) {
+ l.next_char_freq = maxNode().freq;
+ } else {
+ l.next_char_freq = list[next].freq;
+ }
+ } else {
+ // The next item on this row is a pair from the previous row.
+ // next_pair_freq isn't valid until we generate two
+ // more values in the level below
+ l.last_freq = l.next_pair_freq;
+ // Take leaf counts from the lower level, except counts[level] remains the same.
+ @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
+ levels[l.level - 1].needed = 2;
+ }
+
+ l.needed -= 1;
+ if (l.needed == 0) {
+ // We've done everything we need to do for this level.
+ // Continue calculating one level up. Fill in next_pair_freq
+ // of that level with the sum of the two nodes we've just calculated on
+ // this level.
+ if (l.level == max_bits) {
+ // All done!
+ break;
+ }
+ levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
+ level += 1;
+ } else {
+ // If we stole from below, move down temporarily to replenish it.
+ while (levels[level - 1].needed > 0) {
+ level -= 1;
+ if (level == 0) {
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ // Something is wrong if, at the end, the top level is null or hasn't used
+ // all of the leaves.
+ assert(leaf_counts[max_bits][max_bits] == n);
+
+ var bit_count = self.bit_count[0 .. max_bits + 1];
+ var bits: u32 = 1;
+ const counts = &leaf_counts[max_bits];
+ {
+ var level = max_bits;
+ while (level > 0) : (level -= 1) {
+ // counts[level] gives the number of literals requiring at least "bits"
+ // bits to encode.
+ bit_count[bits] = counts[level] - counts[level - 1];
+ bits += 1;
+ if (level == 0) {
+ break;
+ }
+ }
+ }
+ return bit_count;
+ }
+
+ // Look at the leaves and assign them a bit count and an encoding as specified
+ // in RFC 1951 3.2.2
+ fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void {
+ var code = @as(u16, 0);
+ var list = list_arg;
+
+ for (bit_count, 0..) |bits, n| {
+ code <<= 1;
+ if (n == 0 or bits == 0) {
+ continue;
+ }
+ // The literals list[list.len - bits] .. list[list.len - 1]
+ // are encoded using "bits" bits, and get the values
+ // code, code + 1, .... The code values are
+ // assigned in literal order (not frequency order).
+ const chunk = list[list.len - @as(u32, @intCast(bits)) ..];
+
+ self.lns = chunk;
+ mem.sort(LiteralNode, self.lns, {}, byLiteral);
+
+ for (chunk) |node| {
+ self.codes[node.literal] = HuffCode{
+ .code = bitReverse(u16, code, @as(u5, @intCast(n))),
+ .len = @as(u16, @intCast(n)),
+ };
+ code += 1;
+ }
+ list = list[0 .. list.len - @as(u32, @intCast(bits))];
+ }
+ }
+ };
+}
+
+fn maxNode() LiteralNode {
+ return LiteralNode{
+ .literal = math.maxInt(u16),
+ .freq = math.maxInt(u16),
+ };
+}
+
+pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) {
+ return .{};
+}
+
+pub const LiteralEncoder = HuffmanEncoder(consts.max_num_frequencies);
+pub const DistanceEncoder = HuffmanEncoder(consts.distance_code_count);
+pub const CodegenEncoder = HuffmanEncoder(19);
+
+// Generates a HuffmanEncoder corresponding to the fixed literal table.
+pub fn fixedLiteralEncoder() LiteralEncoder {
+ var h: LiteralEncoder = undefined;
+ var ch: u16 = 0;
+
+ while (ch < consts.max_num_frequencies) : (ch += 1) {
+ var bits: u16 = undefined;
+ var size: u16 = undefined;
+ switch (ch) {
+ 0...143 => {
+ // size 8, 00110000 .. 10111111
+ bits = ch + 48;
+ size = 8;
+ },
+ 144...255 => {
+ // size 9, 110010000 .. 111111111
+ bits = ch + 400 - 144;
+ size = 9;
+ },
+ 256...279 => {
+ // size 7, 0000000 .. 0010111
+ bits = ch - 256;
+ size = 7;
+ },
+ else => {
+ // size 8, 11000000 .. 11000111
+ bits = ch + 192 - 280;
+ size = 8;
+ },
+ }
+ h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size };
+ }
+ return h;
+}
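+
+// Worked example (illustrative, not part of the function above): for the
+// literal 'A' (ch = 65, in the 0...143 range) the fixed table gives
+// bits = 65 + 48 = 113 = 0b01110001 with size 8. The stored code is
+// bitReverse(u16, 113, 8) = 0b10001110, so an LSB-first bit writer emits the
+// code most-significant bit first, as RFC 1951 requires.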
+
+pub fn fixedDistanceEncoder() DistanceEncoder {
+ var h: DistanceEncoder = undefined;
+ for (h.codes, 0..) |_, ch| {
+ h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 };
+ }
+ return h;
+}
+
+pub fn huffmanDistanceEncoder() DistanceEncoder {
+ var distance_freq = [1]u16{0} ** consts.distance_code_count;
+ distance_freq[0] = 1;
+ // This is a static distance encoder used for huffman-only encoding.
+ // It can be reused since we will not be encoding distance values.
+ var h: DistanceEncoder = .{};
+ h.generate(distance_freq[0..], 15);
+ return h;
+}
+
+fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
+ _ = context;
+ return a.literal < b.literal;
+}
+
+fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
+ _ = context;
+ if (a.freq == b.freq) {
+ return a.literal < b.literal;
+ }
+ return a.freq < b.freq;
+}
+
+test "flate.HuffmanEncoder generate a Huffman code from an array of frequencies" {
+ var freqs: [19]u16 = [_]u16{
+ 8, // 0
+ 1, // 1
+ 1, // 2
+ 2, // 3
+ 5, // 4
+ 10, // 5
+ 9, // 6
+ 1, // 7
+ 0, // 8
+ 0, // 9
+ 0, // 10
+ 0, // 11
+ 0, // 12
+ 0, // 13
+ 0, // 14
+ 0, // 15
+ 1, // 16
+ 3, // 17
+ 5, // 18
+ };
+
+ var enc = huffmanEncoder(19);
+ enc.generate(freqs[0..], 7);
+
+ try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));
+
+ try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
+ try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
+ try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
+ try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
+ try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
+ try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
+ try testing.expectEqual(@as(usize, 3), enc.codes[18].len);
+
+ try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
+ try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
+ try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
+ try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
+ try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
+ try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
+ try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
+ try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
+ try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
+ try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
+ try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
+}
+
+test "flate.HuffmanEncoder generate a Huffman code for the fixed literal table specific to Deflate" {
+ const enc = fixedLiteralEncoder();
+ for (enc.codes) |c| {
+ switch (c.len) {
+ 7 => {
+ const v = @bitReverse(@as(u7, @intCast(c.code)));
+ try testing.expect(v <= 0b0010111);
+ },
+ 8 => {
+ const v = @bitReverse(@as(u8, @intCast(c.code)));
+ try testing.expect((v >= 0b00110000 and v <= 0b10111111) or
+ (v >= 0b11000000 and v <= 0b11000111));
+ },
+ 9 => {
+ const v = @bitReverse(@as(u9, @intCast(c.code)));
+ try testing.expect(v >= 0b110010000 and v <= 0b111111111);
+ },
+ else => unreachable,
+ }
+ }
+}
+
+test "flate.HuffmanEncoder generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" {
+ const enc = fixedDistanceEncoder();
+ for (enc.codes) |c| {
+ const v = @bitReverse(@as(u5, @intCast(c.code)));
+ try testing.expect(v <= 29);
+ try testing.expect(c.len == 5);
+ }
+}
+
+// Reverses an n-bit code bit by bit.
+fn bitReverse(comptime T: type, value: T, n: usize) T {
+ const r = @bitReverse(value);
+ return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).Int.bits - n));
+}
+
+test "flate bitReverse" {
+ const ReverseBitsTest = struct {
+ in: u16,
+ bit_count: u5,
+ out: u16,
+ };
+
+ const reverse_bits_tests = [_]ReverseBitsTest{
+ .{ .in = 1, .bit_count = 1, .out = 1 },
+ .{ .in = 1, .bit_count = 2, .out = 2 },
+ .{ .in = 1, .bit_count = 3, .out = 4 },
+ .{ .in = 1, .bit_count = 4, .out = 8 },
+ .{ .in = 1, .bit_count = 5, .out = 16 },
+ .{ .in = 17, .bit_count = 5, .out = 17 },
+ .{ .in = 257, .bit_count = 9, .out = 257 },
+ .{ .in = 29, .bit_count = 5, .out = 23 },
+ };
+
+ for (reverse_bits_tests) |h| {
+ const v = bitReverse(u16, h.in, h.bit_count);
+ try std.testing.expectEqual(h.out, v);
+ }
+}
+
+test "flate.HuffmanEncoder fixedLiteralEncoder codes" {
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+ var bw = std.io.bitWriter(.little, al.writer());
+
+ const f = fixedLiteralEncoder();
+ for (f.codes) |c| {
+ try bw.writeBits(c.code, c.len);
+ }
+ try testing.expectEqualSlices(u8, &fixed_codes, al.items);
+}
+
+pub const fixed_codes = [_]u8{
+ 0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100,
+ 0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100,
+ 0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010,
+ 0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010,
+ 0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010,
+ 0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010,
+ 0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110,
+ 0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110,
+ 0b00001110, 0b10001110, 0b01001110, 0b11001110, 0b00101110, 0b10101110, 0b01101110, 0b11101110,
+ 0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110,
+ 0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001,
+ 0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001,
+ 0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001,
+ 0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001,
+ 0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101,
+ 0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101,
+ 0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101,
+ 0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101,
+ 0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100,
+ 0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110,
+ 0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100,
+ 0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101,
+ 0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010,
+ 0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110,
+ 0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110,
+ 0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111,
+ 0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011,
+ 0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101,
+ 0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 0b11101110,
+ 0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100,
+ 0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101,
+ 0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010,
+ 0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110,
+ 0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000,
+ 0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010,
+ 0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100,
+ 0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011,
+ 0b10100011,
+};
lib/std/compress/flate/inflate.zig
@@ -0,0 +1,546 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const testing = std.testing;
+
+const hfd = @import("huffman_decoder.zig");
+const BitReader = @import("bit_reader.zig").BitReader;
+const CircularBuffer = @import("CircularBuffer.zig");
+const Container = @import("container.zig").Container;
+const Token = @import("Token.zig");
+const codegen_order = @import("consts.zig").huffman.codegen_order;
+
+/// Decompresses deflate bit stream `reader` and writes uncompressed data to the
+/// `writer` stream.
+pub fn decompress(comptime container: Container, reader: anytype, writer: anytype) !void {
+ var d = decompressor(container, reader);
+ try d.decompress(writer);
+}
+
+/// Inflate decompressor for the reader type.
+pub fn decompressor(comptime container: Container, reader: anytype) Inflate(container, @TypeOf(reader)) {
+ return Inflate(container, @TypeOf(reader)).init(reader);
+}
+
+/// Inflate decompresses a deflate bit stream. It reads compressed data from the
+/// reader provided in init. Decompressed data is stored in an internal hist
+/// buffer and can be accessed via the iterator (`next`) or the reader interface.
+///
+/// Container defines the header/footer wrapper around the deflate bit stream.
+/// It can be gzip or zlib.
+///
+/// A deflate bit stream consists of multiple blocks. A block can be one of three types:
+///   * stored, not compressed, max 64K in size
+/// * fixed, huffman codes are predefined
+/// * dynamic, huffman code tables are encoded at the block start
+///
+/// The `step` function runs the decoder until the internal `hist` buffer is full.
+/// The client then needs to read that data in order to proceed with decoding.
+///
+/// Allocates 74.5K of internal buffers, the most important being:
+/// * 64K for history (CircularBuffer)
+/// * ~10K huffman decoders (Literal and DistanceDecoder)
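+///
+/// Example (an illustrative sketch; `compressed_bytes` is a placeholder for a
+/// gzip stream):
+///
+///     var fbs = std.io.fixedBufferStream(compressed_bytes);
+///     var inflate = decompressor(.gzip, fbs.reader());
+///     while (try inflate.next()) |buf| {
+///         // each `buf` is a chunk of decompressed bytes
+///     }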
+///
+pub fn Inflate(comptime container: Container, comptime ReaderType: type) type {
+ return struct {
+ const BitReaderType = BitReader(ReaderType);
+ const F = BitReaderType.flag;
+
+ bits: BitReaderType = .{},
+ hist: CircularBuffer = .{},
+ // Hashes uncompressed data to produce the checksum for the gzip/zlib footer.
+ hasher: container.Hasher() = .{},
+
+ // dynamic block huffman code decoders
+ lit_dec: hfd.LiteralDecoder = .{}, // literals
+ dst_dec: hfd.DistanceDecoder = .{}, // distances
+
+ // current read state
+ bfinal: u1 = 0,
+ block_type: u2 = 0b11,
+ state: ReadState = .protocol_header,
+
+ const ReadState = enum {
+ protocol_header,
+ block_header,
+ block,
+ protocol_footer,
+ end,
+ };
+
+ const Self = @This();
+
+ pub const Error = BitReaderType.Error || Container.Error || hfd.Error || error{
+ InvalidCode,
+ InvalidMatch,
+ InvalidBlockType,
+ WrongStoredBlockNlen,
+ InvalidDynamicBlockHeader,
+ };
+
+ pub fn init(rt: ReaderType) Self {
+ return .{ .bits = BitReaderType.init(rt) };
+ }
+
+ fn blockHeader(self: *Self) !void {
+ self.bfinal = try self.bits.read(u1);
+ self.block_type = try self.bits.read(u2);
+ }
+
+ fn storedBlock(self: *Self) !bool {
+ self.bits.alignToByte(); // skip padding until byte boundary
+ // everything after this is byte aligned in a stored block
+ var len = try self.bits.read(u16);
+ const nlen = try self.bits.read(u16);
+ if (len != ~nlen) return error.WrongStoredBlockNlen;
+
+ while (len > 0) {
+ const buf = self.hist.getWritable(len);
+ try self.bits.readAll(buf);
+ len -= @intCast(buf.len);
+ }
+ return true;
+ }
+
+ fn fixedBlock(self: *Self) !bool {
+ while (!self.hist.full()) {
+ const code = try self.bits.readFixedCode();
+ switch (code) {
+ 0...255 => self.hist.write(@intCast(code)),
+ 256 => return true, // end of block
+ 257...285 => try self.fixedDistanceCode(@intCast(code - 257)),
+ else => return error.InvalidCode,
+ }
+ }
+ return false;
+ }
+
+ // Handles a fixed block non-literal (length) code.
+ // The length code is followed by a 5-bit distance code.
+ fn fixedDistanceCode(self: *Self, code: u8) !void {
+ try self.bits.fill(5 + 5 + 13);
+ const length = try self.decodeLength(code);
+ const distance = try self.decodeDistance(try self.bits.readF(u5, F.buffered | F.reverse));
+ try self.hist.writeMatch(length, distance);
+ }
+
+ inline fn decodeLength(self: *Self, code: u8) !u16 {
+ if (code > 28) return error.InvalidCode;
+ const ml = Token.matchLength(code);
+ return if (ml.extra_bits == 0) // 0 - 5 extra bits
+ ml.base
+ else
+ ml.base + try self.bits.readN(ml.extra_bits, F.buffered);
+ }
+
+ fn decodeDistance(self: *Self, code: u8) !u16 {
+ if (code > 29) return error.InvalidCode;
+ const md = Token.matchDistance(code);
+ return if (md.extra_bits == 0) // 0 - 13 extra bits
+ md.base
+ else
+ md.base + try self.bits.readN(md.extra_bits, F.buffered);
+ }
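+
+ // Worked example (from the tables in Token.zig): length code 265, index 8
+ // after subtracting 257, has base 11 with 1 extra bit, so decodeLength(8)
+ // reads one more bit b and yields length 11 + b. Distance code 4 has base 5
+ // with 1 extra bit, so decodeDistance(4) yields distance 5 or 6.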
+
+ fn dynamicBlockHeader(self: *Self) !void {
+ const hlit: u16 = @as(u16, try self.bits.read(u5)) + 257; // number of ll code entries present - 257
+ const hdist: u16 = @as(u16, try self.bits.read(u5)) + 1; // number of distance code entries - 1
+ const hclen: u8 = @as(u8, try self.bits.read(u4)) + 4; // hclen + 4 code lengths are encoded
+
+ if (hlit > 286 or hdist > 30)
+ return error.InvalidDynamicBlockHeader;
+
+ // lengths for code lengths
+ var cl_lens = [_]u4{0} ** 19;
+ for (0..hclen) |i| {
+ cl_lens[codegen_order[i]] = try self.bits.read(u3);
+ }
+ var cl_dec: hfd.CodegenDecoder = .{};
+ try cl_dec.generate(&cl_lens);
+
+ // literal code lengths
+ var lit_lens = [_]u4{0} ** (286);
+ var pos: usize = 0;
+ while (pos < hlit) {
+ const sym = try cl_dec.find(try self.bits.peekF(u7, F.reverse));
+ try self.bits.shift(sym.code_bits);
+ pos += try self.dynamicCodeLength(sym.symbol, &lit_lens, pos);
+ }
+ if (pos > hlit)
+ return error.InvalidDynamicBlockHeader;
+
+ // distance code lengths
+ var dst_lens = [_]u4{0} ** (30);
+ pos = 0;
+ while (pos < hdist) {
+ const sym = try cl_dec.find(try self.bits.peekF(u7, F.reverse));
+ try self.bits.shift(sym.code_bits);
+ pos += try self.dynamicCodeLength(sym.symbol, &dst_lens, pos);
+ }
+ if (pos > hdist)
+ return error.InvalidDynamicBlockHeader;
+
+ try self.lit_dec.generate(&lit_lens);
+ try self.dst_dec.generate(&dst_lens);
+ }
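+
+ // Worked example of the header counts above: if the 5-bit hlit field reads
+ // 29, there are 29 + 257 = 286 literal/length code lengths to decode; if
+ // hdist reads 4, there are 5 distance code lengths; if hclen reads 11, then
+ // 11 + 4 = 15 code-length code lengths are read, in codegen_order.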
+
+ // Decode code length symbol to code length. Writes decoded length into
+ // lens slice starting at position pos. Returns number of positions
+ // advanced.
+ fn dynamicCodeLength(self: *Self, code: u16, lens: []u4, pos: usize) !usize {
+ if (pos >= lens.len)
+ return error.InvalidDynamicBlockHeader;
+
+ switch (code) {
+ 0...15 => {
+ // Represent code lengths of 0 - 15
+ lens[pos] = @intCast(code);
+ return 1;
+ },
+ 16 => {
+ // Copy the previous code length 3 - 6 times.
+ // The next 2 bits indicate repeat length
+ const n: u8 = @as(u8, try self.bits.read(u2)) + 3;
+ if (pos == 0 or pos + n > lens.len)
+ return error.InvalidDynamicBlockHeader;
+ for (0..n) |i| {
+ lens[pos + i] = lens[pos + i - 1];
+ }
+ return n;
+ },
+ // Repeat a code length of 0 for 3 - 10 times. (3 bits of length)
+ 17 => return @as(u8, try self.bits.read(u3)) + 3,
+ // Repeat a code length of 0 for 11 - 138 times (7 bits of length)
+ 18 => return @as(u8, try self.bits.read(u7)) + 11,
+ else => return error.InvalidDynamicBlockHeader,
+ }
+ }
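+
+ // Worked example of the repeat codes above: with lens[pos - 1] == 7, symbol
+ // 16 followed by the 2 extra bits 0b01 (= 1) writes length 7 into 1 + 3 = 4
+ // positions and returns 4; symbol 18 followed by the 7 extra bits 0b0000011
+ // (= 3) leaves 3 + 11 = 14 positions at length 0 and returns 14.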
+
+ // In larger archives most blocks are usually dynamic, so decompression
+ // performance depends on this function.
+ fn dynamicBlock(self: *Self) !bool {
+ // Hot path loop!
+ while (!self.hist.full()) {
+ try self.bits.fill(15); // optimization so other bit reads can be buffered (avoiding one `if` in hot path)
+ const sym = try self.decodeSymbol(&self.lit_dec);
+
+ switch (sym.kind) {
+ .literal => self.hist.write(sym.symbol),
+ .match => { // Decode match backreference <length, distance>
+ try self.bits.fill(5 + 15 + 13); // so we can use buffered reads
+ const length = try self.decodeLength(sym.symbol);
+ const dsm = try self.decodeSymbol(&self.dst_dec);
+ const distance = try self.decodeDistance(dsm.symbol);
+ try self.hist.writeMatch(length, distance);
+ },
+ .end_of_block => return true,
+ }
+ }
+ return false;
+ }
+
+ // Peek 15 bits from the bit reader (the maximum code length is 15 bits). Use
+ // the decoder to find the symbol for that code; that also tells us how many
+ // bits the code uses. Shift the bit reader past those consumed bits and
+ // return the symbol.
+ fn decodeSymbol(self: *Self, decoder: anytype) !hfd.Symbol {
+ const sym = try decoder.find(try self.bits.peekF(u15, F.buffered | F.reverse));
+ try self.bits.shift(sym.code_bits);
+ return sym;
+ }
+
+ fn step(self: *Self) !void {
+ switch (self.state) {
+ .protocol_header => {
+ try container.parseHeader(&self.bits);
+ self.state = .block_header;
+ },
+ .block_header => {
+ try self.blockHeader();
+ self.state = .block;
+ if (self.block_type == 2) try self.dynamicBlockHeader();
+ },
+ .block => {
+ const done = switch (self.block_type) {
+ 0 => try self.storedBlock(),
+ 1 => try self.fixedBlock(),
+ 2 => try self.dynamicBlock(),
+ else => return error.InvalidBlockType,
+ };
+ if (done) {
+ self.state = if (self.bfinal == 1) .protocol_footer else .block_header;
+ }
+ },
+ .protocol_footer => {
+ self.bits.alignToByte();
+ try container.parseFooter(&self.hasher, &self.bits);
+ self.state = .end;
+ },
+ .end => {},
+ }
+ }
+
+ /// Replaces the inner reader with a new reader.
+ pub fn setReader(self: *Self, new_reader: ReaderType) void {
+ self.bits.forward_reader = new_reader;
+ if (self.state == .end or self.state == .protocol_footer) {
+ self.state = .protocol_header;
+ }
+ }
+
+ // Reads all compressed data from the internal reader and outputs plain
+ // (uncompressed) data to the provided writer.
+ pub fn decompress(self: *Self, writer: anytype) !void {
+ while (try self.next()) |buf| {
+ try writer.writeAll(buf);
+ }
+ }
+
+ // Iterator interface
+
+ /// Can be used in an iterator-like loop without a memcpy to another buffer:
+ /// while (try inflate.next()) |buf| { ... }
+ pub fn next(self: *Self) Error!?[]const u8 {
+ const out = try self.get(0);
+ if (out.len == 0) return null;
+ return out;
+ }
+
+ /// Returns decompressed data from the internal sliding window buffer.
+ /// The returned buffer can be any length between 0 and `limit` bytes.
+ /// Zero returned bytes means the end of stream has been reached.
+ /// With limit=0 it returns as much data as it can. It will never be more
+ /// than 65536 bytes, which is the limit of the internal buffer.
+ pub fn get(self: *Self, limit: usize) Error![]const u8 {
+ while (true) {
+ const out = self.hist.readAtMost(limit);
+ if (out.len > 0) {
+ self.hasher.update(out);
+ return out;
+ }
+ if (self.state == .end) return out;
+ try self.step();
+ }
+ }
+
+ // Reader interface
+
+ pub const Reader = std.io.Reader(*Self, Error, read);
+
+ /// Returns the number of bytes read. It may be less than buffer.len.
+ /// If the number of bytes read is 0, it means end of stream.
+ /// End of stream is not an error condition.
+ pub fn read(self: *Self, buffer: []u8) Error!usize {
+ const out = try self.get(buffer.len);
+ @memcpy(buffer[0..out.len], out);
+ return out.len;
+ }
+
+ pub fn reader(self: *Self) Reader {
+ return .{ .context = self };
+ }
+ };
+}
+
+test "flate.Inflate struct sizes" {
+ var fbs = std.io.fixedBufferStream("");
+ const ReaderType = @TypeOf(fbs.reader());
+ const inflate_size = @sizeOf(Inflate(.gzip, ReaderType));
+
+ try testing.expectEqual(76320, inflate_size);
+ try testing.expectEqual(
+ @sizeOf(CircularBuffer) + @sizeOf(hfd.LiteralDecoder) + @sizeOf(hfd.DistanceDecoder) + 48,
+ inflate_size,
+ );
+ try testing.expectEqual(65536 + 8 + 8, @sizeOf(CircularBuffer));
+ try testing.expectEqual(8, @sizeOf(Container.raw.Hasher()));
+ try testing.expectEqual(24, @sizeOf(BitReader(ReaderType)));
+ try testing.expectEqual(6384, @sizeOf(hfd.LiteralDecoder));
+ try testing.expectEqual(4336, @sizeOf(hfd.DistanceDecoder));
+}
+
+test "flate.Inflate decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ },
+ .out = "Hello world\n",
+ },
+ // fixed code block (type 1)
+ .{
+ .in = &[_]u8{
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
+ 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
+ },
+ .out = "Hello world\n",
+ },
+ // dynamic block (type 2)
+ .{
+ .in = &[_]u8{
+ 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
+ 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
+ 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
+ },
+ .out = "ABCDEABCD ABCDEABCD",
+ },
+ };
+ for (cases) |c| {
+ var fb = std.io.fixedBufferStream(c.in);
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+
+ try decompress(.raw, fb.reader(), al.writer());
+ try testing.expectEqualStrings(c.out, al.items);
+ }
+}
+
+test "flate.Inflate gzip decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ 0xd5, 0xe0, 0x39, 0xb7, // gzip footer: checksum
+ 0x0c, 0x00, 0x00, 0x00, // gzip footer: size
+ },
+ .out = "Hello world\n",
+ },
+ // fixed code block (type 1)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x03, // gzip header (10 bytes)
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
+ 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
+ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
+ },
+ .out = "Hello world\n",
+ },
+ // dynamic block (type 2)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
+ 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
+ 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
+ 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
+ 0x17, 0x1c, 0x39, 0xb4, 0x13, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
+ },
+ .out = "ABCDEABCD ABCDEABCD",
+ },
+ // gzip header with name
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x08, 0xe5, 0x70, 0xb1, 0x65, 0x00, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x2e,
+ 0x74, 0x78, 0x74, 0x00, 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0xe1,
+ 0x02, 0x00, 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00,
+ },
+ .out = "Hello world\n",
+ },
+ };
+ for (cases) |c| {
+ var fb = std.io.fixedBufferStream(c.in);
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+
+ try decompress(.gzip, fb.reader(), al.writer());
+ try testing.expectEqualStrings(c.out, al.items);
+ }
+}
+
+test "flate.Inflate zlib decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0x78, 0b10_0_11100, // zlib header (2 bytes)
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ 0x1c, 0xf2, 0x04, 0x47, // zlib footer: checksum
+ },
+ .out = "Hello world\n",
+ },
+ };
+ for (cases) |c| {
+ var fb = std.io.fixedBufferStream(c.in);
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+
+ try decompress(.zlib, fb.reader(), al.writer());
+ try testing.expectEqualStrings(c.out, al.items);
+ }
+}
+
+test "flate.Inflate fuzzing tests" {
+ const cases = [_]struct {
+ input: []const u8,
+ out: []const u8 = "",
+ err: ?anyerror = null,
+ }{
+ .{ .input = "deflate-stream", .out = @embedFile("testdata/fuzz/deflate-stream.expect") }, // 0
+ .{ .input = "empty-distance-alphabet01" },
+ .{ .input = "empty-distance-alphabet02" },
+ .{ .input = "end-of-stream", .err = error.EndOfStream },
+ .{ .input = "invalid-distance", .err = error.InvalidMatch },
+ .{ .input = "invalid-tree01", .err = error.IncompleteHuffmanTree }, // 5
+ .{ .input = "invalid-tree02", .err = error.IncompleteHuffmanTree },
+ .{ .input = "invalid-tree03", .err = error.IncompleteHuffmanTree },
+ .{ .input = "lengths-overflow", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "out-of-codes", .err = error.InvalidCode },
+ .{ .input = "puff01", .err = error.WrongStoredBlockNlen }, // 10
+ .{ .input = "puff02", .err = error.EndOfStream },
+ .{ .input = "puff03", .out = &[_]u8{0xa} },
+ .{ .input = "puff04", .err = error.InvalidCode },
+ .{ .input = "puff05", .err = error.EndOfStream },
+ .{ .input = "puff06", .err = error.EndOfStream },
+ .{ .input = "puff08", .err = error.InvalidCode },
+ .{ .input = "puff09", .out = "P" },
+ .{ .input = "puff10", .err = error.InvalidCode },
+ .{ .input = "puff11", .err = error.InvalidMatch },
+ .{ .input = "puff12", .err = error.InvalidDynamicBlockHeader }, // 20
+ .{ .input = "puff13", .err = error.IncompleteHuffmanTree },
+ .{ .input = "puff14", .err = error.EndOfStream },
+ .{ .input = "puff15", .err = error.IncompleteHuffmanTree },
+ .{ .input = "puff16", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "puff17", .err = error.InvalidDynamicBlockHeader }, // 25
+ .{ .input = "fuzz1", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "fuzz2", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "fuzz3", .err = error.InvalidMatch },
+ .{ .input = "fuzz4", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff18", .err = error.OversubscribedHuffmanTree }, // 30
+ .{ .input = "puff19", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff20", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff21", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff22", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff23", .err = error.InvalidDynamicBlockHeader }, // 35
+ .{ .input = "puff24", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "puff25", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff26", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "puff27", .err = error.InvalidDynamicBlockHeader },
+ };
+
+ inline for (cases, 0..) |c, case_no| {
+ var in = std.io.fixedBufferStream(@embedFile("testdata/fuzz/" ++ c.input ++ ".input"));
+ var out = std.ArrayList(u8).init(testing.allocator);
+ defer out.deinit();
+ errdefer std.debug.print("test case failed {}\n", .{case_no});
+
+ if (c.err) |expected_err| {
+ try testing.expectError(expected_err, decompress(.raw, in.reader(), out.writer()));
+ } else {
+ try decompress(.raw, in.reader(), out.writer());
+ try testing.expectEqualStrings(c.out, out.items);
+ }
+ }
+}
lib/std/compress/flate/Lookup.zig
@@ -0,0 +1,125 @@
+/// Lookup of previous locations of the same 4-byte data. Works on a hash of
+/// 4 bytes of data. Head contains the position of the first match for each hash.
+/// Chain points to the previous position with the same hash, given the current
+/// location.
+///
+const std = @import("std");
+const testing = std.testing;
+const expect = testing.expect;
+const consts = @import("consts.zig");
+
+const Self = @This();
+
+const prime4 = 0x9E3779B1; // 4-byte prime number 2654435761
+const chain_len = 2 * consts.history.len;
+
+// Maps hash => first position
+head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
+// Maps position => previous positions for the same hash value
+chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
+
+// Calculates hash of the 4 bytes from data.
+// Inserts position `pos` for that hash into the lookup tables.
+// Returns previous location with the same hash value.
+pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
+ if (data.len < 4) return 0;
+ const h = hash(data[0..4]);
+ return self.set(h, pos);
+}
+
+// Returns the previous location with the same hash value, given the current
+// position.
+pub fn prev(self: *Self, pos: u16) u16 {
+ return self.chain[pos];
+}
+
+fn set(self: *Self, h: u32, pos: u16) u16 {
+ const p = self.head[h];
+ self.head[h] = pos;
+ self.chain[pos] = p;
+ return p;
+}
+
+// Slide all positions in head and chain back by `n`.
+pub fn slide(self: *Self, n: u16) void {
+ for (&self.head) |*v| {
+ v.* -|= n;
+ }
+ var i: usize = 0;
+ while (i < n) : (i += 1) {
+ self.chain[i] = self.chain[i + n] -| n;
+ }
+}
+
+// Add `len` 4-byte hashes from `data` into the lookup.
+// Position of the first byte is `pos`.
+pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
+ if (len == 0 or data.len < consts.match.min_length) {
+ return;
+ }
+ var hb =
+ @as(u32, data[3]) |
+ @as(u32, data[2]) << 8 |
+ @as(u32, data[1]) << 16 |
+ @as(u32, data[0]) << 24;
+ _ = self.set(hashu(hb), pos);
+
+ var i = pos;
+ for (4..@min(len + 3, data.len)) |j| {
+ hb = (hb << 8) | @as(u32, data[j]);
+ i += 1;
+ _ = self.set(hashu(hb), i);
+ }
+}
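+
+// Note on the rolling window above: `hb` packs the current 4 bytes big-endian
+// (oldest byte in the high bits), so `(hb << 8) | data[j]` drops the oldest
+// byte and appends the next one. Each iteration therefore hashes the same
+// 4-byte window that `add` would hash at the corresponding position (see the
+// bulkAdd test below).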
+
+// Calculates hash of the first 4 bytes of `b`.
+fn hash(b: *const [4]u8) u32 {
+ return hashu(@as(u32, b[3]) |
+ @as(u32, b[2]) << 8 |
+ @as(u32, b[1]) << 16 |
+ @as(u32, b[0]) << 24);
+}
+
+fn hashu(v: u32) u32 {
+ return @intCast((v *% prime4) >> consts.lookup.shift);
+}
+
+test "flate.Lookup add/prev" {
+ const data = [_]u8{
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
+ 0x01, 0x02, 0x03,
+ };
+
+ var h: Self = .{};
+ for (data, 0..) |_, i| {
+ const p = h.add(data[i..], @intCast(i));
+ if (i >= 8 and i < 24) {
+ try expect(p == i - 8);
+ } else {
+ try expect(p == 0);
+ }
+ }
+
+ const v = Self.hash(data[2 .. 2 + 4]);
+ try expect(h.head[v] == 2 + 16);
+ try expect(h.chain[2 + 16] == 2 + 8);
+ try expect(h.chain[2 + 8] == 2);
+}
+
+test "flate.Lookup bulkAdd" {
+ const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
+
+ // one by one
+ var h: Self = .{};
+ for (data, 0..) |_, i| {
+ _ = h.add(data[i..], @intCast(i));
+ }
+
+ // in bulk
+ var bh: Self = .{};
+ bh.bulkAdd(data, data.len, 0);
+
+ try testing.expectEqualSlices(u16, &h.head, &bh.head);
+ try testing.expectEqualSlices(u16, &h.chain, &bh.chain);
+}
lib/std/compress/flate/root.zig
@@ -0,0 +1,133 @@
+pub const flate = @import("flate.zig");
+pub const gzip = @import("gzip.zig");
+pub const zlib = @import("zlib.zig");
+
+test "flate" {
+ _ = @import("deflate.zig");
+ _ = @import("inflate.zig");
+}
+
+test "flate public interface" {
+ const plain_data = [_]u8{ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a };
+
+ // deflate final stored block, header + plain (stored) data
+ const deflate_block = [_]u8{
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen
+ } ++ plain_data;
+
+ // gzip header/footer + deflate block
+ const gzip_data =
+ [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 } ++ // gzip header (10 bytes)
+ deflate_block ++
+ [_]u8{ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00 }; // gzip footer checksum (4 byte), size (4 bytes)
+
+ // zlib header/footer + deflate block
+ const zlib_data = [_]u8{ 0x78, 0b10_0_11100 } ++ // zlib header (2 bytes)}
+ deflate_block ++
+ [_]u8{ 0x1c, 0xf2, 0x04, 0x47 }; // zlib footer: checksum
+
+ try testInterface(gzip, &gzip_data, &plain_data);
+ try testInterface(zlib, &zlib_data, &plain_data);
+ try testInterface(flate, &deflate_block, &plain_data);
+}
+
+fn testInterface(comptime pkg: type, gzip_data: []const u8, plain_data: []const u8) !void {
+ const std = @import("std");
+ const testing = std.testing;
+ const fixedBufferStream = std.io.fixedBufferStream;
+
+ var buffer1: [64]u8 = undefined;
+ var buffer2: [64]u8 = undefined;
+
+ var compressed = fixedBufferStream(&buffer1);
+ var plain = fixedBufferStream(&buffer2);
+
+ // decompress
+ {
+ var in = fixedBufferStream(gzip_data);
+ try pkg.decompress(in.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ plain.reset();
+ compressed.reset();
+
+ // compress/decompress
+ {
+ var in = fixedBufferStream(plain_data);
+ try pkg.compress(in.reader(), compressed.writer(), .{});
+ compressed.reset();
+ try pkg.decompress(compressed.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ plain.reset();
+ compressed.reset();
+
+ // compressor/decompressor
+ {
+ var in = fixedBufferStream(plain_data);
+ var cmp = try pkg.compressor(compressed.writer(), .{});
+ try cmp.compress(in.reader());
+ try cmp.finish();
+
+ compressed.reset();
+ var dcp = pkg.decompressor(compressed.reader());
+ try dcp.decompress(plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ plain.reset();
+ compressed.reset();
+
+ // huffman
+ {
+ // huffman compress/decompress
+ {
+ var in = fixedBufferStream(plain_data);
+ try pkg.huffman.compress(in.reader(), compressed.writer());
+ compressed.reset();
+ try pkg.decompress(compressed.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ plain.reset();
+ compressed.reset();
+
+ // huffman compressor/decompressor
+ {
+ var in = fixedBufferStream(plain_data);
+ var cmp = try pkg.huffman.compressor(compressed.writer());
+ try cmp.compress(in.reader());
+ try cmp.finish();
+
+ compressed.reset();
+ try pkg.decompress(compressed.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ }
+ plain.reset();
+ compressed.reset();
+
+ // store
+ {
+ // store compress/decompress
+ {
+ var in = fixedBufferStream(plain_data);
+ try pkg.store.compress(in.reader(), compressed.writer());
+ compressed.reset();
+ try pkg.decompress(compressed.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ plain.reset();
+ compressed.reset();
+
+ // store compressor/decompressor
+ {
+ var in = fixedBufferStream(plain_data);
+ var cmp = try pkg.store.compressor(compressed.writer());
+ try cmp.compress(in.reader());
+ try cmp.finish();
+
+ compressed.reset();
+ try pkg.decompress(compressed.reader(), plain.writer());
+ try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ }
+ }
+}
lib/std/compress/flate/SlidingWindow.zig
@@ -0,0 +1,160 @@
+/// Used in deflate (compression); holds uncompressed data from which Tokens are
+/// produced. In combination with Lookup it is used to find matches in history data.
+///
+const std = @import("std");
+const consts = @import("consts.zig");
+
+const expect = testing.expect;
+const assert = std.debug.assert;
+const testing = std.testing;
+
+const hist_len = consts.history.len;
+const buffer_len = 2 * hist_len;
+const min_lookahead = consts.match.min_length + consts.match.max_length;
+const max_rp = buffer_len - min_lookahead;
+
+const Self = @This();
+
+buffer: [buffer_len]u8 = undefined,
+wp: usize = 0, // write position
+rp: usize = 0, // read position
+fp: isize = 0, // last flush position, tokens are built from fp..rp
+
+// Returns the number of bytes written, or 0 if the buffer is full and needs to slide.
+pub fn write(self: *Self, buf: []const u8) usize {
+ if (self.rp >= max_rp) return 0; // need to slide
+
+ const n = @min(buf.len, buffer_len - self.wp);
+ @memcpy(self.buffer[self.wp .. self.wp + n], buf[0..n]);
+ self.wp += n;
+ return n;
+}
+
+// Slides the buffer by hist_len.
+// Drops old history, preserving between hist_len - min_lookahead and hist_len bytes.
+// Returns the number of bytes removed.
+pub fn slide(self: *Self) u16 {
+ assert(self.rp >= max_rp and self.wp >= self.rp);
+ const n = self.wp - hist_len;
+ @memcpy(self.buffer[0..n], self.buffer[hist_len..self.wp]);
+ self.rp -= hist_len;
+ self.wp -= hist_len;
+ self.fp -= hist_len;
+ return @intCast(n);
+}
+
+// Data from the current (read) position. This part of the buffer has not been
+// converted to tokens yet.
+fn lookahead(self: *Self) []const u8 {
+ assert(self.wp >= self.rp);
+ return self.buffer[self.rp..self.wp];
+}
+
+// Returns part of the lookahead buffer. If should_flush is set, no lookahead is
+// preserved; otherwise enough data for the longest match is preserved. Returns
+// null if there is not enough data.
+pub fn activeLookahead(self: *Self, should_flush: bool) ?[]const u8 {
+ const min: usize = if (should_flush) 0 else min_lookahead;
+ const lh = self.lookahead();
+ return if (lh.len > min) lh else null;
+}
+
+// Advances read position, shrinks lookahead.
+pub fn advance(self: *Self, n: u16) void {
+ assert(self.wp >= self.rp + n);
+ self.rp += n;
+}
+
+// Returns writable part of the buffer, where new uncompressed data can be
+// written.
+pub fn writable(self: *Self) []u8 {
+ return self.buffer[self.wp..];
+}
+
+// Notification of how much of the writable buffer has been filled with data.
+pub fn written(self: *Self, n: usize) void {
+ self.wp += n;
+}
+
+// Finds match length between previous and current position.
+// Used in hot path!
+pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
+ const max_len: usize = @min(self.wp - curr_pos, consts.match.max_length);
+ // lookahead buffers from previous and current positions
+ const prev_lh = self.buffer[prev_pos..][0..max_len];
+ const curr_lh = self.buffer[curr_pos..][0..max_len];
+
+ // If we already have a match (min_len > 0),
+ // test the first byte beyond the previous length, a[min_len] != b[min_len],
+ // and then all the bytes from that position down to zero.
+ // Those positions are more likely to differ than looping from the first bytes.
+ var i: usize = min_len;
+ if (i > 0) {
+ if (max_len <= i) return 0;
+ while (true) {
+ if (prev_lh[i] != curr_lh[i]) return 0;
+ if (i == 0) break;
+ i -= 1;
+ }
+ i = min_len;
+ }
+ while (i < max_len) : (i += 1)
+ if (prev_lh[i] != curr_lh[i]) break;
+ return if (i >= consts.match.min_length) @intCast(i) else 0;
+}
+
+// Current position of non-compressed data. Data before rp has already been
+// converted to tokens.
+pub fn pos(self: *Self) u16 {
+ return @intCast(self.rp);
+}
+
+// Notification that token list is cleared.
+pub fn flush(self: *Self) void {
+ self.fp = @intCast(self.rp);
+}
+
+// Part of the buffer since the last flush, or null if there was a slide in
+// between (which makes fp negative).
+pub fn tokensBuffer(self: *Self) ?[]const u8 {
+ assert(self.fp <= self.rp);
+ if (self.fp < 0) return null;
+ return self.buffer[@intCast(self.fp)..self.rp];
+}
+
+test "flate.SlidingWindow match" {
+ const data = "Blah blah blah blah blah!";
+ var win: Self = .{};
+ try expect(win.write(data) == data.len);
+ try expect(win.wp == data.len);
+ try expect(win.rp == 0);
+
+ // length between l symbols
+ try expect(win.match(1, 6, 0) == 18);
+ try expect(win.match(1, 11, 0) == 13);
+ try expect(win.match(1, 16, 0) == 8);
+ try expect(win.match(1, 21, 0) == 0);
+
+ // position 15 = "blah blah!"
+ // position 20 = "blah!"
+ try expect(win.match(15, 20, 0) == 4);
+ try expect(win.match(15, 20, 3) == 4);
+ try expect(win.match(15, 20, 4) == 0);
+}
+
+test "flate.SlidingWindow slide" {
+ var win: Self = .{};
+ win.wp = Self.buffer_len - 11;
+ win.rp = Self.buffer_len - 111;
+ win.buffer[win.rp] = 0xab;
+ try expect(win.lookahead().len == 100);
+ try expect(win.tokensBuffer().?.len == win.rp);
+
+ const n = win.slide();
+ try expect(n == 32757);
+ try expect(win.buffer[win.rp] == 0xab);
+ try expect(win.rp == Self.hist_len - 111);
+ try expect(win.wp == Self.hist_len - 11);
+ try expect(win.lookahead().len == 100);
+ try expect(win.tokensBuffer() == null);
+}
lib/std/compress/flate/Token.zig
@@ -0,0 +1,327 @@
+/// A Token can be a literal (a single byte of data) or a match: a reference to a
+/// slice of data earlier in the same stream, represented as <length, distance>,
+/// where length can be 3 - 258 bytes and distance 1 - 32768 bytes.
+///
+const std = @import("std");
+const assert = std.debug.assert;
+const print = std.debug.print;
+const expect = std.testing.expect;
+const consts = @import("consts.zig").match;
+
+const Token = @This();
+
+pub const Kind = enum(u1) {
+ literal,
+ match,
+};
+
+// Distance range 1 - 32768, stored in dist as 0 - 32767 (fits u15)
+dist: u15 = 0,
+// Length range 3 - 258, stored in len_lit as 0 - 255 (fits u8)
+len_lit: u8 = 0,
+kind: Kind = .literal,
+
+pub fn literal(t: Token) u8 {
+ return t.len_lit;
+}
+
+pub fn distance(t: Token) u16 {
+ return @as(u16, t.dist) + consts.min_distance;
+}
+
+pub fn length(t: Token) u16 {
+ return @as(u16, t.len_lit) + consts.base_length;
+}
+
+pub fn initLiteral(lit: u8) Token {
+ return .{ .kind = .literal, .len_lit = lit };
+}
+
+// distance range 1 - 32768, stored in dist as 0 - 32767 (u15)
+// length range 3 - 258, stored in len_lit as 0 - 255 (u8)
+pub fn initMatch(dist: u16, len: u16) Token {
+ assert(len >= consts.min_length and len <= consts.max_length);
+ assert(dist >= consts.min_distance and dist <= consts.max_distance);
+ return .{
+ .kind = .match,
+ .dist = @intCast(dist - consts.min_distance),
+ .len_lit = @intCast(len - consts.base_length),
+ };
+}
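+
+// Worked example of the stored ranges: initMatch(1, 3) stores dist = 0 and
+// len_lit = 0, while initMatch(32768, 258) stores dist = 32767 (max u15) and
+// len_lit = 255 (max u8); together with `kind` the whole token packs into 4
+// bytes (see the size test below).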
+
+pub fn eql(t: Token, o: Token) bool {
+ return t.kind == o.kind and
+ t.dist == o.dist and
+ t.len_lit == o.len_lit;
+}
+
+pub fn lengthCode(t: Token) u16 {
+ return match_lengths[match_lengths_index[t.len_lit]].code;
+}
+
+pub fn lengthEncoding(t: Token) MatchLength {
+ var c = match_lengths[match_lengths_index[t.len_lit]];
+ c.extra_length = t.len_lit - c.base_scaled;
+ return c;
+}
+
+// Returns the distance code corresponding to a specific distance.
+// Distance code is in range: 0 - 29.
+pub fn distanceCode(t: Token) u8 {
+ var dist: u16 = t.dist;
+ if (dist < match_distances_index.len) {
+ return match_distances_index[dist];
+ }
+ dist >>= 7;
+ if (dist < match_distances_index.len) {
+ return match_distances_index[dist] + 14;
+ }
+ dist >>= 7;
+ return match_distances_index[dist] + 28;
+}
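+
+// Worked example of the two-level lookup above: dist = 3 (distance 4) is in
+// range, so match_distances_index[3] = 3 gives code 3 directly. dist = 768
+// (distance 769) is out of range, so 768 >> 7 = 6 and
+// match_distances_index[6] + 14 = 5 + 14 = 19, the code whose base is 769.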
+
+pub fn distanceEncoding(t: Token) MatchDistance {
+ var c = match_distances[t.distanceCode()];
+ c.extra_distance = t.dist - c.base_scaled;
+ return c;
+}
+
+pub fn lengthExtraBits(code: u32) u8 {
+ return match_lengths[code - length_codes_start].extra_bits;
+}
+
+pub fn matchLength(code: u8) MatchLength {
+ return match_lengths[code];
+}
+
+pub fn matchDistance(code: u8) MatchDistance {
+ return match_distances[code];
+}
+
+pub fn distanceExtraBits(code: u32) u8 {
+ return match_distances[code].extra_bits;
+}
+
+pub fn show(t: Token) void {
+ if (t.kind == .literal) {
+ print("L('{c}'), ", .{t.literal()});
+ } else {
+ print("M({d}, {d}), ", .{ t.distance(), t.length() });
+ }
+}
+
+// Returns the index in the match_lengths table for each length in range 0-255.
+const match_lengths_index = [_]u8{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
+ 9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+ 17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
+ 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+ 19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 28,
+};
+
+const MatchLength = struct {
+ code: u16,
+ base_scaled: u8, // base - 3, scaled to fit into u8 (0-255), same as the len_lit field in Token.
+ base: u16, // 3-258
+ extra_length: u8 = 0,
+ extra_bits: u4,
+};
+
+// match_lengths represents table from rfc (https://datatracker.ietf.org/doc/html/rfc1951#page-12)
+//
+// Extra Extra Extra
+// Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
+// ---- ---- ------ ---- ---- ------- ---- ---- -------
+// 257 0 3 267 1 15,16 277 4 67-82
+// 258 0 4 268 1 17,18 278 4 83-98
+// 259 0 5 269 2 19-22 279 4 99-114
+// 260 0 6 270 2 23-26 280 4 115-130
+// 261 0 7 271 2 27-30 281 5 131-162
+// 262 0 8 272 2 31-34 282 5 163-194
+// 263 0 9 273 3 35-42 283 5 195-226
+// 264 0 10 274 3 43-50 284 5 227-257
+// 265 1 11,12 275 3 51-58 285 0 258
+// 266 1 13,14 276 3 59-66
+//
+pub const length_codes_start = 257;
+
+const match_lengths = [_]MatchLength{
+ .{ .extra_bits = 0, .base_scaled = 0, .base = 3, .code = 257 },
+ .{ .extra_bits = 0, .base_scaled = 1, .base = 4, .code = 258 },
+ .{ .extra_bits = 0, .base_scaled = 2, .base = 5, .code = 259 },
+ .{ .extra_bits = 0, .base_scaled = 3, .base = 6, .code = 260 },
+ .{ .extra_bits = 0, .base_scaled = 4, .base = 7, .code = 261 },
+ .{ .extra_bits = 0, .base_scaled = 5, .base = 8, .code = 262 },
+ .{ .extra_bits = 0, .base_scaled = 6, .base = 9, .code = 263 },
+ .{ .extra_bits = 0, .base_scaled = 7, .base = 10, .code = 264 },
+ .{ .extra_bits = 1, .base_scaled = 8, .base = 11, .code = 265 },
+ .{ .extra_bits = 1, .base_scaled = 10, .base = 13, .code = 266 },
+ .{ .extra_bits = 1, .base_scaled = 12, .base = 15, .code = 267 },
+ .{ .extra_bits = 1, .base_scaled = 14, .base = 17, .code = 268 },
+ .{ .extra_bits = 2, .base_scaled = 16, .base = 19, .code = 269 },
+ .{ .extra_bits = 2, .base_scaled = 20, .base = 23, .code = 270 },
+ .{ .extra_bits = 2, .base_scaled = 24, .base = 27, .code = 271 },
+ .{ .extra_bits = 2, .base_scaled = 28, .base = 31, .code = 272 },
+ .{ .extra_bits = 3, .base_scaled = 32, .base = 35, .code = 273 },
+ .{ .extra_bits = 3, .base_scaled = 40, .base = 43, .code = 274 },
+ .{ .extra_bits = 3, .base_scaled = 48, .base = 51, .code = 275 },
+ .{ .extra_bits = 3, .base_scaled = 56, .base = 59, .code = 276 },
+ .{ .extra_bits = 4, .base_scaled = 64, .base = 67, .code = 277 },
+ .{ .extra_bits = 4, .base_scaled = 80, .base = 83, .code = 278 },
+ .{ .extra_bits = 4, .base_scaled = 96, .base = 99, .code = 279 },
+ .{ .extra_bits = 4, .base_scaled = 112, .base = 115, .code = 280 },
+ .{ .extra_bits = 5, .base_scaled = 128, .base = 131, .code = 281 },
+ .{ .extra_bits = 5, .base_scaled = 160, .base = 163, .code = 282 },
+ .{ .extra_bits = 5, .base_scaled = 192, .base = 195, .code = 283 },
+ .{ .extra_bits = 5, .base_scaled = 224, .base = 227, .code = 284 },
+ .{ .extra_bits = 0, .base_scaled = 255, .base = 258, .code = 285 },
+};
+
+// Used in the distanceCode fn to get the index in the match_distances table for each distance in range 0-32767.
+const match_distances_index = [_]u8{
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+};
+
+const MatchDistance = struct {
+ base_scaled: u16, // base - 1, same as Token dist field
+ base: u16,
+ extra_distance: u16 = 0,
+ code: u8,
+ extra_bits: u4,
+};
+
+// match_distances represents table from rfc (https://datatracker.ietf.org/doc/html/rfc1951#page-12)
+//
+// Extra Extra Extra
+// Code Bits Dist Code Bits Dist Code Bits Distance
+// ---- ---- ---- ---- ---- ------ ---- ---- --------
+// 0 0 1 10 4 33-48 20 9 1025-1536
+// 1 0 2 11 4 49-64 21 9 1537-2048
+// 2 0 3 12 5 65-96 22 10 2049-3072
+// 3 0 4 13 5 97-128 23 10 3073-4096
+// 4 1 5,6 14 6 129-192 24 11 4097-6144
+// 5 1 7,8 15 6 193-256 25 11 6145-8192
+// 6 2 9-12 16 7 257-384 26 12 8193-12288
+// 7 2 13-16 17 7 385-512 27 12 12289-16384
+// 8 3 17-24 18 8 513-768 28 13 16385-24576
+// 9 3 25-32 19 8 769-1024 29 13 24577-32768
+//
+const match_distances = [_]MatchDistance{
+ .{ .extra_bits = 0, .base_scaled = 0x0000, .code = 0, .base = 1 },
+ .{ .extra_bits = 0, .base_scaled = 0x0001, .code = 1, .base = 2 },
+ .{ .extra_bits = 0, .base_scaled = 0x0002, .code = 2, .base = 3 },
+ .{ .extra_bits = 0, .base_scaled = 0x0003, .code = 3, .base = 4 },
+ .{ .extra_bits = 1, .base_scaled = 0x0004, .code = 4, .base = 5 },
+ .{ .extra_bits = 1, .base_scaled = 0x0006, .code = 5, .base = 7 },
+ .{ .extra_bits = 2, .base_scaled = 0x0008, .code = 6, .base = 9 },
+ .{ .extra_bits = 2, .base_scaled = 0x000c, .code = 7, .base = 13 },
+ .{ .extra_bits = 3, .base_scaled = 0x0010, .code = 8, .base = 17 },
+ .{ .extra_bits = 3, .base_scaled = 0x0018, .code = 9, .base = 25 },
+ .{ .extra_bits = 4, .base_scaled = 0x0020, .code = 10, .base = 33 },
+ .{ .extra_bits = 4, .base_scaled = 0x0030, .code = 11, .base = 49 },
+ .{ .extra_bits = 5, .base_scaled = 0x0040, .code = 12, .base = 65 },
+ .{ .extra_bits = 5, .base_scaled = 0x0060, .code = 13, .base = 97 },
+ .{ .extra_bits = 6, .base_scaled = 0x0080, .code = 14, .base = 129 },
+ .{ .extra_bits = 6, .base_scaled = 0x00c0, .code = 15, .base = 193 },
+ .{ .extra_bits = 7, .base_scaled = 0x0100, .code = 16, .base = 257 },
+ .{ .extra_bits = 7, .base_scaled = 0x0180, .code = 17, .base = 385 },
+ .{ .extra_bits = 8, .base_scaled = 0x0200, .code = 18, .base = 513 },
+ .{ .extra_bits = 8, .base_scaled = 0x0300, .code = 19, .base = 769 },
+ .{ .extra_bits = 9, .base_scaled = 0x0400, .code = 20, .base = 1025 },
+ .{ .extra_bits = 9, .base_scaled = 0x0600, .code = 21, .base = 1537 },
+ .{ .extra_bits = 10, .base_scaled = 0x0800, .code = 22, .base = 2049 },
+ .{ .extra_bits = 10, .base_scaled = 0x0c00, .code = 23, .base = 3073 },
+ .{ .extra_bits = 11, .base_scaled = 0x1000, .code = 24, .base = 4097 },
+ .{ .extra_bits = 11, .base_scaled = 0x1800, .code = 25, .base = 6145 },
+ .{ .extra_bits = 12, .base_scaled = 0x2000, .code = 26, .base = 8193 },
+ .{ .extra_bits = 12, .base_scaled = 0x3000, .code = 27, .base = 12289 },
+ .{ .extra_bits = 13, .base_scaled = 0x4000, .code = 28, .base = 16385 },
+ .{ .extra_bits = 13, .base_scaled = 0x6000, .code = 29, .base = 24577 },
+};
+
+test "flate.Token size" {
+ try expect(@sizeOf(Token) == 4);
+}
+
+// testing table https://datatracker.ietf.org/doc/html/rfc1951#page-12
+test "flate.Token MatchLength" {
+ var c = Token.initMatch(1, 4).lengthEncoding();
+ try expect(c.code == 258);
+ try expect(c.extra_bits == 0);
+ try expect(c.extra_length == 0);
+
+ c = Token.initMatch(1, 11).lengthEncoding();
+ try expect(c.code == 265);
+ try expect(c.extra_bits == 1);
+ try expect(c.extra_length == 0);
+
+ c = Token.initMatch(1, 12).lengthEncoding();
+ try expect(c.code == 265);
+ try expect(c.extra_bits == 1);
+ try expect(c.extra_length == 1);
+
+ c = Token.initMatch(1, 130).lengthEncoding();
+ try expect(c.code == 280);
+ try expect(c.extra_bits == 4);
+ try expect(c.extra_length == 130 - 115);
+}
+
+test "flate.Token MatchDistance" {
+ var c = Token.initMatch(1, 4).distanceEncoding();
+ try expect(c.code == 0);
+ try expect(c.extra_bits == 0);
+ try expect(c.extra_distance == 0);
+
+ c = Token.initMatch(192, 4).distanceEncoding();
+ try expect(c.code == 14);
+ try expect(c.extra_bits == 6);
+ try expect(c.extra_distance == 192 - 129);
+}
+
+test "flate.Token match_lengths" {
+ for (match_lengths, 0..) |ml, i| {
+ try expect(@as(u16, ml.base_scaled) + 3 == ml.base);
+ try expect(i + 257 == ml.code);
+ }
+
+ for (match_distances, 0..) |mo, i| {
+ try expect(mo.base_scaled + 1 == mo.base);
+ try expect(i == mo.code);
+ }
+}
lib/std/compress/flate/zlib.zig
@@ -0,0 +1,66 @@
+const deflate = @import("deflate.zig");
+const inflate = @import("inflate.zig");
+
+/// Decompress compressed data from reader and write plain data to the writer.
+pub fn decompress(reader: anytype, writer: anytype) !void {
+ try inflate.decompress(.zlib, reader, writer);
+}
+
+/// Decompressor type
+pub fn Decompressor(comptime ReaderType: type) type {
+ return inflate.Inflate(.zlib, ReaderType);
+}
+
+/// Create Decompressor which will read compressed data from reader.
+pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
+ return inflate.decompressor(.zlib, reader);
+}
+
+/// Compression level, trades between speed and compression size.
+pub const Options = deflate.Options;
+
+/// Compress plain data from reader and write compressed data to the writer.
+pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
+ try deflate.compress(.zlib, reader, writer, options);
+}
+
+/// Compressor type
+pub fn Compressor(comptime WriterType: type) type {
+ return deflate.Compressor(.zlib, WriterType);
+}
+
+/// Create Compressor which outputs compressed data to the writer.
+pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
+ return try deflate.compressor(.zlib, writer, options);
+}
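+
+// Example round trip (an illustrative sketch; `plain`, `cmp_buf` and `out_buf`
+// are placeholders, and `std` is assumed to be imported):
+//
+//   var in = std.io.fixedBufferStream(plain);
+//   var cmp = std.io.fixedBufferStream(&cmp_buf);
+//   try compress(in.reader(), cmp.writer(), .{});
+//
+//   var back = std.io.fixedBufferStream(cmp.getWritten());
+//   var out = std.io.fixedBufferStream(&out_buf);
+//   try decompress(back.reader(), out.writer());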
+
+/// Huffman-only compression, without Lempel-Ziv match searching. Faster
+/// compression and lower memory requirements, but bigger compressed sizes.
+pub const huffman = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.huffman.compress(.zlib, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.huffman.Compressor(.zlib, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
+ return deflate.huffman.compressor(.zlib, writer);
+ }
+};
+
+// No compression, store only. Compressed size is slightly bigger than plain.
+pub const store = struct {
+ pub fn compress(reader: anytype, writer: anytype) !void {
+ try deflate.store.compress(.zlib, reader, writer);
+ }
+
+ pub fn Compressor(comptime WriterType: type) type {
+ return deflate.store.Compressor(.zlib, WriterType);
+ }
+
+ pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
+ return deflate.store.compressor(.zlib, writer);
+ }
+};
lib/std/compress/gzip.zig
@@ -6,7 +6,7 @@ const io = std.io;
const fs = std.fs;
const testing = std.testing;
const mem = std.mem;
-const deflate = std.compress.deflate;
+const deflate = @import("deflate.zig");
const magic = &[2]u8{ 0x1f, 0x8b };
lib/std/compress/zlib.zig
@@ -6,7 +6,7 @@ const io = std.io;
const fs = std.fs;
const testing = std.testing;
const mem = std.mem;
-const deflate = std.compress.deflate;
+const deflate = @import("deflate.zig");
// Zlib header format as specified in RFC1950
const ZLibHeader = packed struct {
lib/std/http/Client.zig
@@ -404,8 +404,8 @@ pub const RequestTransfer = union(enum) {
/// The decompressor for response messages.
pub const Compression = union(enum) {
- pub const DeflateDecompressor = std.compress.zlib.DecompressStream(Request.TransferReader);
- pub const GzipDecompressor = std.compress.gzip.Decompress(Request.TransferReader);
+ pub const DeflateDecompressor = std.compress.zlib.Decompressor(Request.TransferReader);
+ pub const GzipDecompressor = std.compress.gzip.Decompressor(Request.TransferReader);
pub const ZstdDecompressor = std.compress.zstd.DecompressStream(Request.TransferReader, .{});
deflate: DeflateDecompressor,
@@ -601,8 +601,8 @@ pub const Request = struct {
pub fn deinit(req: *Request) void {
switch (req.response.compression) {
.none => {},
- .deflate => |*deflate| deflate.deinit(),
- .gzip => |*gzip| gzip.deinit(),
+ .deflate => {},
+ .gzip => {},
.zstd => |*zstd| zstd.deinit(),
}
@@ -632,8 +632,8 @@ pub const Request = struct {
switch (req.response.compression) {
.none => {},
- .deflate => |*deflate| deflate.deinit(),
- .gzip => |*gzip| gzip.deinit(),
+ .deflate => {},
+ .gzip => {},
.zstd => |*zstd| zstd.deinit(),
}
@@ -941,10 +941,10 @@ pub const Request = struct {
.identity => req.response.compression = .none,
.compress, .@"x-compress" => return error.CompressionNotSupported,
.deflate => req.response.compression = .{
- .deflate = std.compress.zlib.decompressStream(req.client.allocator, req.transferReader()) catch return error.CompressionInitializationFailed,
+ .deflate = std.compress.zlib.decompressor(req.transferReader()),
},
.gzip, .@"x-gzip" => req.response.compression = .{
- .gzip = std.compress.gzip.decompress(req.client.allocator, req.transferReader()) catch return error.CompressionInitializationFailed,
+ .gzip = std.compress.gzip.decompressor(req.transferReader()),
},
.zstd => req.response.compression = .{
.zstd = std.compress.zstd.decompressStream(req.client.allocator, req.transferReader()),
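
The gzip and deflate branches above drop both the allocator and the catch, since the new decompressors cannot fail at construction; the deinit calls earlier in the file become no-ops for the same reason. The same migration applies to any caller of the old streaming API; a sketch from a caller's point of view, where the helper name and buffer are illustrative and not from the commit:

const std = @import("std");

// Hypothetical helper: read a whole zlib-encoded body into `out` with the
// new interface. No allocator, no fallible init, no deinit to pair with it.
fn readDeflateBody(transfer_reader: anytype, out: []u8) !usize {
    var dcp = std.compress.zlib.decompressor(transfer_reader);
    return dcp.reader().readAll(out);
}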
lib/std/http/Server.zig
@@ -195,8 +195,8 @@ pub const ResponseTransfer = union(enum) {
/// The decompressor for request messages.
pub const Compression = union(enum) {
- pub const DeflateDecompressor = std.compress.zlib.DecompressStream(Response.TransferReader);
- pub const GzipDecompressor = std.compress.gzip.Decompress(Response.TransferReader);
+ pub const DeflateDecompressor = std.compress.zlib.Decompressor(Response.TransferReader);
+ pub const GzipDecompressor = std.compress.gzip.Decompressor(Response.TransferReader);
pub const ZstdDecompressor = std.compress.zstd.DecompressStream(Response.TransferReader, .{});
deflate: DeflateDecompressor,
@@ -420,8 +420,8 @@ pub const Response = struct {
switch (res.request.compression) {
.none => {},
- .deflate => |*deflate| deflate.deinit(),
- .gzip => |*gzip| gzip.deinit(),
+ .deflate => {},
+ .gzip => {},
.zstd => |*zstd| zstd.deinit(),
}
@@ -605,10 +605,10 @@ pub const Response = struct {
.identity => res.request.compression = .none,
.compress, .@"x-compress" => return error.CompressionNotSupported,
.deflate => res.request.compression = .{
- .deflate = std.compress.zlib.decompressStream(res.allocator, res.transferReader()) catch return error.CompressionInitializationFailed,
+ .deflate = std.compress.zlib.decompressor(res.transferReader()),
},
.gzip, .@"x-gzip" => res.request.compression = .{
- .gzip = std.compress.gzip.decompress(res.allocator, res.transferReader()) catch return error.CompressionInitializationFailed,
+ .gzip = std.compress.gzip.decompressor(res.transferReader()),
},
.zstd => res.request.compression = .{
.zstd = std.compress.zstd.decompressStream(res.allocator, res.transferReader()),
lib/std/compress.zig
@@ -1,13 +1,21 @@
const std = @import("std.zig");
-pub const deflate = @import("compress/deflate.zig");
-pub const gzip = @import("compress/gzip.zig");
pub const lzma = @import("compress/lzma.zig");
pub const lzma2 = @import("compress/lzma2.zig");
pub const xz = @import("compress/xz.zig");
-pub const zlib = @import("compress/zlib.zig");
pub const zstd = @import("compress/zstandard.zig");
+pub const flate = @import("compress/flate/root.zig").flate;
+pub const gzip = @import("compress/flate/root.zig").gzip;
+pub const zlib = @import("compress/flate/root.zig").zlib;
+
+// Version 1 interface
+pub const v1 = struct {
+ pub const deflate = @import("compress/deflate.zig");
+ pub const gzip = @import("compress/gzip.zig");
+ pub const zlib = @import("compress/zlib.zig");
+};
+
pub fn HashedReader(
comptime ReaderType: anytype,
comptime HasherType: anytype,
@@ -69,11 +77,14 @@ pub fn hashedWriter(
}
test {
- _ = deflate;
- _ = gzip;
+ _ = v1.deflate;
+ _ = v1.gzip;
_ = lzma;
_ = lzma2;
_ = xz;
- _ = zlib;
+ _ = v1.zlib;
_ = zstd;
+ _ = flate;
+ _ = gzip;
+ _ = zlib;
}
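
Both generations stay reachable during the transition: the new allocation-free flate, gzip, and zlib take over the top-level names, while the old modules move under std.compress.v1, so existing callers of the legacy interface only change their import path. A sketch of how callers address each generation (paths taken from the hunk above, aliases are illustrative):

const std = @import("std");

// New implementations: raw deflate plus the gzip and zlib framings.
const flate = std.compress.flate;
const gzip = std.compress.gzip;
const zlib = std.compress.zlib;

// Legacy interface, kept temporarily under the v1 namespace.
const deflate_v1 = std.compress.v1.deflate;
const gzip_v1 = std.compress.v1.gzip;
const zlib_v1 = std.compress.v1.zlib;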
lib/std/debug.zig
@@ -1212,8 +1212,7 @@ pub fn readElfDebugInfo(
const chdr = section_reader.readStruct(elf.Chdr) catch continue;
if (chdr.ch_type != .ZLIB) continue;
- var zlib_stream = std.compress.zlib.decompressStream(allocator, section_stream.reader()) catch continue;
- defer zlib_stream.deinit();
+ var zlib_stream = std.compress.zlib.decompressor(section_stream.reader());
const decompressed_section = try allocator.alloc(u8, chdr.ch_size);
errdefer allocator.free(decompressed_section);
src/link/Elf/Object.zig
@@ -902,9 +902,7 @@ pub fn codeDecompressAlloc(self: Object, elf_file: *Elf, atom_index: Atom.Index)
switch (chdr.ch_type) {
.ZLIB => {
var stream = std.io.fixedBufferStream(data[@sizeOf(elf.Elf64_Chdr)..]);
- var zlib_stream = std.compress.zlib.decompressStream(gpa, stream.reader()) catch
- return error.InputOutput;
- defer zlib_stream.deinit();
+ var zlib_stream = std.compress.zlib.decompressor(stream.reader());
const size = std.math.cast(usize, chdr.ch_size) orelse return error.Overflow;
const decomp = try gpa.alloc(u8, size);
const nread = zlib_stream.reader().readAll(decomp) catch return error.InputOutput;
src/Package/Fetch/git.zig
@@ -1115,8 +1115,7 @@ fn indexPackFirstPass(
const entry_header = try EntryHeader.read(entry_crc32_reader.reader());
switch (entry_header) {
inline .commit, .tree, .blob, .tag => |object, tag| {
- var entry_decompress_stream = try std.compress.zlib.decompressStream(allocator, entry_crc32_reader.reader());
- defer entry_decompress_stream.deinit();
+ var entry_decompress_stream = std.compress.zlib.decompressor(entry_crc32_reader.reader());
var entry_counting_reader = std.io.countingReader(entry_decompress_stream.reader());
var entry_hashed_writer = hashedWriter(std.io.null_writer, Sha1.init(.{}));
const entry_writer = entry_hashed_writer.writer();
@@ -1135,8 +1134,7 @@ fn indexPackFirstPass(
});
},
inline .ofs_delta, .ref_delta => |delta| {
- var entry_decompress_stream = try std.compress.zlib.decompressStream(allocator, entry_crc32_reader.reader());
- defer entry_decompress_stream.deinit();
+ var entry_decompress_stream = std.compress.zlib.decompressor(entry_crc32_reader.reader());
var entry_counting_reader = std.io.countingReader(entry_decompress_stream.reader());
var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
try fifo.pump(entry_counting_reader.reader(), std.io.null_writer);
@@ -1257,8 +1255,7 @@ fn resolveDeltaChain(
fn readObjectRaw(allocator: Allocator, reader: anytype, size: u64) ![]u8 {
const alloc_size = std.math.cast(usize, size) orelse return error.ObjectTooLarge;
var buffered_reader = std.io.bufferedReader(reader);
- var decompress_stream = try std.compress.zlib.decompressStream(allocator, buffered_reader.reader());
- defer decompress_stream.deinit();
+ var decompress_stream = std.compress.zlib.decompressor(buffered_reader.reader());
const data = try allocator.alloc(u8, alloc_size);
errdefer allocator.free(data);
try decompress_stream.reader().readNoEof(data);
src/Package/Fetch.zig
@@ -1099,7 +1099,12 @@ fn unpackResource(
switch (file_type) {
.tar => try unpackTarball(f, tmp_directory.handle, resource.reader()),
- .@"tar.gz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.gzip),
+ .@"tar.gz" => {
+ const reader = resource.reader();
+ var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, reader);
+ var dcp = std.compress.gzip.decompressor(br.reader());
+ try unpackTarball(f, tmp_directory.handle, dcp.reader());
+ },
.@"tar.xz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.xz),
.@"tar.zst" => try unpackTarballCompressed(f, tmp_directory.handle, resource, ZstdWrapper),
.git_pack => unpackGitPack(f, tmp_directory.handle, resource) catch |err| switch (err) {
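
The .tar.gz branch no longer goes through unpackTarballCompressed because the new gzip decompressor wraps a reader directly: buffer the underlying reads, then hand dcp.reader() to the tar unpacker as a plain stream. The same pattern applied to a local file, where the function name, directories, and the std.tar call are illustrative assumptions rather than part of the commit:

const std = @import("std");

fn extractTarGz(cwd: std.fs.Dir, path: []const u8, out_dir: std.fs.Dir) !void {
    var file = try cwd.openFile(path, .{});
    defer file.close();

    // Buffer the file reads, then stream through the allocation-free gzip
    // decompressor; downstream consumers see an ordinary tar stream.
    var br = std.io.bufferedReader(file.reader());
    var dcp = std.compress.gzip.decompressor(br.reader());
    try std.tar.pipeToFileSystem(out_dir, dcp.reader(), .{});
}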
src/objcopy.zig
@@ -1298,8 +1298,7 @@ const ElfFileHelper = struct {
try compressed_stream.writer().writeAll(prefix);
{
- var compressor = try std.compress.zlib.compressStream(allocator, compressed_stream.writer(), .{});
- defer compressor.deinit();
+ var compressor = try std.compress.zlib.compressor(compressed_stream.writer(), .{});
var buf: [8000]u8 = undefined;
while (true) {
build.zig
@@ -151,6 +151,7 @@ pub fn build(b: *std.Build) !void {
"rfc1952.txt",
"rfc8478.txt",
// exclude files from lib/std/compress/deflate/testdata
+ // and lib/std/compress/flate/testdata
".expect",
".expect-noinput",
".golden",