Commit 83513ade35
Changed files (24)
lib/std/compress/flate/bit_reader.zig
@@ -1,422 +0,0 @@
-const std = @import("std");
-const assert = std.debug.assert;
-const testing = std.testing;
-
-pub fn bitReader(comptime T: type, reader: anytype) BitReader(T, @TypeOf(reader)) {
- return BitReader(T, @TypeOf(reader)).init(reader);
-}
-
-pub fn BitReader64(comptime ReaderType: type) type {
- return BitReader(u64, ReaderType);
-}
-
-pub fn BitReader32(comptime ReaderType: type) type {
- return BitReader(u32, ReaderType);
-}
-
-/// Bit reader used during inflate (decompression). Has an internal buffer of
-/// 32 or 64 bits which shifts right after bits are consumed. Uses forward_reader
-/// to fill that internal buffer when needed.
-///
-/// readF is the core function. It supports a few different ways of getting bits,
-/// controlled by flags. In the hot path we avoid checking whether the buffer
-/// needs to be filled from forward_reader by calling fill in advance and then
-/// calling readF with the buffered flag set.
-///
-pub fn BitReader(comptime T: type, comptime ReaderType: type) type {
- assert(T == u32 or T == u64);
- const t_bytes: usize = @sizeOf(T);
- const Tshift = if (T == u64) u6 else u5;
-
- return struct {
- // Underlying reader used for filling internal bits buffer
- forward_reader: ReaderType = undefined,
- // Internal bit buffer (T is u32 or u64)
- bits: T = 0,
- // Number of bits in the buffer
- nbits: u32 = 0,
-
- const Self = @This();
-
- pub const Error = ReaderType.Error || error{EndOfStream};
-
- pub fn init(rdr: ReaderType) Self {
- var self = Self{ .forward_reader = rdr };
- self.fill(1) catch {};
- return self;
- }
-
- /// Try to have `nice` bits available in the buffer. Reads from the
- /// forward reader if there are fewer than `nice` bits in the buffer.
- /// Returns an error only if the end of the forward stream is reached
- /// and the internal buffer is empty. It will not error if fewer than
- /// `nice` bits are in the buffer, only when all bits are exhausted.
- /// During inflate we usually know the maximum number of bits for the
- /// next step, but that step will usually need fewer bits to decode. So
- /// `nice` is not a hard limit; fill just tries to have that many bits
- /// available. If the end of the forward stream is reached there may be
- /// some extra zero bits in the buffer.
- pub inline fn fill(self: *Self, nice: u6) !void {
- if (self.nbits >= nice and nice != 0) {
- return; // We have enough bits
- }
- // Read more bits from forward reader
-
- // Number of empty bytes in `bits`; nbits rounded up to whole bytes.
- const empty_bytes =
- @as(u8, if (self.nbits & 0x7 == 0) t_bytes else t_bytes - 1) - // t_bytes when nbits is a multiple of 8, t_bytes - 1 otherwise
- (self.nbits >> 3); // 0 for 0-7, 1 for 8-15, ... same as / 8
-
- var buf: [t_bytes]u8 = [_]u8{0} ** t_bytes;
- const bytes_read = self.forward_reader.readAll(buf[0..empty_bytes]) catch 0;
- if (bytes_read > 0) {
- const u: T = std.mem.readInt(T, buf[0..t_bytes], .little);
- self.bits |= u << @as(Tshift, @intCast(self.nbits));
- self.nbits += 8 * @as(u8, @intCast(bytes_read));
- return;
- }
-
- if (self.nbits == 0)
- return error.EndOfStream;
- }
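-
- // Worked example of the empty_bytes math above (T == u64, t_bytes == 8):
- // with nbits == 13, nbits & 0x7 != 0, so empty_bytes == 7 - (13 >> 3) == 6;
- // with nbits == 16, nbits & 0x7 == 0, so empty_bytes == 8 - (16 >> 3) == 6.
- // Either way fill reads at most the free whole bytes, never touching the
- // bits already buffered in `bits`.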
-
- /// Read exactly buf.len bytes into buf.
- pub fn readAll(self: *Self, buf: []u8) !void {
- assert(self.alignBits() == 0); // internal bits must be at byte boundary
-
- // First read from internal bits buffer.
- var n: usize = 0;
- while (self.nbits > 0 and n < buf.len) {
- buf[n] = try self.readF(u8, flag.buffered);
- n += 1;
- }
- // Then use forward reader for all other bytes.
- try self.forward_reader.readNoEof(buf[n..]);
- }
-
- pub const flag = struct {
- pub const peek: u3 = 0b001; // don't advance internal buffer, just get bits, leave them in buffer
- pub const buffered: u3 = 0b010; // assume that there is no need to fill; fill should be called before
- pub const reverse: u3 = 0b100; // bit-reverse the read bits
- };
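-
- // Hypothetical hot-path sketch: one fill call covers several buffered
- // reads, so the buffer check is paid once per decode step:
- //
- // fn decodeStep(self: *Self) !void {
- //     try self.fill(15); // enough for the worst case below
- //     const sym = try self.readF(u7, flag.buffered | flag.reverse);
- //     const extra = try self.readF(u4, flag.buffered);
- //     _ = sym;
- //     _ = extra;
- // }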
-
- /// Alias for readF(U, 0).
- pub fn read(self: *Self, comptime U: type) !U {
- return self.readF(U, 0);
- }
-
- /// Alias for readF with flag.peek set.
- pub inline fn peekF(self: *Self, comptime U: type, comptime how: u3) !U {
- return self.readF(U, how | flag.peek);
- }
-
- /// Read with flags provided.
- pub fn readF(self: *Self, comptime U: type, comptime how: u3) !U {
- if (U == T) {
- assert(how == 0);
- assert(self.alignBits() == 0);
- try self.fill(@bitSizeOf(T));
- if (self.nbits != @bitSizeOf(T)) return error.EndOfStream;
- const v = self.bits;
- self.nbits = 0;
- self.bits = 0;
- return v;
- }
- const n: Tshift = @bitSizeOf(U);
- switch (how) {
- 0 => { // `normal` read
- try self.fill(n); // ensure that there are n bits in the buffer
- const u: U = @truncate(self.bits); // get n bits
- try self.shift(n); // advance buffer for n
- return u;
- },
- (flag.peek) => { // no shift, leave bits in the buffer
- try self.fill(n);
- return @truncate(self.bits);
- },
- flag.buffered => { // no fill, assume that buffer has enough bits
- const u: U = @truncate(self.bits);
- try self.shift(n);
- return u;
- },
- (flag.reverse) => { // same as 0 with bit reverse
- try self.fill(n);
- const u: U = @truncate(self.bits);
- try self.shift(n);
- return @bitReverse(u);
- },
- (flag.peek | flag.reverse) => {
- try self.fill(n);
- return @bitReverse(@as(U, @truncate(self.bits)));
- },
- (flag.buffered | flag.reverse) => {
- const u: U = @truncate(self.bits);
- try self.shift(n);
- return @bitReverse(u);
- },
- (flag.peek | flag.buffered) => {
- return @truncate(self.bits);
- },
- (flag.peek | flag.buffered | flag.reverse) => {
- return @bitReverse(@as(U, @truncate(self.bits)));
- },
- }
- }
-
- /// Read n number of bits.
- /// Only buffered flag can be used in how.
- pub fn readN(self: *Self, n: u4, comptime how: u3) !u16 {
- switch (how) {
- 0 => {
- try self.fill(n);
- },
- flag.buffered => {},
- else => unreachable,
- }
- const mask: u16 = (@as(u16, 1) << n) - 1;
- const u: u16 = @as(u16, @truncate(self.bits)) & mask;
- try self.shift(n);
- return u;
- }
-
- /// Advance buffer for n bits.
- pub fn shift(self: *Self, n: Tshift) !void {
- if (n > self.nbits) return error.EndOfStream;
- self.bits >>= n;
- self.nbits -= n;
- }
-
- /// Skip n bytes.
- pub fn skipBytes(self: *Self, n: u16) !void {
- for (0..n) |_| {
- try self.fill(8);
- try self.shift(8);
- }
- }
-
- // Number of bits to align stream to the byte boundary.
- fn alignBits(self: *Self) u3 {
- return @intCast(self.nbits & 0x7);
- }
-
- /// Align stream to the byte boundary.
- pub fn alignToByte(self: *Self) void {
- const ab = self.alignBits();
- if (ab > 0) self.shift(ab) catch unreachable;
- }
-
- /// Skip zero terminated string.
- pub fn skipStringZ(self: *Self) !void {
- while (true) {
- if (try self.readF(u8, 0) == 0) break;
- }
- }
-
- /// Read deflate fixed code.
- /// Reads the first 7 bits, and then maybe 1 or 2 more to get the full 7, 8 or 9 bit code.
- /// ref: https://datatracker.ietf.org/doc/html/rfc1951#page-12
- /// Lit Value Bits Codes
- /// --------- ---- -----
- /// 0 - 143 8 00110000 through
- /// 10111111
- /// 144 - 255 9 110010000 through
- /// 111111111
- /// 256 - 279 7 0000000 through
- /// 0010111
- /// 280 - 287 8 11000000 through
- /// 11000111
- pub fn readFixedCode(self: *Self) !u16 {
- try self.fill(7 + 2);
- const code7 = try self.readF(u7, flag.buffered | flag.reverse);
- if (code7 <= 0b0010_111) { // 7 bits, 256-279, codes 0000_000 - 0010_111
- return @as(u16, code7) + 256;
- } else if (code7 <= 0b1011_111) { // 8 bits, 0-143, codes 0011_0000 through 1011_1111
- return (@as(u16, code7) << 1) + @as(u16, try self.readF(u1, flag.buffered)) - 0b0011_0000;
- } else if (code7 <= 0b1100_011) { // 8 bit, 280-287, codes 1100_0000 - 1100_0111
- return (@as(u16, code7 - 0b1100000) << 1) + try self.readF(u1, flag.buffered) + 280;
- } else { // 9 bit, 144-255, codes 1_1001_0000 - 1_1111_1111
- return (@as(u16, code7 - 0b1100_100) << 2) + @as(u16, try self.readF(u2, flag.buffered | flag.reverse)) + 144;
- }
- }
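-
- // Worked example: literal 'A' (65) has the 8-bit fixed code
- // 0b00110000 + 65 == 0b0111_0001. The first 7 (reversed) bits give
- // code7 == 0b0111_000, which is above 0b0010_111 and at most 0b1011_111,
- // so one more bit is read: (0b0111_000 << 1) + 1 - 0b0011_0000 == 65.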
- };
-}
-
-test "readF" {
- var fbs = std.io.fixedBufferStream(&[_]u8{ 0xf3, 0x48, 0xcd, 0xc9, 0x00, 0x00 });
- var br = bitReader(u64, fbs.reader());
- const F = BitReader64(@TypeOf(fbs.reader())).flag;
-
- try testing.expectEqual(@as(u8, 48), br.nbits);
- try testing.expectEqual(@as(u64, 0xc9cd48f3), br.bits);
-
- try testing.expect(try br.readF(u1, 0) == 0b0000_0001);
- try testing.expect(try br.readF(u2, 0) == 0b0000_0001);
- try testing.expectEqual(@as(u8, 48 - 3), br.nbits);
- try testing.expectEqual(@as(u3, 5), br.alignBits());
-
- try testing.expect(try br.readF(u8, F.peek) == 0b0001_1110);
- try testing.expect(try br.readF(u9, F.peek) == 0b1_0001_1110);
- try br.shift(9);
- try testing.expectEqual(@as(u8, 36), br.nbits);
- try testing.expectEqual(@as(u3, 4), br.alignBits());
-
- try testing.expect(try br.readF(u4, 0) == 0b0100);
- try testing.expectEqual(@as(u8, 32), br.nbits);
- try testing.expectEqual(@as(u3, 0), br.alignBits());
-
- try br.shift(1);
- try testing.expectEqual(@as(u3, 7), br.alignBits());
- try br.shift(1);
- try testing.expectEqual(@as(u3, 6), br.alignBits());
- br.alignToByte();
- try testing.expectEqual(@as(u3, 0), br.alignBits());
-
- try testing.expectEqual(@as(u64, 0xc9), br.bits);
- try testing.expectEqual(@as(u16, 0x9), try br.readN(4, 0));
- try testing.expectEqual(@as(u16, 0xc), try br.readN(4, 0));
-}
-
-test "read block type 1 data" {
- inline for ([_]type{ u64, u32 }) |T| {
- const data = [_]u8{
- 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
- 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
- 0x0c, 0x01, 0x02, 0x03, //
- 0xaa, 0xbb, 0xcc, 0xdd,
- };
- var fbs = std.io.fixedBufferStream(&data);
- var br = bitReader(T, fbs.reader());
- const F = BitReader(T, @TypeOf(fbs.reader())).flag;
-
- try testing.expectEqual(@as(u1, 1), try br.readF(u1, 0)); // bfinal
- try testing.expectEqual(@as(u2, 1), try br.readF(u2, 0)); // block_type
-
- for ("Hello world\n") |c| {
- try testing.expectEqual(@as(u8, c), try br.readF(u8, F.reverse) - 0x30);
- }
- try testing.expectEqual(@as(u7, 0), try br.readF(u7, 0)); // end of block
- br.alignToByte();
- try testing.expectEqual(@as(u32, 0x0302010c), try br.readF(u32, 0));
- try testing.expectEqual(@as(u16, 0xbbaa), try br.readF(u16, 0));
- try testing.expectEqual(@as(u16, 0xddcc), try br.readF(u16, 0));
- }
-}
-
-test "shift/fill" {
- const data = [_]u8{
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
- };
- var fbs = std.io.fixedBufferStream(&data);
- var br = bitReader(u64, fbs.reader());
-
- try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits);
- try br.shift(8);
- try testing.expectEqual(@as(u64, 0x00_08_07_06_05_04_03_02), br.bits);
- try br.fill(60); // fill with 1 byte
- try testing.expectEqual(@as(u64, 0x01_08_07_06_05_04_03_02), br.bits);
- try br.shift(8 * 4 + 4);
- try testing.expectEqual(@as(u64, 0x00_00_00_00_00_10_80_70), br.bits);
-
- try br.fill(60); // fill with 4 bytes (shift by 4)
- try testing.expectEqual(@as(u64, 0x00_50_40_30_20_10_80_70), br.bits);
- try testing.expectEqual(@as(u8, 8 * 7 + 4), br.nbits);
-
- try br.shift(@intCast(br.nbits)); // clear buffer
- try br.fill(8); // refill with the rest of the bytes
- try testing.expectEqual(@as(u64, 0x00_00_00_00_00_08_07_06), br.bits);
-}
-
-test "readAll" {
- inline for ([_]type{ u64, u32 }) |T| {
- const data = [_]u8{
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
- };
- var fbs = std.io.fixedBufferStream(&data);
- var br = bitReader(T, fbs.reader());
-
- switch (T) {
- u64 => try testing.expectEqual(@as(u64, 0x08_07_06_05_04_03_02_01), br.bits),
- u32 => try testing.expectEqual(@as(u32, 0x04_03_02_01), br.bits),
- else => unreachable,
- }
-
- var out: [16]u8 = undefined;
- try br.readAll(out[0..]);
- try testing.expect(br.nbits == 0);
- try testing.expect(br.bits == 0);
-
- try testing.expectEqualSlices(u8, data[0..16], &out);
- }
-}
-
-test "readFixedCode" {
- inline for ([_]type{ u64, u32 }) |T| {
- const fixed_codes = @import("huffman_encoder.zig").fixed_codes;
-
- var fbs = std.io.fixedBufferStream(&fixed_codes);
- var rdr = bitReader(T, fbs.reader());
-
- for (0..286) |c| {
- try testing.expectEqual(c, try rdr.readFixedCode());
- }
- try testing.expect(rdr.nbits == 0);
- }
-}
-
-test "u32 leaves no bits on u32 reads" {
- const data = [_]u8{
- 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- };
- var fbs = std.io.fixedBufferStream(&data);
- var br = bitReader(u32, fbs.reader());
-
- _ = try br.read(u3);
- try testing.expectEqual(29, br.nbits);
- br.alignToByte();
- try testing.expectEqual(24, br.nbits);
- try testing.expectEqual(0x04_03_02_01, try br.read(u32));
- try testing.expectEqual(0, br.nbits);
- try testing.expectEqual(0x08_07_06_05, try br.read(u32));
- try testing.expectEqual(0, br.nbits);
-
- _ = try br.read(u9);
- try testing.expectEqual(23, br.nbits);
- br.alignToByte();
- try testing.expectEqual(16, br.nbits);
- try testing.expectEqual(0x0e_0d_0c_0b, try br.read(u32));
- try testing.expectEqual(0, br.nbits);
-}
-
-test "u64 need fill after alignToByte" {
- const data = [_]u8{
- 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- };
-
- // without fill
- var fbs = std.io.fixedBufferStream(&data);
- var br = bitReader(u64, fbs.reader());
- _ = try br.read(u23);
- try testing.expectEqual(41, br.nbits);
- br.alignToByte();
- try testing.expectEqual(40, br.nbits);
- try testing.expectEqual(0x06_05_04_03, try br.read(u32));
- try testing.expectEqual(8, br.nbits);
- try testing.expectEqual(0x0a_09_08_07, try br.read(u32));
- try testing.expectEqual(32, br.nbits);
-
- // fill after align ensures all bits filled
- fbs.reset();
- br = bitReader(u64, fbs.reader());
- _ = try br.read(u23);
- try testing.expectEqual(41, br.nbits);
- br.alignToByte();
- try br.fill(0);
- try testing.expectEqual(64, br.nbits);
- try testing.expectEqual(0x06_05_04_03, try br.read(u32));
- try testing.expectEqual(32, br.nbits);
- try testing.expectEqual(0x0a_09_08_07, try br.read(u32));
- try testing.expectEqual(0, br.nbits);
-}
lib/std/compress/flate/bit_writer.zig
@@ -1,99 +0,0 @@
-const std = @import("std");
-const assert = std.debug.assert;
-
-/// Bit writer for use in deflate (compression).
-///
-/// Has an internal bit buffer of 64 bits and an internal byte buffer of 248
-/// bytes. Once 48 bits accumulate, 6 bytes are moved to the byte buffer. Once
-/// 240 bytes accumulate, they are flushed to the underlying inner_writer.
-///
-pub fn BitWriter(comptime WriterType: type) type {
- // buffer_flush_size indicates the buffer size
- // after which bytes are flushed to the writer.
- // Should preferably be a multiple of 6, since
- // we accumulate 6 bytes between writes to the buffer.
- const buffer_flush_size = 240;
-
- // buffer_size is the actual output byte buffer size.
- // It must have additional headroom for a flush
- // which can contain up to 8 bytes.
- const buffer_size = buffer_flush_size + 8;
-
- return struct {
- inner_writer: WriterType,
-
- // Data waiting to be written is bytes[0 .. nbytes]
- // and then the low nbits of bits. Data is always written
- // sequentially into the bytes array.
- bits: u64 = 0,
- nbits: u32 = 0, // number of bits
- bytes: [buffer_size]u8 = undefined,
- nbytes: u32 = 0, // number of bytes
-
- const Self = @This();
-
- pub const Error = WriterType.Error || error{UnfinishedBits};
-
- pub fn init(writer: WriterType) Self {
- return .{ .inner_writer = writer };
- }
-
- pub fn setWriter(self: *Self, new_writer: WriterType) void {
- //assert(self.bits == 0 and self.nbits == 0 and self.nbytes == 0);
- self.inner_writer = new_writer;
- }
-
- pub fn flush(self: *Self) Error!void {
- var n = self.nbytes;
- while (self.nbits != 0) {
- self.bytes[n] = @as(u8, @truncate(self.bits));
- self.bits >>= 8;
- if (self.nbits > 8) { // Avoid underflow
- self.nbits -= 8;
- } else {
- self.nbits = 0;
- }
- n += 1;
- }
- self.bits = 0;
- _ = try self.inner_writer.write(self.bytes[0..n]);
- self.nbytes = 0;
- }
-
- pub fn writeBits(self: *Self, b: u32, nb: u32) Error!void {
- self.bits |= @as(u64, @intCast(b)) << @as(u6, @intCast(self.nbits));
- self.nbits += nb;
- if (self.nbits < 48)
- return;
-
- var n = self.nbytes;
- std.mem.writeInt(u64, self.bytes[n..][0..8], self.bits, .little);
- n += 6;
- if (n >= buffer_flush_size) {
- _ = try self.inner_writer.write(self.bytes[0..n]);
- n = 0;
- }
- self.nbytes = n;
- self.bits >>= 48;
- self.nbits -= 48;
- }
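-
- // Worked example: with nbits == 44, writeBits(b, 10) brings nbits to 54.
- // Since 54 >= 48, the low 6 bytes of `bits` are copied into `bytes`,
- // `bits` shifts right by 48 and nbits drops to 6; the remaining 6 bits
- // wait for the next call.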
-
- pub fn writeBytes(self: *Self, bytes: []const u8) Error!void {
- var n = self.nbytes;
- if (self.nbits & 7 != 0) {
- return error.UnfinishedBits;
- }
- while (self.nbits != 0) {
- self.bytes[n] = @as(u8, @truncate(self.bits));
- self.bits >>= 8;
- self.nbits -= 8;
- n += 1;
- }
- if (n != 0) {
- _ = try self.inner_writer.write(self.bytes[0..n]);
- }
- self.nbytes = 0;
- _ = try self.inner_writer.write(bytes);
- }
- };
-}
lib/std/compress/flate/block_writer.zig
@@ -1,706 +0,0 @@
-const std = @import("std");
-const io = std.io;
-const assert = std.debug.assert;
-
-const hc = @import("huffman_encoder.zig");
-const consts = @import("consts.zig").huffman;
-const Token = @import("Token.zig");
-const BitWriter = @import("bit_writer.zig").BitWriter;
-
-pub fn blockWriter(writer: anytype) BlockWriter(@TypeOf(writer)) {
- return BlockWriter(@TypeOf(writer)).init(writer);
-}
-
-/// Accepts a list of tokens and decides which block type to write, i.e. which
-/// block type will provide the best compression. Writes the header and body of
-/// the block.
-///
-pub fn BlockWriter(comptime WriterType: type) type {
- const BitWriterType = BitWriter(WriterType);
- return struct {
- const codegen_order = consts.codegen_order;
- const end_code_mark = 255;
- const Self = @This();
-
- pub const Error = BitWriterType.Error;
- bit_writer: BitWriterType,
-
- codegen_freq: [consts.codegen_code_count]u16 = undefined,
- literal_freq: [consts.max_num_lit]u16 = undefined,
- distance_freq: [consts.distance_code_count]u16 = undefined,
- codegen: [consts.max_num_lit + consts.distance_code_count + 1]u8 = undefined,
- literal_encoding: hc.LiteralEncoder = .{},
- distance_encoding: hc.DistanceEncoder = .{},
- codegen_encoding: hc.CodegenEncoder = .{},
- fixed_literal_encoding: hc.LiteralEncoder,
- fixed_distance_encoding: hc.DistanceEncoder,
- huff_distance: hc.DistanceEncoder,
-
- pub fn init(writer: WriterType) Self {
- return .{
- .bit_writer = BitWriterType.init(writer),
- .fixed_literal_encoding = hc.fixedLiteralEncoder(),
- .fixed_distance_encoding = hc.fixedDistanceEncoder(),
- .huff_distance = hc.huffmanDistanceEncoder(),
- };
- }
-
- /// Flush internal bit buffer to the writer.
- /// Should be called only when the bit stream is at a byte boundary.
- ///
- /// That is after the final block, when the last byte could be incomplete,
- /// or after a stored block, which is aligned to the byte boundary (it has
- /// padding bits after the first 3 bits).
- pub fn flush(self: *Self) Error!void {
- try self.bit_writer.flush();
- }
-
- pub fn setWriter(self: *Self, new_writer: WriterType) void {
- self.bit_writer.setWriter(new_writer);
- }
-
- fn writeCode(self: *Self, c: hc.HuffCode) Error!void {
- try self.bit_writer.writeBits(c.code, c.len);
- }
-
- // RFC 1951 3.2.7 specifies a special run-length encoding for specifying
- // the literal and distance lengths arrays (which are concatenated into a single
- // array). This method generates that run-length encoding.
- //
- // The result is written into the codegen array, and the frequencies
- // of each code is written into the codegen_freq array.
- // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
- // information. Code end_code_mark is an end marker.
- //
- // num_literals: The number of literals in literal_encoding
- // num_distances: The number of distances in distance_encoding
- // lit_enc: The literal encoder to use
- // dist_enc: The distance encoder to use
- fn generateCodegen(
- self: *Self,
- num_literals: u32,
- num_distances: u32,
- lit_enc: *hc.LiteralEncoder,
- dist_enc: *hc.DistanceEncoder,
- ) void {
- for (self.codegen_freq, 0..) |_, i| {
- self.codegen_freq[i] = 0;
- }
-
- // Note that we are using codegen both as a temporary variable for holding
- // a copy of the frequencies, and as the place where we put the result.
- // This is fine because the output is always shorter than the input used
- // so far.
- var codegen = &self.codegen; // cache
- // Copy the concatenated code sizes to codegen. Put a marker at the end.
- var cgnl = codegen[0..num_literals];
- for (cgnl, 0..) |_, i| {
- cgnl[i] = @as(u8, @intCast(lit_enc.codes[i].len));
- }
-
- cgnl = codegen[num_literals .. num_literals + num_distances];
- for (cgnl, 0..) |_, i| {
- cgnl[i] = @as(u8, @intCast(dist_enc.codes[i].len));
- }
- codegen[num_literals + num_distances] = end_code_mark;
-
- var size = codegen[0];
- var count: i32 = 1;
- var out_index: u32 = 0;
- var in_index: u32 = 1;
- while (size != end_code_mark) : (in_index += 1) {
- // INVARIANT: We have seen "count" copies of size that have not yet
- // had output generated for them.
- const next_size = codegen[in_index];
- if (next_size == size) {
- count += 1;
- continue;
- }
- // We need to generate codegen indicating "count" of size.
- if (size != 0) {
- codegen[out_index] = size;
- out_index += 1;
- self.codegen_freq[size] += 1;
- count -= 1;
- while (count >= 3) {
- var n: i32 = 6;
- if (n > count) {
- n = count;
- }
- codegen[out_index] = 16;
- out_index += 1;
- codegen[out_index] = @as(u8, @intCast(n - 3));
- out_index += 1;
- self.codegen_freq[16] += 1;
- count -= n;
- }
- } else {
- while (count >= 11) {
- var n: i32 = 138;
- if (n > count) {
- n = count;
- }
- codegen[out_index] = 18;
- out_index += 1;
- codegen[out_index] = @as(u8, @intCast(n - 11));
- out_index += 1;
- self.codegen_freq[18] += 1;
- count -= n;
- }
- if (count >= 3) {
- // 3 <= count <= 10
- codegen[out_index] = 17;
- out_index += 1;
- codegen[out_index] = @as(u8, @intCast(count - 3));
- out_index += 1;
- self.codegen_freq[17] += 1;
- count = 0;
- }
- }
- count -= 1;
- while (count >= 0) : (count -= 1) {
- codegen[out_index] = size;
- out_index += 1;
- self.codegen_freq[size] += 1;
- }
- // Set up invariant for next time through the loop.
- size = next_size;
- count = 1;
- }
- // Marker indicating the end of the codegen.
- codegen[out_index] = end_code_mark;
- }
-
- const DynamicSize = struct {
- size: u32,
- num_codegens: u32,
- };
-
- // dynamicSize returns the size of dynamically encoded data in bits.
- fn dynamicSize(
- self: *Self,
- lit_enc: *hc.LiteralEncoder, // literal encoder
- dist_enc: *hc.DistanceEncoder, // distance encoder
- extra_bits: u32,
- ) DynamicSize {
- var num_codegens = self.codegen_freq.len;
- while (num_codegens > 4 and self.codegen_freq[codegen_order[num_codegens - 1]] == 0) {
- num_codegens -= 1;
- }
- const header = 3 + 5 + 5 + 4 + (3 * num_codegens) +
- self.codegen_encoding.bitLength(self.codegen_freq[0..]) +
- self.codegen_freq[16] * 2 +
- self.codegen_freq[17] * 3 +
- self.codegen_freq[18] * 7;
- const size = header +
- lit_enc.bitLength(&self.literal_freq) +
- dist_enc.bitLength(&self.distance_freq) +
- extra_bits;
-
- return DynamicSize{
- .size = @as(u32, @intCast(size)),
- .num_codegens = @as(u32, @intCast(num_codegens)),
- };
- }
-
- // fixedSize returns the size of fixed Huffman encoded data in bits.
- fn fixedSize(self: *Self, extra_bits: u32) u32 {
- return 3 +
- self.fixed_literal_encoding.bitLength(&self.literal_freq) +
- self.fixed_distance_encoding.bitLength(&self.distance_freq) +
- extra_bits;
- }
-
- const StoredSize = struct {
- size: u32,
- storable: bool,
- };
-
- // storedSizeFits calculates the stored size, including header.
- // The function returns the size in bits and whether the data
- // fits inside a single stored block.
- fn storedSizeFits(in: ?[]const u8) StoredSize {
- if (in == null) {
- return .{ .size = 0, .storable = false };
- }
- if (in.?.len <= consts.max_store_block_size) {
- return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true };
- }
- return .{ .size = 0, .storable = false };
- }
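-
- // Worked example: a 100 byte block is storable and costs
- // (100 + 5) * 8 == 840 bits; the 5 header bytes are the 3-bit block
- // header padded to a byte boundary plus LEN and ~LEN as two u16.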
-
- // Write the header of a dynamic Huffman block to the output stream.
- //
- // num_literals: The number of literals specified in codegen
- // num_distances: The number of distances specified in codegen
- // num_codegens: The number of codegens used in codegen
- // eof: Is it the end-of-file? (end of stream)
- fn dynamicHeader(
- self: *Self,
- num_literals: u32,
- num_distances: u32,
- num_codegens: u32,
- eof: bool,
- ) Error!void {
- const first_bits: u32 = if (eof) 5 else 4;
- try self.bit_writer.writeBits(first_bits, 3);
- try self.bit_writer.writeBits(num_literals - 257, 5);
- try self.bit_writer.writeBits(num_distances - 1, 5);
- try self.bit_writer.writeBits(num_codegens - 4, 4);
-
- var i: u32 = 0;
- while (i < num_codegens) : (i += 1) {
- const value = self.codegen_encoding.codes[codegen_order[i]].len;
- try self.bit_writer.writeBits(value, 3);
- }
-
- i = 0;
- while (true) {
- const code_word: u32 = @as(u32, @intCast(self.codegen[i]));
- i += 1;
- if (code_word == end_code_mark) {
- break;
- }
- try self.writeCode(self.codegen_encoding.codes[@as(u32, @intCast(code_word))]);
-
- switch (code_word) {
- 16 => {
- try self.bit_writer.writeBits(self.codegen[i], 2);
- i += 1;
- },
- 17 => {
- try self.bit_writer.writeBits(self.codegen[i], 3);
- i += 1;
- },
- 18 => {
- try self.bit_writer.writeBits(self.codegen[i], 7);
- i += 1;
- },
- else => {},
- }
- }
- }
-
- fn storedHeader(self: *Self, length: usize, eof: bool) Error!void {
- assert(length <= 65535);
- const flag: u32 = if (eof) 1 else 0;
- try self.bit_writer.writeBits(flag, 3);
- try self.flush();
- const l: u16 = @intCast(length);
- try self.bit_writer.writeBits(l, 16);
- try self.bit_writer.writeBits(~l, 16);
- }
-
- fn fixedHeader(self: *Self, eof: bool) Error!void {
- // Indicate that we are a fixed Huffman block
- var value: u32 = 2;
- if (eof) {
- value = 3;
- }
- try self.bit_writer.writeBits(value, 3);
- }
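-
- // The 3-bit block headers above are written LSB first: bit 0 is bfinal,
- // bits 1-2 are the block type. So 2 == 0b010 is a non-final fixed block
- // (btype 01) and 3 == 0b011 a final one; in dynamicHeader, 4 == 0b100 is
- // a non-final dynamic block (btype 10) and 5 == 0b101 a final one.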
-
- // Write a block of tokens with the smallest encoding. Will choose block type.
- // The original input can be supplied, and if the huffman encoded data
- // is larger than the original bytes, the data will be written as a
- // stored block.
- // If the input is null, the tokens will always be Huffman encoded.
- pub fn write(self: *Self, tokens: []const Token, eof: bool, input: ?[]const u8) Error!void {
- const lit_and_dist = self.indexTokens(tokens);
- const num_literals = lit_and_dist.num_literals;
- const num_distances = lit_and_dist.num_distances;
-
- var extra_bits: u32 = 0;
- const ret = storedSizeFits(input);
- const stored_size = ret.size;
- const storable = ret.storable;
-
- if (storable) {
- // We only bother calculating the costs of the extra bits required by
- // the length and distance fields (which will be the same for both fixed
- // and dynamic encoding), if we need to compare those two encodings
- // against stored encoding.
- var length_code: u16 = Token.length_codes_start + 8;
- while (length_code < num_literals) : (length_code += 1) {
- // First eight length codes have extra size = 0.
- extra_bits += @as(u32, @intCast(self.literal_freq[length_code])) *
- @as(u32, @intCast(Token.lengthExtraBits(length_code)));
- }
- var distance_code: u16 = 4;
- while (distance_code < num_distances) : (distance_code += 1) {
- // First four distance codes have extra size = 0.
- extra_bits += @as(u32, @intCast(self.distance_freq[distance_code])) *
- @as(u32, @intCast(Token.distanceExtraBits(distance_code)));
- }
- }
-
- // Figure out smallest code.
- // Fixed Huffman baseline.
- var literal_encoding = &self.fixed_literal_encoding;
- var distance_encoding = &self.fixed_distance_encoding;
- var size = self.fixedSize(extra_bits);
-
- // Dynamic Huffman?
- var num_codegens: u32 = 0;
-
- // Generate codegen and codegenFrequencies, which indicates how to encode
- // the literal_encoding and the distance_encoding.
- self.generateCodegen(
- num_literals,
- num_distances,
- &self.literal_encoding,
- &self.distance_encoding,
- );
- self.codegen_encoding.generate(self.codegen_freq[0..], 7);
- const dynamic_size = self.dynamicSize(
- &self.literal_encoding,
- &self.distance_encoding,
- extra_bits,
- );
- const dyn_size = dynamic_size.size;
- num_codegens = dynamic_size.num_codegens;
-
- if (dyn_size < size) {
- size = dyn_size;
- literal_encoding = &self.literal_encoding;
- distance_encoding = &self.distance_encoding;
- }
-
- // Stored bytes?
- if (storable and stored_size < size) {
- try self.storedBlock(input.?, eof);
- return;
- }
-
- // Huffman.
- if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) {
- try self.fixedHeader(eof);
- } else {
- try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
- }
-
- // Write the tokens.
- try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes);
- }
-
- pub fn storedBlock(self: *Self, input: []const u8, eof: bool) Error!void {
- try self.storedHeader(input.len, eof);
- try self.bit_writer.writeBytes(input);
- }
-
- // dynamicBlock encodes a block using a dynamic Huffman table.
- // This should be used if the symbols used have a disproportionate
- // histogram distribution.
- // If input is supplied and the compression savings are below 1/16th of the
- // input size the block is stored.
- fn dynamicBlock(
- self: *Self,
- tokens: []const Token,
- eof: bool,
- input: ?[]const u8,
- ) Error!void {
- const total_tokens = self.indexTokens(tokens);
- const num_literals = total_tokens.num_literals;
- const num_distances = total_tokens.num_distances;
-
- // Generate codegen and codegenFrequencies, which indicates how to encode
- // the literal_encoding and the distance_encoding.
- self.generateCodegen(
- num_literals,
- num_distances,
- &self.literal_encoding,
- &self.distance_encoding,
- );
- self.codegen_encoding.generate(self.codegen_freq[0..], 7);
- const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.distance_encoding, 0);
- const size = dynamic_size.size;
- const num_codegens = dynamic_size.num_codegens;
-
- // Store bytes, if we don't get a reasonable improvement.
-
- const stored_size = storedSizeFits(input);
- const ssize = stored_size.size;
- const storable = stored_size.storable;
- if (storable and ssize < (size + (size >> 4))) {
- try self.storedBlock(input.?, eof);
- return;
- }
-
- // Write Huffman table.
- try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
-
- // Write the tokens.
- try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes);
- }
-
- const TotalIndexedTokens = struct {
- num_literals: u32,
- num_distances: u32,
- };
-
- // Indexes a slice of tokens followed by an end_block_marker, and updates
- // literal_freq and distance_freq, and generates literal_encoding
- // and distance_encoding.
- // The number of literal and distance tokens is returned.
- fn indexTokens(self: *Self, tokens: []const Token) TotalIndexedTokens {
- var num_literals: u32 = 0;
- var num_distances: u32 = 0;
-
- for (self.literal_freq, 0..) |_, i| {
- self.literal_freq[i] = 0;
- }
- for (self.distance_freq, 0..) |_, i| {
- self.distance_freq[i] = 0;
- }
-
- for (tokens) |t| {
- if (t.kind == Token.Kind.literal) {
- self.literal_freq[t.literal()] += 1;
- continue;
- }
- self.literal_freq[t.lengthCode()] += 1;
- self.distance_freq[t.distanceCode()] += 1;
- }
- // add end_block_marker token at the end
- self.literal_freq[consts.end_block_marker] += 1;
-
- // get the number of literals
- num_literals = @as(u32, @intCast(self.literal_freq.len));
- while (self.literal_freq[num_literals - 1] == 0) {
- num_literals -= 1;
- }
- // get the number of distances
- num_distances = @as(u32, @intCast(self.distance_freq.len));
- while (num_distances > 0 and self.distance_freq[num_distances - 1] == 0) {
- num_distances -= 1;
- }
- if (num_distances == 0) {
- // We haven't found a single match. If we want to go with the dynamic encoding,
- // we should count at least one distance to be sure that the distance huffman tree could be encoded.
- self.distance_freq[0] = 1;
- num_distances = 1;
- }
- self.literal_encoding.generate(&self.literal_freq, 15);
- self.distance_encoding.generate(&self.distance_freq, 15);
- return TotalIndexedTokens{
- .num_literals = num_literals,
- .num_distances = num_distances,
- };
- }
-
- // Writes a slice of tokens to the output followed by an end_block_marker.
- // Codes for literal and distance encoding must be supplied.
- fn writeTokens(
- self: *Self,
- tokens: []const Token,
- le_codes: []hc.HuffCode,
- oe_codes: []hc.HuffCode,
- ) Error!void {
- for (tokens) |t| {
- if (t.kind == Token.Kind.literal) {
- try self.writeCode(le_codes[t.literal()]);
- continue;
- }
-
- // Write the length
- const le = t.lengthEncoding();
- try self.writeCode(le_codes[le.code]);
- if (le.extra_bits > 0) {
- try self.bit_writer.writeBits(le.extra_length, le.extra_bits);
- }
-
- // Write the distance
- const oe = t.distanceEncoding();
- try self.writeCode(oe_codes[oe.code]);
- if (oe.extra_bits > 0) {
- try self.bit_writer.writeBits(oe.extra_distance, oe.extra_bits);
- }
- }
- // add end_block_marker at the end
- try self.writeCode(le_codes[consts.end_block_marker]);
- }
-
- // Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes
- // if the result gains only very little from compression.
- pub fn huffmanBlock(self: *Self, input: []const u8, eof: bool) Error!void {
- // Add everything as literals
- histogram(input, &self.literal_freq);
-
- self.literal_freq[consts.end_block_marker] = 1;
-
- const num_literals = consts.end_block_marker + 1;
- self.distance_freq[0] = 1;
- const num_distances = 1;
-
- self.literal_encoding.generate(&self.literal_freq, 15);
-
- // Figure out smallest code.
- // Always use dynamic Huffman or Store
- var num_codegens: u32 = 0;
-
- // Generate codegen and codegenFrequencies, which indicates how to encode
- // the literal_encoding and the distance_encoding.
- self.generateCodegen(
- num_literals,
- num_distances,
- &self.literal_encoding,
- &self.huff_distance,
- );
- self.codegen_encoding.generate(self.codegen_freq[0..], 7);
- const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.huff_distance, 0);
- const size = dynamic_size.size;
- num_codegens = dynamic_size.num_codegens;
-
- // Store bytes, if we don't get a reasonable improvement.
- const stored_size_ret = storedSizeFits(input);
- const ssize = stored_size_ret.size;
- const storable = stored_size_ret.storable;
-
- if (storable and ssize < (size + (size >> 4))) {
- try self.storedBlock(input, eof);
- return;
- }
-
- // Huffman.
- try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
- const encoding = self.literal_encoding.codes[0..257];
-
- for (input) |t| {
- const c = encoding[t];
- try self.bit_writer.writeBits(c.code, c.len);
- }
- try self.writeCode(encoding[consts.end_block_marker]);
- }
-
- // histogram accumulates a histogram of b in h.
- fn histogram(b: []const u8, h: *[286]u16) void {
- // Clear histogram
- for (h, 0..) |_, i| {
- h[i] = 0;
- }
-
- var lh = h.*[0..256];
- for (b) |t| {
- lh[t] += 1;
- }
- }
- };
-}
-
-// tests
-const expect = std.testing.expect;
-const fmt = std.fmt;
-const testing = std.testing;
-const ArrayList = std.ArrayList;
-
-const TestCase = @import("testdata/block_writer.zig").TestCase;
-const testCases = @import("testdata/block_writer.zig").testCases;
-
-// tests if the writeBlock encoding has changed.
-test "write" {
- inline for (0..testCases.len) |i| {
- try testBlock(testCases[i], .write_block);
- }
-}
-
-// tests if the writeBlockDynamic encoding has changed.
-test "dynamicBlock" {
- inline for (0..testCases.len) |i| {
- try testBlock(testCases[i], .write_dyn_block);
- }
-}
-
-test "huffmanBlock" {
- inline for (0..testCases.len) |i| {
- try testBlock(testCases[i], .write_huffman_block);
- }
- try testBlock(.{
- .tokens = &[_]Token{},
- .input = "huffman-rand-max.input",
- .want = "huffman-rand-max.{s}.expect",
- }, .write_huffman_block);
-}
-
-const TestFn = enum {
- write_block,
- write_dyn_block, // write dynamic block
- write_huffman_block,
-
- fn to_s(self: TestFn) []const u8 {
- return switch (self) {
- .write_block => "wb",
- .write_dyn_block => "dyn",
- .write_huffman_block => "huff",
- };
- }
-
- fn write(
- comptime self: TestFn,
- bw: anytype,
- tok: []const Token,
- input: ?[]const u8,
- final: bool,
- ) !void {
- switch (self) {
- .write_block => try bw.write(tok, final, input),
- .write_dyn_block => try bw.dynamicBlock(tok, final, input),
- .write_huffman_block => try bw.huffmanBlock(input.?, final),
- }
- try bw.flush();
- }
-};
-
-// testBlock tests a block against its references
-//
-// size
-// 64K [file-name].input - input non compressed file
-// 8.1K [file-name].golden -
-// 78 [file-name].dyn.expect - output with writeBlockDynamic
-// 78 [file-name].wb.expect - output with writeBlock
-// 8.1K [file-name].huff.expect - output with writeBlockHuff
-// 78 [file-name].dyn.expect-noinput - output with writeBlockDynamic when input is null
-// 78 [file-name].wb.expect-noinput - output with writeBlock when input is null
-//
-// wb - writeBlock
-// dyn - writeBlockDynamic
-// huff - writeBlockHuff
-//
-fn testBlock(comptime tc: TestCase, comptime tfn: TestFn) !void {
- if (tc.input.len != 0 and tc.want.len != 0) {
- const want_name = comptime fmt.comptimePrint(tc.want, .{tfn.to_s()});
- const input = @embedFile("testdata/block_writer/" ++ tc.input);
- const want = @embedFile("testdata/block_writer/" ++ want_name);
- try testWriteBlock(tfn, input, want, tc.tokens);
- }
-
- if (tfn == .write_huffman_block) {
- return;
- }
-
- const want_name_no_input = comptime fmt.comptimePrint(tc.want_no_input, .{tfn.to_s()});
- const want = @embedFile("testdata/block_writer/" ++ want_name_no_input);
- try testWriteBlock(tfn, null, want, tc.tokens);
-}
-
-// Uses writer function `tfn` to write `tokens`, tests that we got `want` as output.
-fn testWriteBlock(comptime tfn: TestFn, input: ?[]const u8, want: []const u8, tokens: []const Token) !void {
- var buf = ArrayList(u8).init(testing.allocator);
- var bw = blockWriter(buf.writer());
- try tfn.write(&bw, tokens, input, false);
- var got = buf.items;
- try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
- try expect(got[0] & 0b0000_0001 == 0); // bfinal is not set
- //
- // Test if the writer produces the same output after reset.
- buf.deinit();
- buf = ArrayList(u8).init(testing.allocator);
- defer buf.deinit();
- bw.setWriter(buf.writer());
-
- try tfn.write(&bw, tokens, input, true);
- try bw.flush();
- got = buf.items;
-
- try expect(got[0] & 1 == 1); // bfinal is set
- buf.items[0] &= 0b1111_1110; // clear the bfinal bit so we can compare the slices
- try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
-}
lib/std/compress/flate/BlockWriter.zig
@@ -0,0 +1,696 @@
+//! Accepts a list of tokens and decides which block type to write, i.e. which
+//! block type will provide the best compression. Writes the header and body of
+//! the block.
+const std = @import("std");
+const io = std.io;
+const assert = std.debug.assert;
+const Writer = std.io.Writer;
+
+const BlockWriter = @This();
+const flate = @import("../flate.zig");
+const Compress = flate.Compress;
+const huffman = flate.huffman;
+const Token = @import("Token.zig");
+
+const codegen_order = huffman.codegen_order;
+const end_code_mark = 255;
+
+output: *Writer,
+
+codegen_freq: [huffman.codegen_code_count]u16 = undefined,
+literal_freq: [huffman.max_num_lit]u16 = undefined,
+distance_freq: [huffman.distance_code_count]u16 = undefined,
+codegen: [huffman.max_num_lit + huffman.distance_code_count + 1]u8 = undefined,
+literal_encoding: Compress.LiteralEncoder = .{},
+distance_encoding: Compress.DistanceEncoder = .{},
+codegen_encoding: Compress.CodegenEncoder = .{},
+fixed_literal_encoding: Compress.LiteralEncoder,
+fixed_distance_encoding: Compress.DistanceEncoder,
+huff_distance: Compress.DistanceEncoder,
+
+pub fn init(output: *Writer) BlockWriter {
+ return .{
+ .output = output,
+ .fixed_literal_encoding = Compress.fixedLiteralEncoder(),
+ .fixed_distance_encoding = Compress.fixedDistanceEncoder(),
+ .huff_distance = Compress.huffmanDistanceEncoder(),
+ };
+}
+
+/// Flush internal bit buffer to the writer.
+/// Should be called only when the bit stream is at a byte boundary.
+///
+/// That is after the final block, when the last byte could be incomplete,
+/// or after a stored block, which is aligned to the byte boundary (it has
+/// padding bits after the first 3 bits).
+pub fn flush(self: *BlockWriter) Writer.Error!void {
+ try self.bit_writer.flush();
+}
+
+pub fn setWriter(self: *BlockWriter, new_writer: *Writer) void {
+ self.bit_writer.setWriter(new_writer);
+}
+
+fn writeCode(self: *BlockWriter, c: Compress.HuffCode) Writer.Error!void {
+ try self.bit_writer.writeBits(c.code, c.len);
+}
+
+// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+// the literal and distance lengths arrays (which are concatenated into a single
+// array). This method generates that run-length encoding.
+//
+// The result is written into the codegen array, and the frequencies
+// of each code is written into the codegen_freq array.
+// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+// information. Code end_code_mark is an end marker.
+//
+// num_literals: The number of literals in literal_encoding
+// num_distances: The number of distances in distance_encoding
+// lit_enc: The literal encoder to use
+// dist_enc: The distance encoder to use
+fn generateCodegen(
+ self: *BlockWriter,
+ num_literals: u32,
+ num_distances: u32,
+ lit_enc: *Compress.LiteralEncoder,
+ dist_enc: *Compress.DistanceEncoder,
+) void {
+ for (self.codegen_freq, 0..) |_, i| {
+ self.codegen_freq[i] = 0;
+ }
+
+ // Note that we are using codegen both as a temporary variable for holding
+ // a copy of the frequencies, and as the place where we put the result.
+ // This is fine because the output is always shorter than the input used
+ // so far.
+ var codegen = &self.codegen; // cache
+ // Copy the concatenated code sizes to codegen. Put a marker at the end.
+ var cgnl = codegen[0..num_literals];
+ for (cgnl, 0..) |_, i| {
+ cgnl[i] = @as(u8, @intCast(lit_enc.codes[i].len));
+ }
+
+ cgnl = codegen[num_literals .. num_literals + num_distances];
+ for (cgnl, 0..) |_, i| {
+ cgnl[i] = @as(u8, @intCast(dist_enc.codes[i].len));
+ }
+ codegen[num_literals + num_distances] = end_code_mark;
+
+ var size = codegen[0];
+ var count: i32 = 1;
+ var out_index: u32 = 0;
+ var in_index: u32 = 1;
+ while (size != end_code_mark) : (in_index += 1) {
+ // INVARIANT: We have seen "count" copies of size that have not yet
+ // had output generated for them.
+ const next_size = codegen[in_index];
+ if (next_size == size) {
+ count += 1;
+ continue;
+ }
+ // We need to generate codegen indicating "count" of size.
+ if (size != 0) {
+ codegen[out_index] = size;
+ out_index += 1;
+ self.codegen_freq[size] += 1;
+ count -= 1;
+ while (count >= 3) {
+ var n: i32 = 6;
+ if (n > count) {
+ n = count;
+ }
+ codegen[out_index] = 16;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(n - 3));
+ out_index += 1;
+ self.codegen_freq[16] += 1;
+ count -= n;
+ }
+ } else {
+ while (count >= 11) {
+ var n: i32 = 138;
+ if (n > count) {
+ n = count;
+ }
+ codegen[out_index] = 18;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(n - 11));
+ out_index += 1;
+ self.codegen_freq[18] += 1;
+ count -= n;
+ }
+ if (count >= 3) {
+ // 3 <= count <= 10
+ codegen[out_index] = 17;
+ out_index += 1;
+ codegen[out_index] = @as(u8, @intCast(count - 3));
+ out_index += 1;
+ self.codegen_freq[17] += 1;
+ count = 0;
+ }
+ }
+ count -= 1;
+ while (count >= 0) : (count -= 1) {
+ codegen[out_index] = size;
+ out_index += 1;
+ self.codegen_freq[size] += 1;
+ }
+ // Set up invariant for next time through the loop.
+ size = next_size;
+ count = 1;
+ }
+ // Marker indicating the end of the codegen.
+ codegen[out_index] = end_code_mark;
+}
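+
+// Worked example of the run-length codes above: ten code lengths of 7 are
+// emitted as a literal 7, then code 16 (repeat previous) with extra value
+// 6 - 3 == 3, then code 16 with extra value 3 - 3 == 0, covering
+// 1 + 6 + 3 == 10 lengths; a run of eight zeros becomes code 17 with
+// extra value 8 - 3 == 5.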
+
+const DynamicSize = struct {
+ size: u32,
+ num_codegens: u32,
+};
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+fn dynamicSize(
+ self: *BlockWriter,
+ lit_enc: *Compress.LiteralEncoder, // literal encoder
+ dist_enc: *Compress.DistanceEncoder, // distance encoder
+ extra_bits: u32,
+) DynamicSize {
+ var num_codegens = self.codegen_freq.len;
+ while (num_codegens > 4 and self.codegen_freq[codegen_order[num_codegens - 1]] == 0) {
+ num_codegens -= 1;
+ }
+ const header = 3 + 5 + 5 + 4 + (3 * num_codegens) +
+ self.codegen_encoding.bitLength(self.codegen_freq[0..]) +
+ self.codegen_freq[16] * 2 +
+ self.codegen_freq[17] * 3 +
+ self.codegen_freq[18] * 7;
+ const size = header +
+ lit_enc.bitLength(&self.literal_freq) +
+ dist_enc.bitLength(&self.distance_freq) +
+ extra_bits;
+
+ return DynamicSize{
+ .size = @as(u32, @intCast(size)),
+ .num_codegens = @as(u32, @intCast(num_codegens)),
+ };
+}
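+
+// Worked example of the fixed header cost above: 3 (block header) +
+// 5 (HLIT) + 5 (HDIST) + 4 (HCLEN) == 17 bits, plus 3 bits per
+// transmitted code-length code; with num_codegens == 18 that is
+// 17 + 54 == 71 bits before the run-length coded lengths themselves.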
+
+// fixedSize returns the size of fixed Huffman encoded data in bits.
+fn fixedSize(self: *BlockWriter, extra_bits: u32) u32 {
+ return 3 +
+ self.fixed_literal_encoding.bitLength(&self.literal_freq) +
+ self.fixed_distance_encoding.bitLength(&self.distance_freq) +
+ extra_bits;
+}
+
+const StoredSize = struct {
+ size: u32,
+ storable: bool,
+};
+
+// storedSizeFits calculates the stored size, including header.
+// The function returns the size in bits and whether the data
+// fits inside a single stored block.
+fn storedSizeFits(in: ?[]const u8) StoredSize {
+ if (in == null) {
+ return .{ .size = 0, .storable = false };
+ }
+ if (in.?.len <= huffman.max_store_block_size) {
+ return .{ .size = @as(u32, @intCast((in.?.len + 5) * 8)), .storable = true };
+ }
+ return .{ .size = 0, .storable = false };
+}
+
+// Write the header of a dynamic Huffman block to the output stream.
+//
+// num_literals: The number of literals specified in codegen
+// num_distances: The number of distances specified in codegen
+// num_codegens: The number of codegens used in codegen
+// eof: Is it the end-of-file? (end of stream)
+fn dynamicHeader(
+ self: *BlockWriter,
+ num_literals: u32,
+ num_distances: u32,
+ num_codegens: u32,
+ eof: bool,
+) Writer.Error!void {
+ const first_bits: u32 = if (eof) 5 else 4;
+ try self.bit_writer.writeBits(first_bits, 3);
+ try self.bit_writer.writeBits(num_literals - 257, 5);
+ try self.bit_writer.writeBits(num_distances - 1, 5);
+ try self.bit_writer.writeBits(num_codegens - 4, 4);
+
+ var i: u32 = 0;
+ while (i < num_codegens) : (i += 1) {
+ const value = self.codegen_encoding.codes[codegen_order[i]].len;
+ try self.bit_writer.writeBits(value, 3);
+ }
+
+ i = 0;
+ while (true) {
+ const code_word: u32 = @as(u32, @intCast(self.codegen[i]));
+ i += 1;
+ if (code_word == end_code_mark) {
+ break;
+ }
+ try self.writeCode(self.codegen_encoding.codes[@as(u32, @intCast(code_word))]);
+
+ switch (code_word) {
+ 16 => {
+ try self.bit_writer.writeBits(self.codegen[i], 2);
+ i += 1;
+ },
+ 17 => {
+ try self.bit_writer.writeBits(self.codegen[i], 3);
+ i += 1;
+ },
+ 18 => {
+ try self.bit_writer.writeBits(self.codegen[i], 7);
+ i += 1;
+ },
+ else => {},
+ }
+ }
+}
+
+fn storedHeader(self: *BlockWriter, length: usize, eof: bool) Writer.Error!void {
+ assert(length <= 65535);
+ const flag: u32 = if (eof) 1 else 0;
+ try self.bit_writer.writeBits(flag, 3);
+ try self.flush();
+ const l: u16 = @intCast(length);
+ try self.bit_writer.writeBits(l, 16);
+ try self.bit_writer.writeBits(~l, 16);
+}
+
+fn fixedHeader(self: *BlockWriter, eof: bool) Writer.Error!void {
+ // Indicate that we are a fixed Huffman block
+ var value: u32 = 2;
+ if (eof) {
+ value = 3;
+ }
+ try self.bit_writer.writeBits(value, 3);
+}
+
+// Write a block of tokens with the smallest encoding. Will choose block type.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is null, the tokens will always be Huffman encoded.
+pub fn write(self: *BlockWriter, tokens: []const Token, eof: bool, input: ?[]const u8) Writer.Error!void {
+ const lit_and_dist = self.indexTokens(tokens);
+ const num_literals = lit_and_dist.num_literals;
+ const num_distances = lit_and_dist.num_distances;
+
+ var extra_bits: u32 = 0;
+ const ret = storedSizeFits(input);
+ const stored_size = ret.size;
+ const storable = ret.storable;
+
+ if (storable) {
+ // We only bother calculating the costs of the extra bits required by
+ // the length and distance fields (which will be the same for both fixed
+ // and dynamic encoding), if we need to compare those two encodings
+ // against stored encoding.
+ var length_code: u16 = Token.length_codes_start + 8;
+ while (length_code < num_literals) : (length_code += 1) {
+ // First eight length codes have extra size = 0.
+ extra_bits += @as(u32, @intCast(self.literal_freq[length_code])) *
+ @as(u32, @intCast(Token.lengthExtraBits(length_code)));
+ }
+ var distance_code: u16 = 4;
+ while (distance_code < num_distances) : (distance_code += 1) {
+ // First four distance codes have extra size = 0.
+ extra_bits += @as(u32, @intCast(self.distance_freq[distance_code])) *
+ @as(u32, @intCast(Token.distanceExtraBits(distance_code)));
+ }
+ }
+
+ // Figure out smallest code.
+ // Fixed Huffman baseline.
+ var literal_encoding = &self.fixed_literal_encoding;
+ var distance_encoding = &self.fixed_distance_encoding;
+ var size = self.fixedSize(extra_bits);
+
+ // Dynamic Huffman?
+ var num_codegens: u32 = 0;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.distance_encoding,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(
+ &self.literal_encoding,
+ &self.distance_encoding,
+ extra_bits,
+ );
+ const dyn_size = dynamic_size.size;
+ num_codegens = dynamic_size.num_codegens;
+
+ if (dyn_size < size) {
+ size = dyn_size;
+ literal_encoding = &self.literal_encoding;
+ distance_encoding = &self.distance_encoding;
+ }
+
+ // Stored bytes?
+ if (storable and stored_size < size) {
+ try self.storedBlock(input.?, eof);
+ return;
+ }
+
+ // Huffman.
+ if (@intFromPtr(literal_encoding) == @intFromPtr(&self.fixed_literal_encoding)) {
+ try self.fixedHeader(eof);
+ } else {
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+ }
+
+ // Write the tokens.
+ try self.writeTokens(tokens, &literal_encoding.codes, &distance_encoding.codes);
+}
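+
+// Hypothetical usage sketch (`w: *Writer`, `tokens` and `input` assumed):
+//
+// var bw: BlockWriter = .init(w);
+// try bw.write(tokens, true, input); // picks stored/fixed/dynamic
+// try bw.flush(); // final block ends on a byte boundary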
+
+pub fn storedBlock(self: *BlockWriter, input: []const u8, eof: bool) Writer.Error!void {
+ try self.storedHeader(input.len, eof);
+ try self.bit_writer.writeBytes(input);
+}
+
+// dynamicBlock encodes a block using a dynamic Huffman table.
+// This should be used if the symbols used have a disproportionate
+// histogram distribution.
+// If input is supplied and the compression savings are below 1/16th of the
+// input size the block is stored.
+fn dynamicBlock(
+ self: *BlockWriter,
+ tokens: []const Token,
+ eof: bool,
+ input: ?[]const u8,
+) Writer.Error!void {
+ const total_tokens = self.indexTokens(tokens);
+ const num_literals = total_tokens.num_literals;
+ const num_distances = total_tokens.num_distances;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.distance_encoding,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.distance_encoding, 0);
+ const size = dynamic_size.size;
+ const num_codegens = dynamic_size.num_codegens;
+
+ // Store bytes, if we don't get a reasonable improvement.
+
+ const stored_size = storedSizeFits(input);
+ const ssize = stored_size.size;
+ const storable = stored_size.storable;
+ if (storable and ssize < (size + (size >> 4))) {
+ try self.storedBlock(input.?, eof);
+ return;
+ }
+
+ // Write Huffman table.
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+
+ // Write the tokens.
+ try self.writeTokens(tokens, &self.literal_encoding.codes, &self.distance_encoding.codes);
+}
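+
+// The `ssize < size + (size >> 4)` test above prefers the stored form
+// unless the dynamic encoding is at least about 6% smaller: e.g. with
+// size == 8000 bits the block is stored whenever ssize < 8500 bits.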
+
+const TotalIndexedTokens = struct {
+ num_literals: u32,
+ num_distances: u32,
+};
+
+// Indexes a slice of tokens followed by an end_block_marker, and updates
+// literal_freq and distance_freq, and generates literal_encoding
+// and distance_encoding.
+// The number of literal and distance tokens is returned.
+fn indexTokens(self: *BlockWriter, tokens: []const Token) TotalIndexedTokens {
+ var num_literals: u32 = 0;
+ var num_distances: u32 = 0;
+
+ for (self.literal_freq, 0..) |_, i| {
+ self.literal_freq[i] = 0;
+ }
+ for (self.distance_freq, 0..) |_, i| {
+ self.distance_freq[i] = 0;
+ }
+
+ for (tokens) |t| {
+ if (t.kind == Token.Kind.literal) {
+ self.literal_freq[t.literal()] += 1;
+ continue;
+ }
+ self.literal_freq[t.lengthCode()] += 1;
+ self.distance_freq[t.distanceCode()] += 1;
+ }
+ // add end_block_marker token at the end
+ self.literal_freq[huffman.end_block_marker] += 1;
+
+ // get the number of literals
+ num_literals = @as(u32, @intCast(self.literal_freq.len));
+ while (self.literal_freq[num_literals - 1] == 0) {
+ num_literals -= 1;
+ }
+ // get the number of distances
+ num_distances = @as(u32, @intCast(self.distance_freq.len));
+ while (num_distances > 0 and self.distance_freq[num_distances - 1] == 0) {
+ num_distances -= 1;
+ }
+ if (num_distances == 0) {
+ // We haven't found a single match. If we want to go with the dynamic encoding,
+ // we should count at least one distance to be sure that the distance huffman tree could be encoded.
+ self.distance_freq[0] = 1;
+ num_distances = 1;
+ }
+ self.literal_encoding.generate(&self.literal_freq, 15);
+ self.distance_encoding.generate(&self.distance_freq, 15);
+ return TotalIndexedTokens{
+ .num_literals = num_literals,
+ .num_distances = num_distances,
+ };
+}
+
+// Writes a slice of tokens to the output followed by an end_block_marker.
+// Codes for literal and distance encoding must be supplied.
+fn writeTokens(
+ self: *BlockWriter,
+ tokens: []const Token,
+ le_codes: []Compress.HuffCode,
+ oe_codes: []Compress.HuffCode,
+) Writer.Error!void {
+ for (tokens) |t| {
+ if (t.kind == Token.Kind.literal) {
+ try self.writeCode(le_codes[t.literal()]);
+ continue;
+ }
+
+ // Write the length
+ const le = t.lengthEncoding();
+ try self.writeCode(le_codes[le.code]);
+ if (le.extra_bits > 0) {
+ try self.bit_writer.writeBits(le.extra_length, le.extra_bits);
+ }
+
+ // Write the distance
+ const oe = t.distanceEncoding();
+ try self.writeCode(oe_codes[oe.code]);
+ if (oe.extra_bits > 0) {
+ try self.bit_writer.writeBits(oe.extra_distance, oe.extra_bits);
+ }
+ }
+ // add end_block_marker at the end
+ try self.writeCode(le_codes[huffman.end_block_marker]);
+}
+
+// Encodes a block of bytes as either Huffman encoded literals or uncompressed bytes
+// if the result gains only very little from compression.
+pub fn huffmanBlock(self: *BlockWriter, input: []const u8, eof: bool) Writer.Error!void {
+ // Add everything as literals
+ histogram(input, &self.literal_freq);
+
+ self.literal_freq[huffman.end_block_marker] = 1;
+
+ const num_literals = huffman.end_block_marker + 1;
+ self.distance_freq[0] = 1;
+ const num_distances = 1;
+
+ self.literal_encoding.generate(&self.literal_freq, 15);
+
+ // Figure out smallest code.
+ // Always use dynamic Huffman or Store
+ var num_codegens: u32 = 0;
+
+ // Generate codegen and codegenFrequencies, which indicates how to encode
+ // the literal_encoding and the distance_encoding.
+ self.generateCodegen(
+ num_literals,
+ num_distances,
+ &self.literal_encoding,
+ &self.huff_distance,
+ );
+ self.codegen_encoding.generate(self.codegen_freq[0..], 7);
+ const dynamic_size = self.dynamicSize(&self.literal_encoding, &self.huff_distance, 0);
+ const size = dynamic_size.size;
+ num_codegens = dynamic_size.num_codegens;
+
+ // Store bytes, if we don't get a reasonable improvement.
+ const stored_size_ret = storedSizeFits(input);
+ const ssize = stored_size_ret.size;
+ const storable = stored_size_ret.storable;
+
+ if (storable and ssize < (size + (size >> 4))) {
+ try self.storedBlock(input, eof);
+ return;
+ }
+
+ // Huffman.
+ try self.dynamicHeader(num_literals, num_distances, num_codegens, eof);
+ const encoding = self.literal_encoding.codes[0..257];
+
+ for (input) |t| {
+ const c = encoding[t];
+ try self.bit_writer.writeBits(c.code, c.len);
+ }
+ try self.writeCode(encoding[huffman.end_block_marker]);
+}
+
+// histogram accumulates a histogram of b in h.
+fn histogram(b: []const u8, h: *[286]u16) void {
+    // Clear histogram
+    @memset(h, 0);
+
+    // Literal bytes are 0-255; count them into the low part of the table.
+    const lh = h[0..256];
+    for (b) |t| {
+        lh[t] += 1;
+    }
+}
+
+// tests
+const expect = std.testing.expect;
+const fmt = std.fmt;
+const testing = std.testing;
+const ArrayList = std.ArrayList;
+
+const TestCase = @import("testdata/block_writer.zig").TestCase;
+const testCases = @import("testdata/block_writer.zig").testCases;
+
+// tests if the writeBlock encoding has changed.
+test "write" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_block);
+ }
+}
+
+// tests if the writeBlockDynamic encoding has changed.
+test "dynamicBlock" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_dyn_block);
+ }
+}
+
+test "huffmanBlock" {
+ inline for (0..testCases.len) |i| {
+ try testBlock(testCases[i], .write_huffman_block);
+ }
+ try testBlock(.{
+ .tokens = &[_]Token{},
+ .input = "huffman-rand-max.input",
+ .want = "huffman-rand-max.{s}.expect",
+ }, .write_huffman_block);
+}
+
+const TestFn = enum {
+ write_block,
+ write_dyn_block, // write dynamic block
+ write_huffman_block,
+
+ fn to_s(self: TestFn) []const u8 {
+ return switch (self) {
+ .write_block => "wb",
+ .write_dyn_block => "dyn",
+ .write_huffman_block => "huff",
+ };
+ }
+
+ fn write(
+ comptime self: TestFn,
+ bw: anytype,
+ tok: []const Token,
+ input: ?[]const u8,
+ final: bool,
+ ) !void {
+ switch (self) {
+ .write_block => try bw.write(tok, final, input),
+ .write_dyn_block => try bw.dynamicBlock(tok, final, input),
+ .write_huffman_block => try bw.huffmanBlock(input.?, final),
+ }
+ try bw.flush();
+ }
+};
+
+// testBlock tests a block against its references
+//
+// size
+// 64K [file-name].input - input non compressed file
+// 8.1K [file-name].golden -
+// 78 [file-name].dyn.expect - output with writeBlockDynamic
+// 78 [file-name].wb.expect - output with writeBlock
+// 8.1K [file-name].huff.expect - output with writeBlockHuff
+// 78 [file-name].dyn.expect-noinput - output with writeBlockDynamic when input is null
+// 78 [file-name].wb.expect-noinput - output with writeBlock when input is null
+//
+// wb - writeBlock
+// dyn - writeBlockDynamic
+// huff - writeBlockHuff
+//
+fn testBlock(comptime tc: TestCase, comptime tfn: TestFn) !void {
+ if (tc.input.len != 0 and tc.want.len != 0) {
+ const want_name = comptime fmt.comptimePrint(tc.want, .{tfn.to_s()});
+ const input = @embedFile("testdata/block_writer/" ++ tc.input);
+ const want = @embedFile("testdata/block_writer/" ++ want_name);
+ try testWriteBlock(tfn, input, want, tc.tokens);
+ }
+
+ if (tfn == .write_huffman_block) {
+ return;
+ }
+
+ const want_name_no_input = comptime fmt.comptimePrint(tc.want_no_input, .{tfn.to_s()});
+ const want = @embedFile("testdata/block_writer/" ++ want_name_no_input);
+ try testWriteBlock(tfn, null, want, tc.tokens);
+}
+
+// Uses writer function `tfn` to write `tokens`, tests that we got `want` as output.
+fn testWriteBlock(comptime tfn: TestFn, input: ?[]const u8, want: []const u8, tokens: []const Token) !void {
+ var buf = ArrayList(u8).init(testing.allocator);
+ var bw: BlockWriter = .init(buf.writer());
+ try tfn.write(&bw, tokens, input, false);
+ var got = buf.items;
+ try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
+ try expect(got[0] & 0b0000_0001 == 0); // bfinal is not set
+
+    // Test if the writer produces the same output after reset.
+ buf.deinit();
+ buf = ArrayList(u8).init(testing.allocator);
+ defer buf.deinit();
+ bw.setWriter(buf.writer());
+
+ try tfn.write(&bw, tokens, input, true);
+ try bw.flush();
+ got = buf.items;
+
+ try expect(got[0] & 1 == 1); // bfinal is set
+    buf.items[0] &= 0b1111_1110; // clear the bfinal bit, so we can compare the slices
+ try testing.expectEqualSlices(u8, want, got); // expect writeBlock to yield expected result
+}
lib/std/compress/flate/CircularBuffer.zig
@@ -1,240 +0,0 @@
-//! 64K buffer of uncompressed data created in inflate (decompression). Has enough
-//! history to support writing match<length, distance>; copying length of bytes
-//! from the position distance backward from current.
-//!
-//! Reads can return less than available bytes if they are spread across
-//! different circles. So reads should repeat until get required number of bytes
-//! or until returned slice is zero length.
-//!
-//! Note on deflate limits:
-//! * non-compressible block is limited to 65,535 bytes.
-//! * backward pointer is limited in distance to 32K bytes and in length to 258 bytes.
-//!
-//! Whole non-compressed block can be written without overlap. We always have
-//! history of up to 64K, more then 32K needed.
-//!
-const std = @import("std");
-const assert = std.debug.assert;
-const testing = std.testing;
-
-const consts = @import("consts.zig").match;
-
-const mask = 0xffff; // 64K - 1
-const buffer_len = mask + 1; // 64K buffer
-
-const Self = @This();
-
-buffer: [buffer_len]u8 = undefined,
-wp: usize = 0, // write position
-rp: usize = 0, // read position
-
-fn writeAll(self: *Self, buf: []const u8) void {
- for (buf) |c| self.write(c);
-}
-
-/// Write literal.
-pub fn write(self: *Self, b: u8) void {
- assert(self.wp - self.rp < mask);
- self.buffer[self.wp & mask] = b;
- self.wp += 1;
-}
-
-/// Write match (back-reference to the same data slice) starting at `distance`
-/// back from current write position, and `length` of bytes.
-pub fn writeMatch(self: *Self, length: u16, distance: u16) !void {
- if (self.wp < distance or
- length < consts.base_length or length > consts.max_length or
- distance < consts.min_distance or distance > consts.max_distance)
- {
- return error.InvalidMatch;
- }
- assert(self.wp - self.rp < mask);
-
- var from: usize = self.wp - distance & mask;
- const from_end: usize = from + length;
- var to: usize = self.wp & mask;
- const to_end: usize = to + length;
-
- self.wp += length;
-
- // Fast path using memcpy
- if (from_end < buffer_len and to_end < buffer_len) // start and end at the same circle
- {
- var cur_len = distance;
- var remaining_len = length;
- while (cur_len < remaining_len) {
- @memcpy(self.buffer[to..][0..cur_len], self.buffer[from..][0..cur_len]);
- to += cur_len;
- remaining_len -= cur_len;
- cur_len = cur_len * 2;
- }
- @memcpy(self.buffer[to..][0..remaining_len], self.buffer[from..][0..remaining_len]);
- return;
- }
-
- // Slow byte by byte
- while (to < to_end) {
- self.buffer[to & mask] = self.buffer[from & mask];
- to += 1;
- from += 1;
- }
-}
-
-/// Returns writable part of the internal buffer of size `n` at most. Advances
-/// write pointer, assumes that returned buffer will be filled with data.
-pub fn getWritable(self: *Self, n: usize) []u8 {
- const wp = self.wp & mask;
- const len = @min(n, buffer_len - wp);
- self.wp += len;
- return self.buffer[wp .. wp + len];
-}
-
-/// Read available data. Can return part of the available data if it is
-/// spread across two circles. So read until this returns zero length.
-pub fn read(self: *Self) []const u8 {
- return self.readAtMost(buffer_len);
-}
-
-/// Read part of available data. Can return less than max even if there are
-/// more than max decoded data.
-pub fn readAtMost(self: *Self, limit: usize) []const u8 {
- const rb = self.readBlock(if (limit == 0) buffer_len else limit);
- defer self.rp += rb.len;
- return self.buffer[rb.head..rb.tail];
-}
-
-const ReadBlock = struct {
- head: usize,
- tail: usize,
- len: usize,
-};
-
-/// Returns position of continuous read block data.
-fn readBlock(self: *Self, max: usize) ReadBlock {
- const r = self.rp & mask;
- const w = self.wp & mask;
- const n = @min(
- max,
- if (w >= r) w - r else buffer_len - r,
- );
- return .{
- .head = r,
- .tail = r + n,
- .len = n,
- };
-}
-
-/// Number of free bytes for write.
-pub fn free(self: *Self) usize {
- return buffer_len - (self.wp - self.rp);
-}
-
-/// Full if largest match can't fit. 258 is largest match length. That much
-/// bytes can be produced in single decode step.
-pub fn full(self: *Self) bool {
- return self.free() < 258 + 1;
-}
-
-// example from: https://youtu.be/SJPvNi4HrWQ?t=3558
-test writeMatch {
- var cb: Self = .{};
-
- cb.writeAll("a salad; ");
- try cb.writeMatch(5, 9);
- try cb.writeMatch(3, 3);
-
- try testing.expectEqualStrings("a salad; a salsal", cb.read());
-}
-
-test "writeMatch overlap" {
- var cb: Self = .{};
-
- cb.writeAll("a b c ");
- try cb.writeMatch(8, 4);
- cb.write('d');
-
- try testing.expectEqualStrings("a b c b c b c d", cb.read());
-}
-
-test readAtMost {
- var cb: Self = .{};
-
- cb.writeAll("0123456789");
- try cb.writeMatch(50, 10);
-
- try testing.expectEqualStrings("0123456789" ** 6, cb.buffer[cb.rp..cb.wp]);
- for (0..6) |i| {
- try testing.expectEqual(i * 10, cb.rp);
- try testing.expectEqualStrings("0123456789", cb.readAtMost(10));
- }
- try testing.expectEqualStrings("", cb.readAtMost(10));
- try testing.expectEqualStrings("", cb.read());
-}
-
-test Self {
- var cb: Self = .{};
-
- const data = "0123456789abcdef" ** (1024 / 16);
- cb.writeAll(data);
- try testing.expectEqual(@as(usize, 0), cb.rp);
- try testing.expectEqual(@as(usize, 1024), cb.wp);
- try testing.expectEqual(@as(usize, 1024 * 63), cb.free());
-
- for (0..62 * 4) |_|
- try cb.writeMatch(256, 1024); // write 62K
-
- try testing.expectEqual(@as(usize, 0), cb.rp);
- try testing.expectEqual(@as(usize, 63 * 1024), cb.wp);
- try testing.expectEqual(@as(usize, 1024), cb.free());
-
- cb.writeAll(data[0..200]);
- _ = cb.readAtMost(1024); // make some space
- cb.writeAll(data); // overflows write position
- try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
- try testing.expectEqual(@as(usize, 1024), cb.rp);
- try testing.expectEqual(@as(usize, 1024 - 200), cb.free());
-
- const rb = cb.readBlock(Self.buffer_len);
- try testing.expectEqual(@as(usize, 65536 - 1024), rb.len);
- try testing.expectEqual(@as(usize, 1024), rb.head);
- try testing.expectEqual(@as(usize, 65536), rb.tail);
-
- try testing.expectEqual(@as(usize, 65536 - 1024), cb.read().len); // read to the end of the buffer
- try testing.expectEqual(@as(usize, 200 + 65536), cb.wp);
- try testing.expectEqual(@as(usize, 65536), cb.rp);
- try testing.expectEqual(@as(usize, 65536 - 200), cb.free());
-
- try testing.expectEqual(@as(usize, 200), cb.read().len); // read the rest
-}
-
-test "write overlap" {
- var cb: Self = .{};
- cb.wp = cb.buffer.len - 15;
- cb.rp = cb.wp;
-
- cb.writeAll("0123456789");
- cb.writeAll("abcdefghij");
-
- try testing.expectEqual(cb.buffer.len + 5, cb.wp);
- try testing.expectEqual(cb.buffer.len - 15, cb.rp);
-
- try testing.expectEqualStrings("0123456789abcde", cb.read());
- try testing.expectEqualStrings("fghij", cb.read());
-
- try testing.expect(cb.wp == cb.rp);
-}
-
-test "writeMatch/read overlap" {
- var cb: Self = .{};
- cb.wp = cb.buffer.len - 15;
- cb.rp = cb.wp;
-
- cb.writeAll("0123456789");
- try cb.writeMatch(15, 5);
-
- try testing.expectEqualStrings("012345678956789", cb.read());
- try testing.expectEqualStrings("5678956789", cb.read());
-
- try cb.writeMatch(20, 25);
- try testing.expectEqualStrings("01234567895678956789", cb.read());
-}
lib/std/compress/flate/Compress.zig
@@ -0,0 +1,1264 @@
+//! Default compression algorithm. Has two steps: tokenization and token
+//! encoding.
+//!
+//! Tokenization takes the uncompressed input stream and produces a list of
+//! tokens. Each token is either a literal (a byte of data) or a match
+//! (back-reference to previous data, with length and distance). Tokenization
+//! accumulates 32K tokens; when the buffer is full, or when `flush` is
+//! called, the tokens are passed to the `block_writer`. Level defines how
+//! hard (how slow) it tries to find a match.
+//!
+//! The block writer decides which type of deflate block to write (stored,
+//! fixed, dynamic) and encodes tokens to the output byte stream. The client
+//! has to call `finish` to write the last block with the final bit set.
+//!
+//! Container defines the type of header and footer, which can be gzip, zlib
+//! or raw. They all share the same deflate body. Raw has no header or footer,
+//! just the deflate body.
+//!
+//! Compression algorithm explained in rfc-1951 (slightly edited for this case):
+//!
+//! The compressor uses a chained hash table `lookup` to find duplicated
+//! strings, using a hash function that operates on 4-byte sequences. At any
+//! given point during compression, let XYZW be the next 4 input bytes
+//! (lookahead) to be examined (not necessarily all different, of course).
+//! First, the compressor examines the hash chain for XYZW. If the chain is
+//! empty, the compressor simply writes out X as a literal byte and advances
+//! one byte in the input. If the hash chain is not empty, indicating that the
+//! sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
+//! hash function value) has occurred recently, the compressor compares all
+//! strings on the XYZW hash chain with the actual input data sequence
+//! starting at the current point, and selects the longest match.
+//!
+//! To improve overall compression, the compressor defers the selection of
+//! matches ("lazy matching"): after a match of length N has been found, the
+//! compressor searches for a longer match starting at the next input byte. If
+//! it finds a longer match, it truncates the previous match to a length of
+//! one (thus producing a single literal byte) and then emits the longer
+//! match. Otherwise, it emits the original match, and, as described above,
+//! advances N bytes before continuing.
+//!
+//!
+//! Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
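+//!
+//! For example (mirroring the "tokenization" test below), the input
+//! "Blah blah blah blah blah!" tokenizes to the literals 'B', 'l', 'a',
+//! 'h', ' ', 'b', followed by a single match <length 18, distance 5>
+//! covering the repeated bytes, and the final literal '!'.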
+const builtin = @import("builtin");
+const std = @import("std");
+const assert = std.debug.assert;
+const testing = std.testing;
+const expect = testing.expect;
+const mem = std.mem;
+const math = std.math;
+const Writer = std.Io.Writer;
+const Reader = std.Io.Reader;
+
+const Compress = @This();
+const Token = @import("Token.zig");
+const BlockWriter = @import("BlockWriter.zig");
+const flate = @import("../flate.zig");
+const Container = flate.Container;
+const Lookup = @import("Lookup.zig");
+const huffman = flate.huffman;
+
+lookup: Lookup = .{},
+tokens: Tokens = .{},
+/// Asserted to have a buffer capacity of at least `flate.max_window_len`.
+input: *Reader,
+block_writer: BlockWriter,
+level: LevelArgs,
+hasher: Container.Hasher,
+reader: Reader,
+
+// Match and literal at the previous position.
+// Used for lazy match finding in processWindow.
+prev_match: ?Token = null,
+prev_literal: ?u8 = null,
+// Streaming state for `stream`; payloads count header/footer bytes emitted.
+state: State = .{ .header = 0 },
+// Scratch space for the container footer while it is streamed out.
+footer_buffer: [8]u8 = undefined,
+
+/// Trades between speed and compression size.
+/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
+/// levels 1-3 are using different algorithm to perform faster but with less
+/// compression. That is not implemented here.
+pub const Level = enum(u4) {
+ level_4 = 4,
+ level_5 = 5,
+ level_6 = 6,
+ level_7 = 7,
+ level_8 = 8,
+ level_9 = 9,
+
+ fast = 0xb,
+ default = 0xc,
+ best = 0xd,
+};
+
+/// Number of tokens to accumulate in deflate before starting block encoding.
+///
+/// In zlib this depends on memlevel: 6 + memlevel, where default memlevel is
+/// 8 and max 9 that gives 14 or 15 bits.
+pub const n_tokens = 1 << 15;
+
+/// Algorithm knobs for each level.
+const LevelArgs = struct {
+ good: u16, // Do less lookups if we already have match of this length.
+ nice: u16, // Stop looking for better match if we found match with at least this length.
+ lazy: u16, // Don't do lazy match find if got match with at least this length.
+ chain: u16, // How many lookups for previous match to perform.
+
+ pub fn get(level: Level) LevelArgs {
+ return switch (level) {
+ .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
+ .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
+ .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
+ .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
+ .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
+ .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
+ };
+ }
+};
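+
+// For example, `.default` maps to `.{ .good = 8, .lazy = 16, .nice = 128,
+// .chain = 128 }`: a found match of length >= 16 is emitted without a lazy
+// search at the next byte, the chain walk stops once a match of length >= 128
+// is found, and entering the search with a match of length >= 8 already in
+// hand examines only a quarter of the chain.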
+
+pub const Options = struct {
+ level: Level = .default,
+ container: Container = .raw,
+};
+
+pub fn init(input: *Reader, buffer: []u8, options: Options) Compress {
+ return .{
+ .input = input,
+ .block_writer = undefined,
+ .level = .get(options.level),
+ .hasher = .init(options.container),
+        .state = .{ .header = 0 },
+ .reader = .{
+ .buffer = buffer,
+ .stream = stream,
+ },
+ };
+}
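+
+// Illustrative usage sketch (the names `data` and `out` are assumptions, not
+// part of this API):
+//
+//   var in: Reader = .fixed(data);
+//   var buf: [flate.max_window_len]u8 = undefined;
+//   var c: Compress = .init(&in, &buf, .{ .container = .gzip });
+//   _ = try c.reader.streamRemaining(out); // pull compressed bytes into `out`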
+
+const FlushOption = enum { none, flush, final };
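+
+// State machine for the compressing `Reader.stream` implementation below.
+// (This declaration is a sketch inferred from its usage in `stream`; the
+// `header` and `footer` payloads index into the partially written bytes.)
+const State = union(enum) {
+    header: usize,
+    middle,
+    final,
+    footer: usize,
+    ended,
+};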
+
+/// Process data in window and create tokens. If token buffer is full
+/// flush tokens to the token writer.
+///
+/// Returns number of bytes consumed from `lh`.
+fn tokenizeSlice(c: *Compress, bw: *Writer, limit: std.Io.Limit, lh: []const u8) !usize {
+ _ = bw;
+ _ = limit;
+ if (true) @panic("TODO");
+ var step: u16 = 1; // 1 in the case of literal, match length otherwise
+ const pos: u16 = c.win.pos();
+ const literal = lh[0]; // literal at current position
+ const min_len: u16 = if (c.prev_match) |m| m.length() else 0;
+
+ // Try to find match at least min_len long.
+ if (c.findMatch(pos, lh, min_len)) |match| {
+ // Found better match than previous.
+ try c.addPrevLiteral();
+
+ // Is found match length good enough?
+ if (match.length() >= c.level.lazy) {
+ // Don't try to lazy find better match, use this.
+ step = try c.addMatch(match);
+ } else {
+ // Store this match.
+ c.prev_literal = literal;
+ c.prev_match = match;
+ }
+ } else {
+        // There is no better match at the current position than the previous one.
+ // Write previous match or literal.
+ if (c.prev_match) |m| {
+ // Write match from previous position.
+ step = try c.addMatch(m) - 1; // we already advanced 1 from previous position
+ } else {
+ // No match at previous position.
+ // Write previous literal if any, and remember this literal.
+ try c.addPrevLiteral();
+ c.prev_literal = literal;
+ }
+ }
+ // Advance window and add hashes.
+ c.windowAdvance(step, lh, pos);
+}
+
+fn windowAdvance(self: *Compress, step: u16, lh: []const u8, pos: u16) void {
+ // current position is already added in findMatch
+ self.lookup.bulkAdd(lh[1..], step - 1, pos + 1);
+ self.win.advance(step);
+}
+
+// Add previous literal (if any) to the tokens list.
+fn addPrevLiteral(self: *Compress) !void {
+ if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
+}
+
+// Add match to the tokens list, reset prev pointers.
+// Returns length of the added match.
+fn addMatch(self: *Compress, m: Token) !u16 {
+ try self.addToken(m);
+ self.prev_literal = null;
+ self.prev_match = null;
+ return m.length();
+}
+
+fn addToken(self: *Compress, token: Token) !void {
+ self.tokens.add(token);
+ if (self.tokens.full()) try self.flushTokens(.none);
+}
+
+// Finds largest match in the history window with the data at current pos.
+fn findMatch(self: *Compress, pos: u16, lh: []const u8, min_len: u16) ?Token {
+ var len: u16 = min_len;
+ // Previous location with the same hash (same 4 bytes).
+ var prev_pos = self.lookup.add(lh, pos);
+ // Last found match.
+ var match: ?Token = null;
+
+    // How many back-references to try; performance knob.
+ var chain: usize = self.level.chain;
+ if (len >= self.level.good) {
+ // If we've got a match that's good enough, only look in 1/4 the chain.
+ chain >>= 2;
+ }
+
+ // Hot path loop!
+ while (prev_pos > 0 and chain > 0) : (chain -= 1) {
+ const distance = pos - prev_pos;
+ if (distance > flate.match.max_distance)
+ break;
+
+ const new_len = self.win.match(prev_pos, pos, len);
+ if (new_len > len) {
+ match = Token.initMatch(@intCast(distance), new_len);
+ if (new_len >= self.level.nice) {
+ // The match is good enough that we don't try to find a better one.
+ return match;
+ }
+ len = new_len;
+ }
+ prev_pos = self.lookup.prev(prev_pos);
+ }
+
+ return match;
+}
+
+fn flushTokens(self: *Compress, flush_opt: FlushOption) !void {
+ // Pass tokens to the token writer
+ try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer());
+ // Stored block ensures byte alignment.
+ // It has 3 bits (final, block_type) and then padding until byte boundary.
+ // After that everything is aligned to the boundary in the stored block.
+    // Empty stored block is 0b000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF.
+ // Last 4 bytes are byte aligned.
+ if (flush_opt == .flush) {
+ try self.block_writer.storedBlock("", false);
+ }
+ if (flush_opt != .none) {
+ // Safe to call only when byte aligned or it is OK to add
+ // padding bits (on last byte of the final block).
+ try self.block_writer.flush();
+ }
+ // Reset internal tokens store.
+ self.tokens.reset();
+ // Notify win that tokens are flushed.
+ self.win.flush();
+}
+
+// Slide win and if needed lookup tables.
+fn slide(self: *Compress) void {
+ const n = self.win.slide();
+ self.lookup.slide(n);
+}
+
+/// Flushes internal buffers to the output writer. Outputs empty stored
+/// block to sync bit stream to the byte boundary, so that the
+/// decompressor can get all input data available so far.
+///
+/// It is useful mainly in compressed network protocols, to ensure that
+/// deflate bit stream can be used as byte stream. May degrade
+/// compression so it should be used only when necessary.
+///
+/// Completes the current deflate block and follows it with an empty
+/// stored block that is three zero bits plus filler bits to the next
+/// byte, followed by four bytes (00 00 ff ff).
+///
+pub fn flush(c: *Compress) !void {
+ try c.tokenize(.flush);
+}
+
+/// Completes the deflate bit stream by writing any pending data as the
+/// final deflate block. Has to be called once all data is written to
+/// the compressor, as a signal that the next block has to have the
+/// final bit set.
+///
+pub fn finish(c: *Compress) !void {
+ _ = c;
+ @panic("TODO");
+}
+
+/// Use another writer while preserving history. Most probably, flush
+/// should be called on the old writer before setting the new one.
+pub fn setWriter(self: *Compress, new_writer: *Writer) void {
+ self.block_writer.setWriter(new_writer);
+ self.output = new_writer;
+}
+
+// Tokens store
+const Tokens = struct {
+ list: [n_tokens]Token = undefined,
+ pos: usize = 0,
+
+ fn add(self: *Tokens, t: Token) void {
+ self.list[self.pos] = t;
+ self.pos += 1;
+ }
+
+ fn full(self: *Tokens) bool {
+ return self.pos == self.list.len;
+ }
+
+ fn reset(self: *Tokens) void {
+ self.pos = 0;
+ }
+
+ fn tokens(self: *Tokens) []const Token {
+ return self.list[0..self.pos];
+ }
+};
+
+/// Creates huffman only deflate blocks. Disables Lempel-Ziv match searching and
+/// only performs Huffman entropy encoding. Results in faster compression and
+/// much lower memory requirements during compression, but bigger compressed sizes.
+pub const Huffman = SimpleCompressor(.huffman, .raw);
+
+/// Creates store blocks only. Data is not compressed, only packed into deflate
+/// stored blocks. That adds 5 bytes of header for each block (see the "store
+/// simple compressor" test). Max stored block size is 64K. A block is emitted
+/// when flush or finish is called.
+pub const store = struct {
+ pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
+ return SimpleCompressor(.store, container, WriterType);
+ }
+
+ pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) {
+ return try store.Compressor(container, @TypeOf(writer)).init(writer);
+ }
+};
+
+const SimpleCompressorKind = enum {
+ huffman,
+ store,
+};
+
+fn simpleCompressor(
+ comptime kind: SimpleCompressorKind,
+ comptime container: Container,
+ writer: anytype,
+) !SimpleCompressor(kind, container, @TypeOf(writer)) {
+ return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
+}
+
+fn SimpleCompressor(
+ comptime kind: SimpleCompressorKind,
+ comptime container: Container,
+ comptime WriterType: type,
+) type {
+ const BlockWriterType = BlockWriter(WriterType);
+ return struct {
+ buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
+ wp: usize = 0,
+
+ output: WriterType,
+ block_writer: BlockWriterType,
+ hasher: container.Hasher() = .{},
+
+ const Self = @This();
+
+ pub fn init(output: WriterType) !Self {
+ const self = Self{
+ .output = output,
+ .block_writer = BlockWriterType.init(output),
+ };
+ try container.writeHeader(self.output);
+ return self;
+ }
+
+ pub fn flush(self: *Self) !void {
+ try self.flushBuffer(false);
+ try self.block_writer.storedBlock("", false);
+ try self.block_writer.flush();
+ }
+
+ pub fn finish(self: *Self) !void {
+ try self.flushBuffer(true);
+ try self.block_writer.flush();
+ try container.writeFooter(&self.hasher, self.output);
+ }
+
+ fn flushBuffer(self: *Self, final: bool) !void {
+ const buf = self.buffer[0..self.wp];
+ switch (kind) {
+ .huffman => try self.block_writer.huffmanBlock(buf, final),
+ .store => try self.block_writer.storedBlock(buf, final),
+ }
+ self.wp = 0;
+ }
+ };
+}
+
+const LiteralNode = struct {
+ literal: u16,
+ freq: u16,
+};
+
+// Describes the state of the constructed tree for a given depth.
+const LevelInfo = struct {
+ // Our level. for better printing
+ level: u32,
+
+ // The frequency of the last node at this level
+ last_freq: u32,
+
+ // The frequency of the next character to add to this level
+ next_char_freq: u32,
+
+ // The frequency of the next pair (from level below) to add to this level.
+ // Only valid if the "needed" value of the next lower level is 0.
+ next_pair_freq: u32,
+
+ // The number of chains remaining to generate for this level before moving
+ // up to the next level
+ needed: u32,
+};
+
+// hcode is a huffman code with a bit code and bit length.
+pub const HuffCode = struct {
+ code: u16 = 0,
+ len: u16 = 0,
+
+    // Sets the code and length of an hcode.
+ fn set(self: *HuffCode, code: u16, length: u16) void {
+ self.len = length;
+ self.code = code;
+ }
+};
+
+pub fn HuffmanEncoder(comptime size: usize) type {
+ return struct {
+ codes: [size]HuffCode = undefined,
+ // Reusable buffer with the longest possible frequency table.
+ freq_cache: [huffman.max_num_frequencies + 1]LiteralNode = undefined,
+ bit_count: [17]u32 = undefined,
+ lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
+ lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate
+
+ const Self = @This();
+
+ // Update this Huffman Code object to be the minimum code for the specified frequency count.
+ //
+ // freq An array of frequencies, in which frequency[i] gives the frequency of literal i.
+ // max_bits The maximum number of bits to use for any literal.
+ pub fn generate(self: *Self, freq: []u16, max_bits: u32) void {
+ var list = self.freq_cache[0 .. freq.len + 1];
+ // Number of non-zero literals
+ var count: u32 = 0;
+ // Set list to be the set of all non-zero literals and their frequencies
+ for (freq, 0..) |f, i| {
+ if (f != 0) {
+ list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f };
+ count += 1;
+ } else {
+ list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
+ self.codes[i].len = 0;
+ }
+ }
+ list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };
+
+ list = list[0..count];
+ if (count <= 2) {
+ // Handle the small cases here, because they are awkward for the general case code. With
+ // two or fewer literals, everything has bit length 1.
+ for (list, 0..) |node, i| {
+ // "list" is in order of increasing literal value.
+ self.codes[node.literal].set(@as(u16, @intCast(i)), 1);
+ }
+ return;
+ }
+ self.lfs = list;
+ mem.sort(LiteralNode, self.lfs, {}, byFreq);
+
+ // Get the number of literals for each bit count
+ const bit_count = self.bitCounts(list, max_bits);
+ // And do the assignment
+ self.assignEncodingAndSize(bit_count, list);
+ }
+
+ pub fn bitLength(self: *Self, freq: []u16) u32 {
+ var total: u32 = 0;
+ for (freq, 0..) |f, i| {
+ if (f != 0) {
+ total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len));
+ }
+ }
+ return total;
+ }
+
+ // Return the number of literals assigned to each bit size in the Huffman encoding
+ //
+ // This method is only called when list.len >= 3
+ // The cases of 0, 1, and 2 literals are handled by special case code.
+ //
+ // list: An array of the literals with non-zero frequencies
+ // and their associated frequencies. The array is in order of increasing
+ // frequency, and has as its last element a special element with frequency
+ // `math.maxInt(i32)`
+ //
+ // max_bits: The maximum number of bits that should be used to encode any literal.
+ // Must be less than 16.
+ //
+ // Returns an integer array in which array[i] indicates the number of literals
+ // that should be encoded in i bits.
+ fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 {
+ var max_bits = max_bits_to_use;
+ const n = list.len;
+ const max_bits_limit = 16;
+
+ assert(max_bits < max_bits_limit);
+
+ // The tree can't have greater depth than n - 1, no matter what. This
+ // saves a little bit of work in some small cases
+ max_bits = @min(max_bits, n - 1);
+
+ // Create information about each of the levels.
+ // A bogus "Level 0" whose sole purpose is so that
+ // level1.prev.needed == 0. This makes level1.next_pair_freq
+ // be a legitimate value that never gets chosen.
+ var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
+ // leaf_counts[i] counts the number of literals at the left
+ // of ancestors of the rightmost node at level i.
+ // leaf_counts[i][j] is the number of literals at the left
+ // of the level j ancestor.
+ var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);
+
+ {
+ var level = @as(u32, 1);
+ while (level <= max_bits) : (level += 1) {
+ // For every level, the first two items are the first two characters.
+ // We initialize the levels as if we had already figured this out.
+ levels[level] = LevelInfo{
+ .level = level,
+ .last_freq = list[1].freq,
+ .next_char_freq = list[2].freq,
+ .next_pair_freq = list[0].freq + list[1].freq,
+ .needed = 0,
+ };
+ leaf_counts[level][level] = 2;
+ if (level == 1) {
+ levels[level].next_pair_freq = math.maxInt(i32);
+ }
+ }
+ }
+
+ // We need a total of 2*n - 2 items at top level and have already generated 2.
+ levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4;
+
+ {
+ var level = max_bits;
+ while (true) {
+ var l = &levels[level];
+ if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
+ // We've run out of both leaves and pairs.
+ // End all calculations for this level.
+ // To make sure we never come back to this level or any lower level,
+ // set next_pair_freq impossibly large.
+ l.needed = 0;
+ levels[level + 1].next_pair_freq = math.maxInt(i32);
+ level += 1;
+ continue;
+ }
+
+ const prev_freq = l.last_freq;
+ if (l.next_char_freq < l.next_pair_freq) {
+ // The next item on this row is a leaf node.
+ const next = leaf_counts[level][level] + 1;
+ l.last_freq = l.next_char_freq;
+                    // Lower leaf_counts are the same as the previous node's.
+ leaf_counts[level][level] = next;
+ if (next >= list.len) {
+ l.next_char_freq = maxNode().freq;
+ } else {
+ l.next_char_freq = list[next].freq;
+ }
+ } else {
+ // The next item on this row is a pair from the previous row.
+ // next_pair_freq isn't valid until we generate two
+ // more values in the level below
+ l.last_freq = l.next_pair_freq;
+ // Take leaf counts from the lower level, except counts[level] remains the same.
+ @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
+ levels[l.level - 1].needed = 2;
+ }
+
+ l.needed -= 1;
+ if (l.needed == 0) {
+ // We've done everything we need to do for this level.
+ // Continue calculating one level up. Fill in next_pair_freq
+ // of that level with the sum of the two nodes we've just calculated on
+ // this level.
+ if (l.level == max_bits) {
+ // All done!
+ break;
+ }
+ levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
+ level += 1;
+ } else {
+ // If we stole from below, move down temporarily to replenish it.
+ while (levels[level - 1].needed > 0) {
+ level -= 1;
+ if (level == 0) {
+ break;
+ }
+ }
+ }
+ }
+ }
+
+            // Something is wrong if, at the end, the top level hasn't used
+            // all of the leaves.
+ assert(leaf_counts[max_bits][max_bits] == n);
+
+ var bit_count = self.bit_count[0 .. max_bits + 1];
+ var bits: u32 = 1;
+ const counts = &leaf_counts[max_bits];
+ {
+ var level = max_bits;
+ while (level > 0) : (level -= 1) {
+ // counts[level] gives the number of literals requiring at least "bits"
+ // bits to encode.
+ bit_count[bits] = counts[level] - counts[level - 1];
+ bits += 1;
+ if (level == 0) {
+ break;
+ }
+ }
+ }
+ return bit_count;
+ }
+
+ // Look at the leaves and assign them a bit count and an encoding as specified
+ // in RFC 1951 3.2.2
+ fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void {
+ var code = @as(u16, 0);
+ var list = list_arg;
+
+ for (bit_count, 0..) |bits, n| {
+ code <<= 1;
+ if (n == 0 or bits == 0) {
+ continue;
+ }
+                // The literals list[list.len - bits] .. list[list.len - 1]
+                // are encoded using "bits" bits, and get the values
+                // code, code + 1, .... The code values are
+                // assigned in literal order (not frequency order).
+ const chunk = list[list.len - @as(u32, @intCast(bits)) ..];
+
+ self.lns = chunk;
+ mem.sort(LiteralNode, self.lns, {}, byLiteral);
+
+ for (chunk) |node| {
+ self.codes[node.literal] = HuffCode{
+ .code = bitReverse(u16, code, @as(u5, @intCast(n))),
+ .len = @as(u16, @intCast(n)),
+ };
+ code += 1;
+ }
+ list = list[0 .. list.len - @as(u32, @intCast(bits))];
+ }
+ }
+ };
+}
+
+fn maxNode() LiteralNode {
+ return LiteralNode{
+ .literal = math.maxInt(u16),
+ .freq = math.maxInt(u16),
+ };
+}
+
+pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) {
+ return .{};
+}
+
+pub const LiteralEncoder = HuffmanEncoder(huffman.max_num_frequencies);
+pub const DistanceEncoder = HuffmanEncoder(huffman.distance_code_count);
+pub const CodegenEncoder = HuffmanEncoder(19);
+
+// Generates a HuffmanCode corresponding to the fixed literal table
+pub fn fixedLiteralEncoder() LiteralEncoder {
+ var h: LiteralEncoder = undefined;
+ var ch: u16 = 0;
+
+ while (ch < huffman.max_num_frequencies) : (ch += 1) {
+ var bits: u16 = undefined;
+ var size: u16 = undefined;
+ switch (ch) {
+ 0...143 => {
+                    // size 8, 00110000 .. 10111111
+ bits = ch + 48;
+ size = 8;
+ },
+ 144...255 => {
+ // size 9, 110010000 .. 111111111
+ bits = ch + 400 - 144;
+ size = 9;
+ },
+ 256...279 => {
+ // size 7, 0000000 .. 0010111
+ bits = ch - 256;
+ size = 7;
+ },
+ else => {
+ // size 8, 11000000 .. 11000111
+ bits = ch + 192 - 280;
+ size = 8;
+ },
+ }
+ h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size };
+ }
+ return h;
+}
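+
+// For example, literal 'A' (65) falls in the 0...143 range: bits = 65 + 48 =
+// 0b01110001 with size 8, so its stored code is the 8-bit reversal
+// 0b10001110 (0x8e).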
+
+pub fn fixedDistanceEncoder() DistanceEncoder {
+ var h: DistanceEncoder = undefined;
+ for (h.codes, 0..) |_, ch| {
+ h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 };
+ }
+ return h;
+}
+
+pub fn huffmanDistanceEncoder() DistanceEncoder {
+ var distance_freq = [1]u16{0} ** huffman.distance_code_count;
+ distance_freq[0] = 1;
+ // huff_distance is a static distance encoder used for huffman only encoding.
+ // It can be reused since we will not be encoding distance values.
+ var h: DistanceEncoder = .{};
+ h.generate(distance_freq[0..], 15);
+ return h;
+}
+
+fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
+ _ = context;
+ return a.literal < b.literal;
+}
+
+fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
+ _ = context;
+ if (a.freq == b.freq) {
+ return a.literal < b.literal;
+ }
+ return a.freq < b.freq;
+}
+
+fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize {
+ const c: *Compress = @fieldParentPtr("reader", r);
+ switch (c.state) {
+ .header => |i| {
+ const header = c.hasher.container().header();
+ const n = try w.write(header[i..]);
+ if (header.len - i - n == 0) {
+ c.state = .middle;
+ } else {
+ c.state.header += n;
+ }
+ return n;
+ },
+ .middle => {
+ c.input.fillMore() catch |err| switch (err) {
+ error.EndOfStream => {
+ c.state = .final;
+ return 0;
+ },
+ else => |e| return e,
+ };
+ const buffer_contents = c.input.buffered();
+ const min_lookahead = flate.match.min_length + flate.match.max_length;
+ const history_plus_lookahead_len = flate.history_len + min_lookahead;
+ if (buffer_contents.len < history_plus_lookahead_len) return 0;
+ const lookahead = buffer_contents[flate.history_len..];
+ const start = w.count;
+            const n = c.tokenizeSlice(w, limit, lookahead) catch |err| switch (err) {
+                error.WriteFailed => return error.WriteFailed,
+            };
+ c.hasher.update(lookahead[0..n]);
+ c.input.toss(n);
+ return w.count - start;
+ },
+ .final => {
+ const buffer_contents = c.input.buffered();
+ const start = w.count;
+ const n = c.tokenizeSlice(w, limit, buffer_contents) catch |err| switch (err) {
+ error.WriteFailed => return error.WriteFailed,
+ };
+ if (buffer_contents.len - n == 0) {
+ c.hasher.update(buffer_contents);
+ c.input.tossAll();
+ {
+                    // In the case of flushing, the last few lookahead buffers were
+                    // smaller than the min match length, so only the last literal
+                    // can be unwritten.
+ assert(c.prev_match == null);
+ try c.addPrevLiteral();
+ c.prev_literal = null;
+
+ try c.flushTokens(.final);
+ }
+ switch (c.hasher) {
+ .gzip => |*gzip| {
+ // GZIP 8 bytes footer
+ // - 4 bytes, CRC32 (CRC-32)
+ // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
+ comptime assert(c.footer_buffer.len == 8);
+ std.mem.writeInt(u32, c.footer_buffer[0..4], gzip.final(), .little);
+ std.mem.writeInt(u32, c.footer_buffer[4..8], gzip.bytes_read, .little);
+ c.state = .{ .footer = 0 };
+ },
+ .zlib => |*zlib| {
+ // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
+ // 4 bytes of ADLER32 (Adler-32 checksum)
+ // Checksum value of the uncompressed data (excluding any
+ // dictionary data) computed according to Adler-32
+ // algorithm.
+ comptime assert(c.footer_buffer.len == 8);
+ std.mem.writeInt(u32, c.footer_buffer[4..8], zlib.final, .big);
+ c.state = .{ .footer = 4 };
+ },
+ .raw => {
+ c.state = .ended;
+ },
+ }
+ }
+ return w.count - start;
+ },
+ .ended => return error.EndOfStream,
+ .footer => |i| {
+ const remaining = c.footer_buffer[i..];
+ const n = try w.write(limit.slice(remaining));
+            c.state = if (n == remaining.len) .ended else .{ .footer = i + n };
+ return n;
+ },
+ }
+}
+
+test "generate a Huffman code from an array of frequencies" {
+ var freqs: [19]u16 = [_]u16{
+ 8, // 0
+ 1, // 1
+ 1, // 2
+ 2, // 3
+ 5, // 4
+ 10, // 5
+ 9, // 6
+ 1, // 7
+ 0, // 8
+ 0, // 9
+ 0, // 10
+ 0, // 11
+ 0, // 12
+ 0, // 13
+ 0, // 14
+ 0, // 15
+ 1, // 16
+ 3, // 17
+ 5, // 18
+ };
+
+ var enc = huffmanEncoder(19);
+ enc.generate(freqs[0..], 7);
+
+ try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));
+
+ try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
+ try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
+ try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
+ try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
+ try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
+ try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
+ try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
+ try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
+ try testing.expectEqual(@as(usize, 3), enc.codes[18].len);
+
+ try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
+ try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
+ try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
+ try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
+ try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
+ try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
+ try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
+ try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
+ try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
+ try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
+ try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
+}
+
+test "generate a Huffman code for the fixed literal table specific to Deflate" {
+ const enc = fixedLiteralEncoder();
+ for (enc.codes) |c| {
+ switch (c.len) {
+ 7 => {
+ const v = @bitReverse(@as(u7, @intCast(c.code)));
+ try testing.expect(v <= 0b0010111);
+ },
+ 8 => {
+ const v = @bitReverse(@as(u8, @intCast(c.code)));
+                try testing.expect((v >= 0b00110000 and v <= 0b10111111) or
+                    (v >= 0b11000000 and v <= 0b11000111));
+ },
+ 9 => {
+ const v = @bitReverse(@as(u9, @intCast(c.code)));
+ try testing.expect(v >= 0b110010000 and v <= 0b111111111);
+ },
+ else => unreachable,
+ }
+ }
+}
+
+test "generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" {
+ const enc = fixedDistanceEncoder();
+ for (enc.codes) |c| {
+ const v = @bitReverse(@as(u5, @intCast(c.code)));
+ try testing.expect(v <= 29);
+ try testing.expect(c.len == 5);
+ }
+}
+
+// Reverse, bit by bit, an n-bit code.
+fn bitReverse(comptime T: type, value: T, n: usize) T {
+ const r = @bitReverse(value);
+ return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).int.bits - n));
+}
+
+test bitReverse {
+ const ReverseBitsTest = struct {
+ in: u16,
+ bit_count: u5,
+ out: u16,
+ };
+
+ const reverse_bits_tests = [_]ReverseBitsTest{
+ .{ .in = 1, .bit_count = 1, .out = 1 },
+ .{ .in = 1, .bit_count = 2, .out = 2 },
+ .{ .in = 1, .bit_count = 3, .out = 4 },
+ .{ .in = 1, .bit_count = 4, .out = 8 },
+ .{ .in = 1, .bit_count = 5, .out = 16 },
+ .{ .in = 17, .bit_count = 5, .out = 17 },
+ .{ .in = 257, .bit_count = 9, .out = 257 },
+ .{ .in = 29, .bit_count = 5, .out = 23 },
+ };
+
+ for (reverse_bits_tests) |h| {
+ const v = bitReverse(u16, h.in, h.bit_count);
+ try std.testing.expectEqual(h.out, v);
+ }
+}
+
+test "fixedLiteralEncoder codes" {
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+ var bw = std.Io.bitWriter(.little, al.writer());
+
+ const f = fixedLiteralEncoder();
+ for (f.codes) |c| {
+ try bw.writeBits(c.code, c.len);
+ }
+ try testing.expectEqualSlices(u8, &fixed_codes, al.items);
+}
+
+pub const fixed_codes = [_]u8{
+ 0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100,
+ 0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100,
+ 0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010,
+ 0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010,
+ 0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010,
+ 0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010,
+ 0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110,
+ 0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110,
+ 0b00001110, 0b10001110, 0b01001110, 0b11001110, 0b00101110, 0b10101110, 0b01101110, 0b11101110,
+ 0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110,
+ 0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001,
+ 0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001,
+ 0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001,
+ 0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001,
+ 0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101,
+ 0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101,
+ 0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101,
+ 0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101,
+ 0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100,
+ 0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110,
+ 0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100,
+ 0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101,
+ 0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010,
+ 0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110,
+ 0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110,
+ 0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111,
+ 0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011,
+ 0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101,
+ 0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 0b11101110,
+ 0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100,
+ 0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101,
+ 0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010,
+ 0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110,
+ 0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000,
+ 0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010,
+ 0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100,
+ 0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011,
+ 0b10100011,
+};
+
+test "tokenization" {
+ const L = Token.initLiteral;
+ const M = Token.initMatch;
+
+ const cases = [_]struct {
+ data: []const u8,
+ tokens: []const Token,
+ }{
+ .{
+ .data = "Blah blah blah blah blah!",
+ .tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
+ },
+ .{
+ .data = "ABCDEABCD ABCDEABCD",
+ .tokens = &[_]Token{
+ L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
+ L('A'), M(10, 8),
+ },
+ },
+ };
+
+ for (cases) |c| {
+ inline for (Container.list) |container| { // for each wrapping
+
+ var cw = std.Io.countingWriter(std.Io.null_writer);
+ const cww = cw.writer();
+ var df = try Compress(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
+
+ _ = try df.write(c.data);
+ try df.flush();
+
+ // df.token_writer.show();
+ try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
+ try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
+
+ try testing.expectEqual(container.headerSize(), cw.bytes_written);
+ try df.finish();
+ try testing.expectEqual(container.size(), cw.bytes_written);
+ }
+ }
+}
+
+// Tests that tokens written are equal to expected token list.
+const TestTokenWriter = struct {
+ const Self = @This();
+
+ pos: usize = 0,
+ actual: [128]Token = undefined,
+
+ pub fn init(_: anytype) Self {
+ return .{};
+ }
+ pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
+ for (tokens) |t| {
+ self.actual[self.pos] = t;
+ self.pos += 1;
+ }
+ }
+
+ pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
+
+ pub fn get(self: *Self) []Token {
+ return self.actual[0..self.pos];
+ }
+
+ pub fn show(self: *Self) void {
+ std.debug.print("\n", .{});
+ for (self.get()) |t| {
+ t.show();
+ }
+ }
+
+ pub fn flush(_: *Self) !void {}
+};
+
+test "file tokenization" {
+ const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
+ const cases = [_]struct {
+ data: []const u8, // uncompressed content
+        // expected number of tokens produced in deflate tokenization
+ tokens_count: [levels.len]usize = .{0} ** levels.len,
+ }{
+ .{
+ .data = @embedFile("testdata/rfc1951.txt"),
+ .tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
+ },
+
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-null-max.input"),
+ .tokens_count = .{ 257, 257, 257, 257, 257, 257 },
+ },
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-pi.input"),
+ .tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
+ },
+ .{
+ .data = @embedFile("testdata/block_writer/huffman-text.input"),
+ .tokens_count = .{ 235, 234, 234, 234, 234, 234 },
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip1.input"),
+ .tokens_count = .{ 333, 331, 331, 331, 331, 331 },
+ },
+ .{
+ .data = @embedFile("testdata/fuzz/roundtrip2.input"),
+ .tokens_count = .{ 334, 334, 334, 334, 334, 334 },
+ },
+ };
+
+ for (cases) |case| { // for each case
+ const data = case.data;
+
+ for (levels, 0..) |level, i| { // for each compression level
+ var original: Reader = .fixed(data);
+
+ // buffer for decompressed data
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+ const writer = al.writer();
+
+ // create compressor
+ const WriterType = @TypeOf(writer);
+ const TokenWriter = TokenDecoder(@TypeOf(writer));
+ var cmp = try Compress(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
+
+ // Stream uncompressed `original` data to the compressor. It will
+ // produce tokens list and pass that list to the TokenDecoder. This
+ // TokenDecoder uses CircularBuffer from inflate to convert list of
+ // tokens back to the uncompressed stream.
+ try cmp.compress(original.reader());
+ try cmp.flush();
+ const expected_count = case.tokens_count[i];
+ const actual = cmp.block_writer.tokens_count;
+ if (expected_count == 0) {
+ std.debug.print("actual token count {d}\n", .{actual});
+ } else {
+ try testing.expectEqual(expected_count, actual);
+ }
+
+ try testing.expectEqual(data.len, al.items.len);
+ try testing.expectEqualSlices(u8, data, al.items);
+ }
+ }
+}
+
+const TokenDecoder = struct {
+ output: *Writer,
+ tokens_count: usize,
+
+ pub fn init(output: *Writer) TokenDecoder {
+ return .{
+ .output = output,
+ .tokens_count = 0,
+ };
+ }
+
+ pub fn write(self: *TokenDecoder, tokens: []const Token, _: bool, _: ?[]const u8) !void {
+ self.tokens_count += tokens.len;
+ for (tokens) |t| {
+ switch (t.kind) {
+ .literal => self.hist.write(t.literal()),
+ .match => try self.hist.writeMatch(t.length(), t.distance()),
+ }
+ if (self.hist.free() < 285) try self.flushWin();
+ }
+ try self.flushWin();
+ }
+
+ fn flushWin(self: *TokenDecoder) !void {
+ while (true) {
+ const buf = self.hist.read();
+ if (buf.len == 0) break;
+ try self.output.writeAll(buf);
+ }
+ }
+};
+
+test "store simple compressor" {
+ const data = "Hello world!";
+ const expected = [_]u8{
+ 0x1, // block type 0, final bit set
+ 0xc, 0x0, // len = 12
+ 0xf3, 0xff, // ~len
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
+ //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
+ };
+
+ var fbs: Reader = .fixed(data);
+ var al = std.ArrayList(u8).init(testing.allocator);
+ defer al.deinit();
+
+ var cmp = try store.compressor(.raw, al.writer());
+ try cmp.compress(&fbs);
+ try cmp.finish();
+ try testing.expectEqualSlices(u8, &expected, al.items);
+
+ fbs = .fixed(data);
+ try al.resize(0);
+
+    // huffman only compressor will also emit a stored block for this small sample
+ var hc = try huffman.compressor(.raw, al.writer());
+ try hc.compress(&fbs);
+ try hc.finish();
+ try testing.expectEqualSlices(u8, &expected, al.items);
+}
+
+test "sliding window match" {
+ const data = "Blah blah blah blah blah!";
+ var win: Writer = .{};
+ try expect(win.write(data) == data.len);
+ try expect(win.wp == data.len);
+ try expect(win.rp == 0);
+
+ // length between l symbols
+ try expect(win.match(1, 6, 0) == 18);
+ try expect(win.match(1, 11, 0) == 13);
+ try expect(win.match(1, 16, 0) == 8);
+ try expect(win.match(1, 21, 0) == 0);
+
+ // position 15 = "blah blah!"
+ // position 20 = "blah!"
+ try expect(win.match(15, 20, 0) == 4);
+ try expect(win.match(15, 20, 3) == 4);
+ try expect(win.match(15, 20, 4) == 0);
+}
+
+test "sliding window slide" {
+ var win: Writer = .{};
+ win.wp = Writer.buffer_len - 11;
+ win.rp = Writer.buffer_len - 111;
+ win.buffer[win.rp] = 0xab;
+ try expect(win.lookahead().len == 100);
+ try expect(win.tokensBuffer().?.len == win.rp);
+
+ const n = win.slide();
+ try expect(n == 32757);
+ try expect(win.buffer[win.rp] == 0xab);
+ try expect(win.rp == Writer.hist_len - 111);
+ try expect(win.wp == Writer.hist_len - 11);
+ try expect(win.lookahead().len == 100);
+ try expect(win.tokensBuffer() == null);
+}
lib/std/compress/flate/consts.zig
@@ -1,49 +0,0 @@
-pub const deflate = struct {
- // Number of tokens to accumulate in deflate before starting block encoding.
- //
- // In zlib this depends on memlevel: 6 + memlevel, where default memlevel is
- // 8 and max 9 that gives 14 or 15 bits.
- pub const tokens = 1 << 15;
-};
-
-pub const match = struct {
- pub const base_length = 3; // smallest match length per the RFC section 3.2.5
- pub const min_length = 4; // min length used in this algorithm
- pub const max_length = 258;
-
- pub const min_distance = 1;
- pub const max_distance = 32768;
-};
-
-pub const history = struct {
- pub const len = match.max_distance;
-};
-
-pub const lookup = struct {
- pub const bits = 15;
- pub const len = 1 << bits;
- pub const shift = 32 - bits;
-};
-
-pub const huffman = struct {
- // The odd order in which the codegen code sizes are written.
- pub const codegen_order = [_]u32{ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
- // The number of codegen codes.
- pub const codegen_code_count = 19;
-
- // The largest distance code.
- pub const distance_code_count = 30;
-
- // Maximum number of literals.
- pub const max_num_lit = 286;
-
- // Max number of frequencies used for a Huffman Code
- // Possible lengths are codegen_code_count (19), distance_code_count (30) and max_num_lit (286).
- // The largest of these is max_num_lit.
- pub const max_num_frequencies = max_num_lit;
-
- // Biggest block size for uncompressed block.
- pub const max_store_block_size = 65535;
- // The special code used to mark the end of a block.
- pub const end_block_marker = 256;
-};
lib/std/compress/flate/container.zig
@@ -1,208 +0,0 @@
-//! Container of the deflate bit stream body. Container adds header before
-//! deflate bit stream and footer after. It can bi gzip, zlib or raw (no header,
-//! no footer, raw bit stream).
-//!
-//! Zlib format is defined in rfc 1950. Header has 2 bytes and footer 4 bytes
-//! addler 32 checksum.
-//!
-//! Gzip format is defined in rfc 1952. Header has 10+ bytes and footer 4 bytes
-//! crc32 checksum and 4 bytes of uncompressed data length.
-//!
-//!
-//! rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4
-//! rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5
-//!
-
-const std = @import("std");
-
-pub const Container = enum {
- raw, // no header or footer
- gzip, // gzip header and footer
- zlib, // zlib header and footer
-
- pub fn size(w: Container) usize {
- return headerSize(w) + footerSize(w);
- }
-
- pub fn headerSize(w: Container) usize {
- return switch (w) {
- .gzip => 10,
- .zlib => 2,
- .raw => 0,
- };
- }
-
- pub fn footerSize(w: Container) usize {
- return switch (w) {
- .gzip => 8,
- .zlib => 4,
- .raw => 0,
- };
- }
-
- pub const list = [_]Container{ .raw, .gzip, .zlib };
-
- pub const Error = error{
- BadGzipHeader,
- BadZlibHeader,
- WrongGzipChecksum,
- WrongGzipSize,
- WrongZlibChecksum,
- };
-
- pub fn writeHeader(comptime wrap: Container, writer: anytype) !void {
- switch (wrap) {
- .gzip => {
- // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5):
- // - ID1 (IDentification 1), always 0x1f
- // - ID2 (IDentification 2), always 0x8b
- // - CM (Compression Method), always 8 = deflate
- // - FLG (Flags), all set to 0
- // - 4 bytes, MTIME (Modification time), not used, all set to zero
- // - XFL (eXtra FLags), all set to zero
- // - OS (Operating System), 03 = Unix
- const gzipHeader = [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 };
- try writer.writeAll(&gzipHeader);
- },
- .zlib => {
- // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4):
- // 1st byte:
- // - The first four bits are the CINFO (compression info), which is 7 for the default deflate window size.
- // - The next four bits are the CM (compression method), which is 8 for deflate.
- // 2nd byte:
- // - The first two bits are the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best.
- // - The next bit, FDICT, is set if a dictionary is given.
- // - The final five FCHECK bits form a mod-31 checksum.
- //
- // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100
- const zlibHeader = [_]u8{ 0x78, 0b10_0_11100 };
- try writer.writeAll(&zlibHeader);
- },
- .raw => {},
- }
- }
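
For orientation, the hard-coded FCHECK value above can be derived mechanically: the CMF and FLG bytes, read as a big-endian u16, must be divisible by 31 (RFC 1950). A minimal sketch with a hypothetical helper (not part of this commit):

    // Derive the 5 FCHECK bits given CMF and the FLG byte with FCHECK zeroed.
    fn zlibFcheck(cmf: u8, flg_without_fcheck: u8) u5 {
        const base: u16 = @as(u16, cmf) * 256 + flg_without_fcheck;
        return @intCast((31 - base % 31) % 31);
    }
    // zlibFcheck(0x78, 0b10_0_00000) == 0b11100, matching the
    // { 0x78, 0b10_0_11100 } header written above.
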
-
- pub fn writeFooter(comptime wrap: Container, hasher: *Hasher(wrap), writer: anytype) !void {
- var bits: [4]u8 = undefined;
- switch (wrap) {
- .gzip => {
- // GZIP 8-byte footer
- // - 4 bytes, CRC32 (CRC-32)
- // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
- std.mem.writeInt(u32, &bits, hasher.chksum(), .little);
- try writer.writeAll(&bits);
-
- std.mem.writeInt(u32, &bits, hasher.bytesRead(), .little);
- try writer.writeAll(&bits);
- },
- .zlib => {
- // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
- // 4 bytes of ADLER32 (Adler-32 checksum)
- // Checksum value of the uncompressed data (excluding any
- // dictionary data) computed according to Adler-32
- // algorithm.
- std.mem.writeInt(u32, &bits, hasher.chksum(), .big);
- try writer.writeAll(&bits);
- },
- .raw => {},
- }
- }
-
- pub fn parseHeader(comptime wrap: Container, reader: anytype) !void {
- switch (wrap) {
- .gzip => try parseGzipHeader(reader),
- .zlib => try parseZlibHeader(reader),
- .raw => {},
- }
- }
-
- fn parseGzipHeader(reader: anytype) !void {
- const magic1 = try reader.read(u8);
- const magic2 = try reader.read(u8);
- const method = try reader.read(u8);
- const flags = try reader.read(u8);
- try reader.skipBytes(6); // mtime(4), xflags, os
- if (magic1 != 0x1f or magic2 != 0x8b or method != 0x08)
- return error.BadGzipHeader;
- // Flags description: https://www.rfc-editor.org/rfc/rfc1952.html#page-5
- if (flags != 0) {
- if (flags & 0b0000_0100 != 0) { // FEXTRA
- const extra_len = try reader.read(u16);
- try reader.skipBytes(extra_len);
- }
- if (flags & 0b0000_1000 != 0) { // FNAME
- try reader.skipStringZ();
- }
- if (flags & 0b0001_0000 != 0) { // FCOMMENT
- try reader.skipStringZ();
- }
- if (flags & 0b0000_0010 != 0) { // FHCRC
- try reader.skipBytes(2);
- }
- }
- }
-
- fn parseZlibHeader(reader: anytype) !void {
- const cm = try reader.read(u4);
- const cinfo = try reader.read(u4);
- _ = try reader.read(u8);
- if (cm != 8 or cinfo > 7) {
- return error.BadZlibHeader;
- }
- }
-
- pub fn parseFooter(comptime wrap: Container, hasher: *Hasher(wrap), reader: anytype) !void {
- switch (wrap) {
- .gzip => {
- try reader.fill(0);
- if (try reader.read(u32) != hasher.chksum()) return error.WrongGzipChecksum;
- if (try reader.read(u32) != hasher.bytesRead()) return error.WrongGzipSize;
- },
- .zlib => {
- const chksum: u32 = @byteSwap(hasher.chksum());
- if (try reader.read(u32) != chksum) return error.WrongZlibChecksum;
- },
- .raw => {},
- }
- }
-
- pub fn Hasher(comptime wrap: Container) type {
- const HasherType = switch (wrap) {
- .gzip => std.hash.Crc32,
- .zlib => std.hash.Adler32,
- .raw => struct {
- pub fn init() @This() {
- return .{};
- }
- },
- };
-
- return struct {
- hasher: HasherType = HasherType.init(),
- bytes: usize = 0,
-
- const Self = @This();
-
- pub fn update(self: *Self, buf: []const u8) void {
- switch (wrap) {
- .raw => {},
- else => {
- self.hasher.update(buf);
- self.bytes += buf.len;
- },
- }
- }
-
- pub fn chksum(self: *Self) u32 {
- switch (wrap) {
- .raw => return 0,
- else => return self.hasher.final(),
- }
- }
-
- pub fn bytesRead(self: *Self) u32 {
- return @truncate(self.bytes);
- }
- };
- }
-};
lib/std/compress/flate/Decompress.zig
@@ -0,0 +1,894 @@
+const std = @import("../../std.zig");
+const flate = std.compress.flate;
+const Container = flate.Container;
+const Token = @import("Token.zig");
+const testing = std.testing;
+const Decompress = @This();
+const Writer = std.io.Writer;
+const Reader = std.io.Reader;
+
+input: *Reader,
+reader: Reader,
+/// Hashes the uncompressed data, producing the checksum for the gzip/zlib footer.
+hasher: Container.Hasher,
+
+lit_dec: LiteralDecoder,
+dst_dec: DistanceDecoder,
+
+final_block: bool,
+state: State,
+
+read_err: ?Error,
+
+const BlockType = enum(u2) {
+ stored = 0,
+ fixed = 1,
+ dynamic = 2,
+};
+
+const State = union(enum) {
+ protocol_header,
+ block_header,
+ stored_block: u16,
+ fixed_block,
+ dynamic_block,
+ protocol_footer,
+ end,
+};
+
+pub const Error = Container.Error || error{
+ InvalidCode,
+ InvalidMatch,
+ InvalidBlockType,
+ WrongStoredBlockNlen,
+ InvalidDynamicBlockHeader,
+ EndOfStream,
+ ReadFailed,
+ OversubscribedHuffmanTree,
+ IncompleteHuffmanTree,
+ MissingEndOfBlockCode,
+};
+
+pub fn init(input: *Reader, container: Container, buffer: []u8) Decompress {
+ return .{
+ .reader = .{
+ // TODO populate discard so that when an amount is discarded that
+ // includes an entire frame, skip decoding that frame.
+ .vtable = &.{ .stream = stream },
+ .buffer = buffer,
+ .seek = 0,
+ .end = 0,
+ },
+ .input = input,
+ .hasher = .init(container),
+ .lit_dec = .{},
+ .dst_dec = .{},
+ .final_block = false,
+ .state = .protocol_header,
+ .read_err = null,
+ };
+}
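
For orientation, a usage sketch mirroring the tests at the end of this file (`compressed_bytes` and `allocator` are assumed):

    var in: std.io.Reader = .fixed(compressed_bytes);
    var decompress: Decompress = .init(&in, .gzip, &.{});
    var out: std.io.Writer.Allocating = .init(allocator);
    defer out.deinit();
    // Drive the decompressor through its generic Reader interface.
    _ = try decompress.reader.streamRemaining(&out.writer);
    // out.getWritten() now holds the uncompressed bytes.
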
+
+fn decodeLength(self: *Decompress, code: u8) !u16 {
+ if (code > 28) return error.InvalidCode;
+ const ml = Token.matchLength(code);
+ return if (ml.extra_bits == 0) // 0 - 5 extra bits
+ ml.base
+ else
+ ml.base + try self.takeNBitsBuffered(ml.extra_bits);
+}
+
+fn decodeDistance(self: *Decompress, code: u8) !u16 {
+ if (code > 29) return error.InvalidCode;
+ const md = Token.matchDistance(code);
+ return if (md.extra_bits == 0) // 0 - 13 extra bits
+ md.base
+ else
+ md.base + try self.takeNBitsBuffered(md.extra_bits);
+}
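
The base/extra-bit pairs consulted by these two helpers come from RFC 1951 section 3.2.5; assuming Token's tables follow the spec, a few sample rows for orientation:

    // Length codes (code = symbol - 257):
    //   code 0  -> base 3,   0 extra bits
    //   code 8  -> base 11,  1 extra bit   (lengths 11-12)
    //   code 28 -> base 258, 0 extra bits
    // Distance codes:
    //   code 0  -> base 1,     0 extra bits
    //   code 29 -> base 24577, 13 extra bits (distances 24577-32768)
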
+
+// Decodes a code-length symbol into code lengths. Writes the decoded
+// lengths into the lens slice starting at position pos. Returns the
+// number of positions advanced.
+fn dynamicCodeLength(self: *Decompress, code: u16, lens: []u4, pos: usize) !usize {
+ if (pos >= lens.len)
+ return error.InvalidDynamicBlockHeader;
+
+ switch (code) {
+ 0...15 => {
+ // Represent code lengths of 0 - 15
+ lens[pos] = @intCast(code);
+ return 1;
+ },
+ 16 => {
+ // Copy the previous code length 3 - 6 times.
+ // The next 2 bits indicate repeat length
+ const n: u8 = @as(u8, try self.takeBits(u2)) + 3;
+ if (pos == 0 or pos + n > lens.len)
+ return error.InvalidDynamicBlockHeader;
+ for (0..n) |i| {
+ lens[pos + i] = lens[pos + i - 1];
+ }
+ return n;
+ },
+ // Repeat a code length of 0 for 3 - 10 times. (3 bits of length)
+ 17 => return @as(u8, try self.takeBits(u3)) + 3,
+ // Repeat a code length of 0 for 11 - 138 times (7 bits of length)
+ 18 => return @as(u8, try self.takeBits(u7)) + 11,
+ else => return error.InvalidDynamicBlockHeader,
+ }
+}
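
A worked example of the repeat codes, following the semantics above:

    // The code-length symbol stream {8, 16 (extra 0b01), 18 (extra 0b0000011)}
    // expands to: one length 8, then 3 + 1 = 4 more copies of 8, then
    // 11 + 3 = 14 zero lengths -- 19 entries in total.
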
+
+// Peek 15 bits from the bit reader (the maximum code length is 15 bits).
+// Use the decoder to find the symbol for that code; the returned symbol
+// tells us how many bits were actually used. Shift the bit reader by that
+// many bits and return the symbol.
+fn decodeSymbol(self: *Decompress, decoder: anytype) !Symbol {
+ const sym = try decoder.find(try self.peekBitsReverseBuffered(u15));
+ try self.shiftBits(sym.code_bits);
+ return sym;
+}
+
+pub fn stream(r: *Reader, w: *Writer, limit: std.io.Limit) Reader.StreamError!usize {
+ const d: *Decompress = @alignCast(@fieldParentPtr("reader", r));
+ return readInner(d, w, limit) catch |err| switch (err) {
+ error.EndOfStream => return error.EndOfStream,
+ error.WriteFailed => return error.WriteFailed,
+ else => |e| {
+ // In the event of an error, state is unmodified so that it can be
+ // better used to diagnose the failure.
+ d.read_err = e;
+ return error.ReadFailed;
+ },
+ };
+}
+
+fn readInner(d: *Decompress, w: *Writer, limit: std.io.Limit) (Error || Reader.StreamError)!usize {
+ const in = d.input;
+ sw: switch (d.state) {
+ .protocol_header => switch (d.hasher.container()) {
+ .gzip => {
+ const Header = extern struct {
+ magic: u16 align(1),
+ method: u8,
+ flags: packed struct(u8) {
+ text: bool,
+ hcrc: bool,
+ extra: bool,
+ name: bool,
+ comment: bool,
+ reserved: u3,
+ },
+ mtime: u32 align(1),
+ xfl: u8,
+ os: u8,
+ };
+ const header = try in.takeStruct(Header, .little);
+ if (header.magic != 0x8b1f or header.method != 0x08)
+ return error.BadGzipHeader;
+ if (header.flags.extra) {
+ const extra_len = try in.takeInt(u16, .little);
+ try in.discardAll(extra_len);
+ }
+ if (header.flags.name) {
+ _ = try in.discardDelimiterInclusive(0);
+ }
+ if (header.flags.comment) {
+ _ = try in.discardDelimiterInclusive(0);
+ }
+ if (header.flags.hcrc) {
+ try in.discardAll(2);
+ }
+ continue :sw .block_header;
+ },
+ .zlib => {
+ const Header = extern struct {
+ cmf: packed struct(u8) {
+ cm: u4,
+ cinfo: u4,
+ },
+ flg: u8,
+ };
+ const header = try in.takeStruct(Header, .little);
+ if (header.cmf.cm != 8 or header.cmf.cinfo > 7) return error.BadZlibHeader;
+ continue :sw .block_header;
+ },
+ .raw => continue :sw .block_header,
+ },
+ .block_header => {
+ d.final_block = (try d.takeBits(u1)) != 0;
+ const block_type = try d.takeBits(BlockType);
+ switch (block_type) {
+ .stored => {
+ d.alignBitsToByte(); // skip padding until byte boundary
+ // everything after this is byte aligned in stored block
+ const len = try in.takeInt(u16, .little);
+ const nlen = try in.takeInt(u16, .little);
+ if (len != ~nlen) return error.WrongStoredBlockNlen;
+ continue :sw .{ .stored_block = len };
+ },
+ .fixed => continue :sw .fixed_block,
+ .dynamic => {
+ const hlit: u16 = @as(u16, try d.takeBits(u5)) + 257; // number of ll code entries present - 257
+ const hdist: u16 = @as(u16, try d.takeBits(u5)) + 1; // number of distance code entries - 1
+ const hclen: u8 = @as(u8, try d.takeBits(u4)) + 4; // hclen + 4 code lengths are encoded
+
+ if (hlit > 286 or hdist > 30)
+ return error.InvalidDynamicBlockHeader;
+
+ // lengths for code lengths
+ var cl_lens = [_]u4{0} ** 19;
+ for (0..hclen) |i| {
+ cl_lens[flate.huffman.codegen_order[i]] = try d.takeBits(u3);
+ }
+ var cl_dec: CodegenDecoder = .{};
+ try cl_dec.generate(&cl_lens);
+
+ // decoded code lengths
+ var dec_lens = [_]u4{0} ** (286 + 30);
+ var pos: usize = 0;
+ while (pos < hlit + hdist) {
+ const sym = try cl_dec.find(try d.peekBitsReverse(u7));
+ try d.shiftBits(sym.code_bits);
+ pos += try d.dynamicCodeLength(sym.symbol, &dec_lens, pos);
+ }
+ if (pos > hlit + hdist) {
+ return error.InvalidDynamicBlockHeader;
+ }
+
+ // literal code lengths to literal decoder
+ try d.lit_dec.generate(dec_lens[0..hlit]);
+
+ // distance code lengths to distance decoder
+ try d.dst_dec.generate(dec_lens[hlit .. hlit + hdist]);
+
+ continue :sw .dynamic_block;
+ },
+ }
+ },
+ .stored_block => |remaining_len| {
+ const out = try w.writableSliceGreedyPreserve(flate.history_len, 1);
+ const limited_out = limit.min(.limited(remaining_len)).slice(out);
+ const n = try d.input.readVec(&.{limited_out});
+ if (remaining_len - n == 0) {
+ d.state = if (d.final_block) .protocol_footer else .block_header;
+ } else {
+ d.state = .{ .stored_block = @intCast(remaining_len - n) };
+ }
+ w.advance(n);
+ return n;
+ },
+ .fixed_block => {
+ const start = w.count;
+ while (@intFromEnum(limit) > w.count - start) {
+ const code = try d.readFixedCode();
+ switch (code) {
+ 0...255 => try w.writeBytePreserve(flate.history_len, @intCast(code)),
+ 256 => {
+ d.state = if (d.final_block) .protocol_footer else .block_header;
+ return w.count - start;
+ },
+ 257...285 => {
+ // Handles fixed block non literal (length) code.
+ // Length code is followed by 5 bits of distance code.
+ const length = try d.decodeLength(@intCast(code - 257));
+ const distance = try d.decodeDistance(try d.takeBitsReverseBuffered(u5));
+ try writeMatch(w, length, distance);
+ },
+ else => return error.InvalidCode,
+ }
+ }
+ d.state = .fixed_block;
+ return w.count - start;
+ },
+ .dynamic_block => {
+ // In larger archives most blocks are usually dynamic, so decompression
+ // performance depends on this logic.
+ const start = w.count;
+ while (@intFromEnum(limit) > w.count - start) {
+ const sym = try d.decodeSymbol(&d.lit_dec);
+
+ switch (sym.kind) {
+ .literal => try w.writeBytePreserve(flate.history_len, sym.symbol),
+ .match => {
+ // Decode match backreference <length, distance>
+ const length = try d.decodeLength(sym.symbol);
+ const dsm = try d.decodeSymbol(&d.dst_dec);
+ const distance = try d.decodeDistance(dsm.symbol);
+ try writeMatch(w, length, distance);
+ },
+ .end_of_block => {
+ d.state = if (d.final_block) .protocol_footer else .block_header;
+ return w.count - start;
+ },
+ }
+ }
+ d.state = .dynamic_block;
+ return w.count - start;
+ },
+ .protocol_footer => {
+ d.alignBitsToByte();
+ switch (d.hasher) {
+ .gzip => |*gzip| {
+ if (try in.takeInt(u32, .little) != gzip.crc.final()) return error.WrongGzipChecksum;
+ if (try in.takeInt(u32, .little) != gzip.count) return error.WrongGzipSize;
+ },
+ .zlib => |*zlib| {
+ const chksum: u32 = @byteSwap(zlib.final());
+ if (try in.takeInt(u32, .big) != chksum) return error.WrongZlibChecksum;
+ },
+ .raw => {},
+ }
+ d.state = .end;
+ return 0;
+ },
+ .end => return error.EndOfStream,
+ }
+}
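
The `sw:` label with `continue :sw <state>` used above is Zig's labeled-switch dispatch; a minimal standalone sketch of the pattern (illustrative only, not from this commit):

    // Each prong either re-dispatches with a new state or terminates.
    fn demo(start: u2) u2 {
        state: switch (start) {
            0 => continue :state 1, // re-enter the switch with state 1
            1 => continue :state 2,
            2 => return 2, // terminal state
            3 => return 3,
        }
    }
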
+
+/// Write a match (back-reference into already-written output) of `length`
+/// bytes, starting `distance` bytes back from the current write position.
+fn writeMatch(bw: *Writer, length: u16, distance: u16) !void {
+ _ = bw;
+ _ = length;
+ _ = distance;
+ @panic("TODO");
+}
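
The body is left as a TODO in this commit. A minimal sketch of the required copy semantics on a plain history buffer (not the eventual Writer-based implementation; `hist` must hold at least `end + length` bytes):

    fn copyMatch(hist: []u8, end: usize, length: u16, distance: u16) !usize {
        if (distance == 0 or distance > end) return error.InvalidMatch;
        // Byte-at-a-time so overlapping matches (distance < length)
        // replicate just-written output, as DEFLATE requires.
        for (0..length) |i| {
            hist[end + i] = hist[end + i - distance];
        }
        return end + length;
    }
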
+
+fn takeBits(d: *Decompress, comptime T: type) !T {
+ _ = d;
+ @panic("TODO");
+}
+
+fn takeBitsReverseBuffered(d: *Decompress, comptime T: type) !T {
+ _ = d;
+ @panic("TODO");
+}
+
+fn takeNBitsBuffered(d: *Decompress, n: u4) !u16 {
+ _ = d;
+ _ = n;
+ @panic("TODO");
+}
+
+fn peekBitsReverse(d: *Decompress, comptime T: type) !T {
+ _ = d;
+ @panic("TODO");
+}
+
+fn peekBitsReverseBuffered(d: *Decompress, comptime T: type) !T {
+ _ = d;
+ @panic("TODO");
+}
+
+fn alignBitsToByte(d: *Decompress) void {
+ _ = d;
+ @panic("TODO");
+}
+
+fn shiftBits(d: *Decompress, n: u6) !void {
+ _ = d;
+ _ = n;
+ @panic("TODO");
+}
+
+fn readFixedCode(d: *Decompress) !u16 {
+ _ = d;
+ @panic("TODO");
+}
+
+pub const Symbol = packed struct {
+ pub const Kind = enum(u2) {
+ literal,
+ end_of_block,
+ match,
+ };
+
+ symbol: u8 = 0, // symbol from alphabet
+ code_bits: u4 = 0, // number of bits in code 0-15
+ kind: Kind = .literal,
+
+ code: u16 = 0, // huffman code of the symbol
+ next: u16 = 0, // pointer to the next symbol in linked list
+ // it is safe to use 0 as the null pointer; when sorted, index 0 has the shortest code and fits into the lookup table
+
+ // Sorting less than function.
+ pub fn asc(_: void, a: Symbol, b: Symbol) bool {
+ if (a.code_bits == b.code_bits) {
+ if (a.kind == b.kind) {
+ return a.symbol < b.symbol;
+ }
+ return @intFromEnum(a.kind) < @intFromEnum(b.kind);
+ }
+ return a.code_bits < b.code_bits;
+ }
+};
+
+pub const LiteralDecoder = HuffmanDecoder(286, 15, 9);
+pub const DistanceDecoder = HuffmanDecoder(30, 15, 9);
+pub const CodegenDecoder = HuffmanDecoder(19, 7, 7);
+
+/// Creates huffman tree codes from a list of code lengths (in `generate`).
+///
+/// `find` then finds the symbol for given code bits. A code can be any length
+/// between 1 and 15 bits. When calling `find` we don't know how many bits
+/// will be used to find the symbol. The returned symbol has a code_bits field
+/// which defines how far we should advance in the bit stream.
+///
+/// A lookup table is used to map a 15 bit int to a symbol. The same symbol is
+/// written many times in this table; 32K places for (at most) 286 symbols.
+/// The small lookup table is an optimization for faster search.
+/// It is a variation of the algorithm explained in [zlib](https://github.com/madler/zlib/blob/643e17b7498d12ab8d15565662880579692f769d/doc/algorithm.txt#L92)
+/// with the difference that here we use statically allocated arrays.
+///
+fn HuffmanDecoder(
+ comptime alphabet_size: u16,
+ comptime max_code_bits: u4,
+ comptime lookup_bits: u4,
+) type {
+ const lookup_shift = max_code_bits - lookup_bits;
+
+ return struct {
+ // all symbols in alphabet, sorted by code_len, symbol
+ symbols: [alphabet_size]Symbol = undefined,
+ // lookup table code -> symbol
+ lookup: [1 << lookup_bits]Symbol = undefined,
+
+ const Self = @This();
+
+ /// Generates symbols and lookup tables from list of code lens for each symbol.
+ pub fn generate(self: *Self, lens: []const u4) !void {
+ try checkCompleteness(lens);
+
+ // init alphabet with code_bits
+ for (self.symbols, 0..) |_, i| {
+ const cb: u4 = if (i < lens.len) lens[i] else 0;
+ self.symbols[i] = if (i < 256)
+ .{ .kind = .literal, .symbol = @intCast(i), .code_bits = cb }
+ else if (i == 256)
+ .{ .kind = .end_of_block, .symbol = 0xff, .code_bits = cb }
+ else
+ .{ .kind = .match, .symbol = @intCast(i - 257), .code_bits = cb };
+ }
+ std.sort.heap(Symbol, &self.symbols, {}, Symbol.asc);
+
+ // reset lookup table
+ for (0..self.lookup.len) |i| {
+ self.lookup[i] = .{};
+ }
+
+ // assign code to symbols
+ // reference: https://youtu.be/9_YEGLe33NA?list=PLU4IQLU9e_OrY8oASHx0u3IXAL9TOdidm&t=2639
+ var code: u16 = 0;
+ var idx: u16 = 0;
+ for (&self.symbols, 0..) |*sym, pos| {
+ if (sym.code_bits == 0) continue; // skip unused
+ sym.code = code;
+
+ const next_code = code + (@as(u16, 1) << (max_code_bits - sym.code_bits));
+ const next_idx = next_code >> lookup_shift;
+
+ if (next_idx > self.lookup.len or idx >= self.lookup.len) break;
+ if (sym.code_bits <= lookup_bits) {
+ // fill small lookup table
+ for (idx..next_idx) |j|
+ self.lookup[j] = sym.*;
+ } else {
+ // insert into linked table starting at root
+ const root = &self.lookup[idx];
+ const root_next = root.next;
+ root.next = @intCast(pos);
+ sym.next = root_next;
+ }
+
+ idx = next_idx;
+ code = next_code;
+ }
+ }
+
+ /// Given the list of code lengths check that it represents a canonical
+ /// Huffman code for n symbols.
+ ///
+ /// Reference: https://github.com/madler/zlib/blob/5c42a230b7b468dff011f444161c0145b5efae59/contrib/puff/puff.c#L340
+ fn checkCompleteness(lens: []const u4) !void {
+ if (alphabet_size == 286)
+ if (lens[256] == 0) return error.MissingEndOfBlockCode;
+
+ var count = [_]u16{0} ** (@as(usize, max_code_bits) + 1);
+ var max: usize = 0;
+ for (lens) |n| {
+ if (n == 0) continue;
+ if (n > max) max = n;
+ count[n] += 1;
+ }
+ if (max == 0) // empty tree
+ return;
+
+ // check for an over-subscribed or incomplete set of lengths
+ var left: usize = 1; // one possible code of zero length
+ for (1..count.len) |len| {
+ left <<= 1; // one more bit, double codes left
+ if (count[len] > left)
+ return error.OversubscribedHuffmanTree;
+ left -= count[len]; // deduct count from possible codes
+ }
+ if (left > 0) { // left > 0 means incomplete
+ // incomplete code ok only for single length 1 code
+ if (max_code_bits > 7 and max == count[0] + count[1]) return;
+ return error.IncompleteHuffmanTree;
+ }
+ }
+
+ /// Finds symbol for lookup table code.
+ pub fn find(self: *Self, code: u16) !Symbol {
+ // try to find in lookup table
+ const idx = code >> lookup_shift;
+ const sym = self.lookup[idx];
+ if (sym.code_bits != 0) return sym;
+ // if not use linked list of symbols with same prefix
+ return self.findLinked(code, sym.next);
+ }
+
+ inline fn findLinked(self: *Self, code: u16, start: u16) !Symbol {
+ var pos = start;
+ while (pos > 0) {
+ const sym = self.symbols[pos];
+ const shift = max_code_bits - sym.code_bits;
+ // compare code_bits number of upper bits
+ if ((code ^ sym.code) >> shift == 0) return sym;
+ pos = sym.next;
+ }
+ return error.InvalidCode;
+ }
+ };
+}
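
The over/under-subscription check in `checkCompleteness` is the Kraft inequality in integer form; a worked run for lens = {2, 2, 3, 3}:

    // len 1: left = 2, count = 0 -> left = 2
    // len 2: left = 4, count = 2 -> left = 2
    // len 3: left = 4, count = 2 -> left = 2   (left > 0: IncompleteHuffmanTree)
    // One more length-2 code (or two more length-3 codes) would drive left
    // to 0, completing the code; count[len] exceeding left at any step would
    // mean OversubscribedHuffmanTree.
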
+
+test "init/find" {
+ // example data from: https://youtu.be/SJPvNi4HrWQ?t=8423
+ const code_lens = [_]u4{ 4, 3, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2 };
+ var h: CodegenDecoder = .{};
+ try h.generate(&code_lens);
+
+ const expected = [_]struct {
+ sym: Symbol,
+ code: u16,
+ }{
+ .{
+ .code = 0b00_00000,
+ .sym = .{ .symbol = 3, .code_bits = 2 },
+ },
+ .{
+ .code = 0b01_00000,
+ .sym = .{ .symbol = 18, .code_bits = 2 },
+ },
+ .{
+ .code = 0b100_0000,
+ .sym = .{ .symbol = 1, .code_bits = 3 },
+ },
+ .{
+ .code = 0b101_0000,
+ .sym = .{ .symbol = 4, .code_bits = 3 },
+ },
+ .{
+ .code = 0b110_0000,
+ .sym = .{ .symbol = 17, .code_bits = 3 },
+ },
+ .{
+ .code = 0b1110_000,
+ .sym = .{ .symbol = 0, .code_bits = 4 },
+ },
+ .{
+ .code = 0b1111_000,
+ .sym = .{ .symbol = 16, .code_bits = 4 },
+ },
+ };
+
+ // unused symbols
+ for (0..12) |i| {
+ try testing.expectEqual(0, h.symbols[i].code_bits);
+ }
+ // used, from index 12
+ for (expected, 12..) |e, i| {
+ try testing.expectEqual(e.sym.symbol, h.symbols[i].symbol);
+ try testing.expectEqual(e.sym.code_bits, h.symbols[i].code_bits);
+ const sym_from_code = try h.find(e.code);
+ try testing.expectEqual(e.sym.symbol, sym_from_code.symbol);
+ }
+
+ // All possible codes for each symbol.
+ // The lookup table has 128 elements, to cover all possible 7 bit codes.
+ for (0b0000_000..0b0100_000) |c| // 0..32 (32)
+ try testing.expectEqual(3, (try h.find(@intCast(c))).symbol);
+
+ for (0b0100_000..0b1000_000) |c| // 32..64 (32)
+ try testing.expectEqual(18, (try h.find(@intCast(c))).symbol);
+
+ for (0b1000_000..0b1010_000) |c| // 64..80 (16)
+ try testing.expectEqual(1, (try h.find(@intCast(c))).symbol);
+
+ for (0b1010_000..0b1100_000) |c| // 80..96 (16)
+ try testing.expectEqual(4, (try h.find(@intCast(c))).symbol);
+
+ for (0b1100_000..0b1110_000) |c| // 96..112 (16)
+ try testing.expectEqual(17, (try h.find(@intCast(c))).symbol);
+
+ for (0b1110_000..0b1111_000) |c| // 112..120 (8)
+ try testing.expectEqual(0, (try h.find(@intCast(c))).symbol);
+
+ for (0b1111_000..0b1_0000_000) |c| // 120..128 (8)
+ try testing.expectEqual(16, (try h.find(@intCast(c))).symbol);
+}
+
+test "encode/decode literals" {
+ const LiteralEncoder = std.compress.flate.Compress.LiteralEncoder;
+
+ for (1..286) |j| { // for all different number of codes
+ var enc: LiteralEncoder = .{};
+ // create frequencies
+ var freq = [_]u16{0} ** 286;
+ freq[256] = 1; // ensure we have end of block code
+ for (&freq, 1..) |*f, i| {
+ if (i % j == 0)
+ f.* = @intCast(i);
+ }
+
+ // encoder from frequencies
+ enc.generate(&freq, 15);
+
+ // get code_lens from encoder
+ var code_lens = [_]u4{0} ** 286;
+ for (code_lens, 0..) |_, i| {
+ code_lens[i] = @intCast(enc.codes[i].len);
+ }
+ // generate decoder from code lens
+ var dec: LiteralDecoder = .{};
+ try dec.generate(&code_lens);
+
+ // expect decoder code to match original encoder code
+ for (dec.symbols) |s| {
+ if (s.code_bits == 0) continue;
+ const c_code: u16 = @bitReverse(@as(u15, @intCast(s.code)));
+ const symbol: u16 = switch (s.kind) {
+ .literal => s.symbol,
+ .end_of_block => 256,
+ .match => @as(u16, s.symbol) + 257,
+ };
+
+ const c = enc.codes[symbol];
+ try testing.expect(c.code == c_code);
+ }
+
+ // find each symbol by code
+ for (enc.codes) |c| {
+ if (c.len == 0) continue;
+
+ const s_code: u15 = @bitReverse(@as(u15, @intCast(c.code)));
+ const s = try dec.find(s_code);
+ try testing.expect(s.code == s_code);
+ try testing.expect(s.code_bits == c.len);
+ }
+ }
+}
+
+test "decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit + type, len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ },
+ .out = "Hello world\n",
+ },
+ // fixed code block (type 1)
+ .{
+ .in = &[_]u8{
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
+ 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
+ },
+ .out = "Hello world\n",
+ },
+ // dynamic block (type 2)
+ .{
+ .in = &[_]u8{
+ 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
+ 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
+ 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
+ },
+ .out = "ABCDEABCD ABCDEABCD",
+ },
+ };
+ for (cases) |c| {
+ var fb: Reader = .fixed(c.in);
+ var aw: Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
+
+ var decompress: Decompress = .init(&fb, .raw, &.{});
+ const r = &decompress.reader;
+ _ = try r.streamRemaining(&aw.writer);
+ try testing.expectEqualStrings(c.out, aw.getWritten());
+ }
+}
+
+test "gzip decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit + type, len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ 0xd5, 0xe0, 0x39, 0xb7, // gzip footer: checksum
+ 0x0c, 0x00, 0x00, 0x00, // gzip footer: size
+ },
+ .out = "Hello world\n",
+ },
+ // fixed code block (type 1)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x03, // gzip header (10 bytes)
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
+ 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
+ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
+ },
+ .out = "Hello world\n",
+ },
+ // dynamic block (type 2)
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
+ 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
+ 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
+ 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
+ 0x17, 0x1c, 0x39, 0xb4, 0x13, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
+ },
+ .out = "ABCDEABCD ABCDEABCD",
+ },
+ // gzip header with name
+ .{
+ .in = &[_]u8{
+ 0x1f, 0x8b, 0x08, 0x08, 0xe5, 0x70, 0xb1, 0x65, 0x00, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x2e,
+ 0x74, 0x78, 0x74, 0x00, 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0xe1,
+ 0x02, 0x00, 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00,
+ },
+ .out = "Hello world\n",
+ },
+ };
+ for (cases) |c| {
+ var fb: Reader = .fixed(c.in);
+ var aw: Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
+
+ var decompress: Decompress = .init(&fb, .gzip, &.{});
+ const r = &decompress.reader;
+ _ = try r.streamRemaining(&aw.writer);
+ try testing.expectEqualStrings(c.out, aw.getWritten());
+ }
+}
+
+test "zlib decompress" {
+ const cases = [_]struct {
+ in: []const u8,
+ out: []const u8,
+ }{
+ // non compressed block (type 0)
+ .{
+ .in = &[_]u8{
+ 0x78, 0b10_0_11100, // zlib header (2 bytes)
+ 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit + type, len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ 0x1c, 0xf2, 0x04, 0x47, // zlib footer: checksum
+ },
+ .out = "Hello world\n",
+ },
+ };
+ for (cases) |c| {
+ var fb: Reader = .fixed(c.in);
+ var aw: Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
+
+ var decompress: Decompress = .init(&fb, .zlib, &.{});
+ const r = &decompress.reader;
+ _ = try r.streamRemaining(&aw.writer);
+ try testing.expectEqualStrings(c.out, aw.getWritten());
+ }
+}
+
+test "fuzzing tests" {
+ const cases = [_]struct {
+ input: []const u8,
+ out: []const u8 = "",
+ err: ?anyerror = null,
+ }{
+ .{ .input = "deflate-stream", .out = @embedFile("testdata/fuzz/deflate-stream.expect") }, // 0
+ .{ .input = "empty-distance-alphabet01" },
+ .{ .input = "empty-distance-alphabet02" },
+ .{ .input = "end-of-stream", .err = error.EndOfStream },
+ .{ .input = "invalid-distance", .err = error.InvalidMatch },
+ .{ .input = "invalid-tree01", .err = error.IncompleteHuffmanTree }, // 5
+ .{ .input = "invalid-tree02", .err = error.IncompleteHuffmanTree },
+ .{ .input = "invalid-tree03", .err = error.IncompleteHuffmanTree },
+ .{ .input = "lengths-overflow", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "out-of-codes", .err = error.InvalidCode },
+ .{ .input = "puff01", .err = error.WrongStoredBlockNlen }, // 10
+ .{ .input = "puff02", .err = error.EndOfStream },
+ .{ .input = "puff03", .out = &[_]u8{0xa} },
+ .{ .input = "puff04", .err = error.InvalidCode },
+ .{ .input = "puff05", .err = error.EndOfStream },
+ .{ .input = "puff06", .err = error.EndOfStream },
+ .{ .input = "puff08", .err = error.InvalidCode },
+ .{ .input = "puff09", .out = "P" },
+ .{ .input = "puff10", .err = error.InvalidCode },
+ .{ .input = "puff11", .err = error.InvalidMatch },
+ .{ .input = "puff12", .err = error.InvalidDynamicBlockHeader }, // 20
+ .{ .input = "puff13", .err = error.IncompleteHuffmanTree },
+ .{ .input = "puff14", .err = error.EndOfStream },
+ .{ .input = "puff15", .err = error.IncompleteHuffmanTree },
+ .{ .input = "puff16", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "puff17", .err = error.MissingEndOfBlockCode }, // 25
+ .{ .input = "fuzz1", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "fuzz2", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "fuzz3", .err = error.InvalidMatch },
+ .{ .input = "fuzz4", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff18", .err = error.OversubscribedHuffmanTree }, // 30
+ .{ .input = "puff19", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff20", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff21", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff22", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff23", .err = error.OversubscribedHuffmanTree }, // 35
+ .{ .input = "puff24", .err = error.IncompleteHuffmanTree },
+ .{ .input = "puff25", .err = error.OversubscribedHuffmanTree },
+ .{ .input = "puff26", .err = error.InvalidDynamicBlockHeader },
+ .{ .input = "puff27", .err = error.InvalidDynamicBlockHeader },
+ };
+
+ inline for (cases, 0..) |c, case_no| {
+ var in: Reader = .fixed(@embedFile("testdata/fuzz/" ++ c.input ++ ".input"));
+ var aw: Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
+ errdefer std.debug.print("test case failed {}\n", .{case_no});
+
+ var decompress: Decompress = .init(&in, .raw, &.{});
+ const r = &decompress.reader;
+ if (c.err) |expected_err| {
+ try testing.expectError(error.ReadFailed, r.streamRemaining(&aw.writer));
+ try testing.expectError(expected_err, decompress.read_err.?);
+ } else {
+ _ = try r.streamRemaining(&aw.writer);
+ try testing.expectEqualStrings(c.out, aw.getWritten());
+ }
+ }
+}
+
+test "bug 18966" {
+ const input = @embedFile("testdata/fuzz/bug_18966.input");
+ const expect = @embedFile("testdata/fuzz/bug_18966.expect");
+
+ var in: Reader = .fixed(input);
+ var aw: Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
+
+ var decompress: Decompress = .init(&in, .gzip, &.{});
+ const r = &decompress.reader;
+ _ = try r.streamRemaining(&aw.writer);
+ try testing.expectEqualStrings(expect, aw.getWritten());
+}
+
+test "reading into empty buffer" {
+ // Inspired by https://github.com/ziglang/zig/issues/19895
+ const input = &[_]u8{
0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit + type, len, nlen
+ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
+ };
+ var in: Reader = .fixed(input);
+ var decomp: Decompress = .init(&in, .raw, &.{});
+ const r = &decomp.reader;
+ var buf: [0]u8 = undefined;
+ try testing.expectEqual(0, try r.readVec(&.{&buf}));
+}
lib/std/compress/flate/deflate.zig
@@ -1,744 +0,0 @@
-const std = @import("std");
-const io = std.io;
-const assert = std.debug.assert;
-const testing = std.testing;
-const expect = testing.expect;
-const print = std.debug.print;
-
-const Token = @import("Token.zig");
-const consts = @import("consts.zig");
-const BlockWriter = @import("block_writer.zig").BlockWriter;
-const Container = @import("container.zig").Container;
-const SlidingWindow = @import("SlidingWindow.zig");
-const Lookup = @import("Lookup.zig");
-
-pub const Options = struct {
- level: Level = .default,
-};
-
-/// Trades between speed and compression size.
-/// Starts with level 4: in [zlib](https://github.com/madler/zlib/blob/abd3d1a28930f89375d4b41408b39f6c1be157b2/deflate.c#L115C1-L117C43)
-/// levels 1-3 are using different algorithm to perform faster but with less
-/// compression. That is not implemented here.
-pub const Level = enum(u4) {
- // zig fmt: off
- fast = 0xb, level_4 = 4,
- level_5 = 5,
- default = 0xc, level_6 = 6,
- level_7 = 7,
- level_8 = 8,
- best = 0xd, level_9 = 9,
- // zig fmt: on
-};
-
-/// Algorithm knobs for each level.
-const LevelArgs = struct {
- good: u16, // Do fewer lookups if we already have a match of this length.
- nice: u16, // Stop looking for a better match if we found one of at least this length.
- lazy: u16, // Don't do lazy match finding if we got a match of at least this length.
- chain: u16, // How many lookups for a previous match to perform.
-
- pub fn get(level: Level) LevelArgs {
- // zig fmt: off
- return switch (level) {
- .fast, .level_4 => .{ .good = 4, .lazy = 4, .nice = 16, .chain = 16 },
- .level_5 => .{ .good = 8, .lazy = 16, .nice = 32, .chain = 32 },
- .default, .level_6 => .{ .good = 8, .lazy = 16, .nice = 128, .chain = 128 },
- .level_7 => .{ .good = 8, .lazy = 32, .nice = 128, .chain = 256 },
- .level_8 => .{ .good = 32, .lazy = 128, .nice = 258, .chain = 1024 },
- .best, .level_9 => .{ .good = 32, .lazy = 258, .nice = 258, .chain = 4096 },
- };
- // zig fmt: on
- }
-};
-
-/// Compress plain data from reader into compressed stream written to writer.
-pub fn compress(comptime container: Container, reader: anytype, writer: anytype, options: Options) !void {
- var c = try compressor(container, writer, options);
- try c.compress(reader);
- try c.finish();
-}
-
-/// Create compressor for writer type.
-pub fn compressor(comptime container: Container, writer: anytype, options: Options) !Compressor(
- container,
- @TypeOf(writer),
-) {
- return try Compressor(container, @TypeOf(writer)).init(writer, options);
-}
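
A usage sketch of this (now removed) streaming API, with `allocator` assumed:

    var compressed = std.ArrayList(u8).init(allocator);
    defer compressed.deinit();
    var cmp = try compressor(.gzip, compressed.writer(), .{ .level = .best });
    _ = try cmp.write("payload"); // returns number of bytes consumed
    try cmp.finish(); // writes the final block and the gzip footer
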
-
-/// Compressor type.
-pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
- const TokenWriterType = BlockWriter(WriterType);
- return Deflate(container, WriterType, TokenWriterType);
-}
-
-/// Default compression algorithm. Has two steps: tokenization and token
-/// encoding.
-///
-/// Tokenization takes the uncompressed input stream and produces a list of
-/// tokens. Each token can be a literal (byte of data) or a match
-/// (back-reference to previous data, with length and distance). Tokenization
-/// accumulates 32K tokens; when full, or when `flush` is called, the tokens
-/// are passed to the `block_writer`. Level defines how hard (how slow) it
-/// tries to find a match.
-///
-/// Block writer will decide which type of deflate block to write (stored, fixed,
-/// dynamic) and encode tokens to the output byte stream. Client has to call
-/// `finish` to write block with the final bit set.
-///
-/// Container defines the type of header and footer, which can be gzip, zlib
-/// or raw. They all share the same deflate body. Raw has no header or footer,
-/// just the deflate body.
-///
-/// Compression algorithm explained in rfc-1951 (slightly edited for this case):
-///
-/// The compressor uses a chained hash table `lookup` to find duplicated
-/// strings, using a hash function that operates on 4-byte sequences. At any
-/// given point during compression, let XYZW be the next 4 input bytes
-/// (lookahead) to be examined (not necessarily all different, of course).
-/// First, the compressor examines the hash chain for XYZW. If the chain is
-/// empty, the compressor simply writes out X as a literal byte and advances
-/// one byte in the input. If the hash chain is not empty, indicating that the
-/// sequence XYZW (or, if we are unlucky, some other 4 bytes with the same
-/// hash function value) has occurred recently, the compressor compares all
-/// strings on the XYZW hash chain with the actual input data sequence
-/// starting at the current point, and selects the longest match.
-///
-/// To improve overall compression, the compressor defers the selection of
-/// matches ("lazy matching"): after a match of length N has been found, the
-/// compressor searches for a longer match starting at the next input byte. If
-/// it finds a longer match, it truncates the previous match to a length of
-/// one (thus producing a single literal byte) and then emits the longer
-/// match. Otherwise, it emits the original match, and, as described above,
-/// advances N bytes before continuing.
-///
-///
-/// Allocates statically ~400K (192K lookup, 128K tokens, 64K window).
-///
-/// The Deflate function accepts BlockWriterType so we can swap it out in
-/// tests to exercise just the tokenization part.
-///
-fn Deflate(comptime container: Container, comptime WriterType: type, comptime BlockWriterType: type) type {
- return struct {
- lookup: Lookup = .{},
- win: SlidingWindow = .{},
- tokens: Tokens = .{},
- wrt: WriterType,
- block_writer: BlockWriterType,
- level: LevelArgs,
- hasher: container.Hasher() = .{},
-
- // Match and literal at the previous position.
- // Used for lazy match finding in processWindow.
- prev_match: ?Token = null,
- prev_literal: ?u8 = null,
-
- const Self = @This();
-
- pub fn init(wrt: WriterType, options: Options) !Self {
- const self = Self{
- .wrt = wrt,
- .block_writer = BlockWriterType.init(wrt),
- .level = LevelArgs.get(options.level),
- };
- try container.writeHeader(self.wrt);
- return self;
- }
-
- const FlushOption = enum { none, flush, final };
-
- // Process data in the window and create tokens. If the token buffer is
- // full, flush tokens to the token writer. In the case of the `flush` or
- // `final` option it will process all data from the window. In the `none`
- // case it will preserve some data for the next match.
- fn tokenize(self: *Self, flush_opt: FlushOption) !void {
- // flush - process all data from window
- const should_flush = (flush_opt != .none);
-
- // While there is data in active lookahead buffer.
- while (self.win.activeLookahead(should_flush)) |lh| {
- var step: u16 = 1; // 1 in the case of literal, match length otherwise
- const pos: u16 = self.win.pos();
- const literal = lh[0]; // literal at current position
- const min_len: u16 = if (self.prev_match) |m| m.length() else 0;
-
- // Try to find match at least min_len long.
- if (self.findMatch(pos, lh, min_len)) |match| {
- // Found better match than previous.
- try self.addPrevLiteral();
-
- // Is found match length good enough?
- if (match.length() >= self.level.lazy) {
- // Don't try to lazy find better match, use this.
- step = try self.addMatch(match);
- } else {
- // Store this match.
- self.prev_literal = literal;
- self.prev_match = match;
- }
- } else {
- // There is no better match at the current position than at the previous one.
- // Write previous match or literal.
- if (self.prev_match) |m| {
- // Write match from previous position.
- step = try self.addMatch(m) - 1; // we already advanced 1 from previous position
- } else {
- // No match at previous position.
- // Write previous literal if any, and remember this literal.
- try self.addPrevLiteral();
- self.prev_literal = literal;
- }
- }
- // Advance window and add hashes.
- self.windowAdvance(step, lh, pos);
- }
-
- if (should_flush) {
- // In the case of flushing, the last few lookahead buffers were smaller than the min match len.
- // So only last literal can be unwritten.
- assert(self.prev_match == null);
- try self.addPrevLiteral();
- self.prev_literal = null;
-
- try self.flushTokens(flush_opt);
- }
- }
-
- fn windowAdvance(self: *Self, step: u16, lh: []const u8, pos: u16) void {
- // current position is already added in findMatch
- self.lookup.bulkAdd(lh[1..], step - 1, pos + 1);
- self.win.advance(step);
- }
-
- // Add previous literal (if any) to the tokens list.
- fn addPrevLiteral(self: *Self) !void {
- if (self.prev_literal) |l| try self.addToken(Token.initLiteral(l));
- }
-
- // Add match to the tokens list, reset prev pointers.
- // Returns length of the added match.
- fn addMatch(self: *Self, m: Token) !u16 {
- try self.addToken(m);
- self.prev_literal = null;
- self.prev_match = null;
- return m.length();
- }
-
- fn addToken(self: *Self, token: Token) !void {
- self.tokens.add(token);
- if (self.tokens.full()) try self.flushTokens(.none);
- }
-
- // Finds largest match in the history window with the data at current pos.
- fn findMatch(self: *Self, pos: u16, lh: []const u8, min_len: u16) ?Token {
- var len: u16 = min_len;
- // Previous location with the same hash (same 4 bytes).
- var prev_pos = self.lookup.add(lh, pos);
- // Last found match.
- var match: ?Token = null;
-
- // How many back-references to try; performance knob.
- var chain: usize = self.level.chain;
- if (len >= self.level.good) {
- // If we've got a match that's good enough, only look in 1/4 the chain.
- chain >>= 2;
- }
-
- // Hot path loop!
- while (prev_pos > 0 and chain > 0) : (chain -= 1) {
- const distance = pos - prev_pos;
- if (distance > consts.match.max_distance)
- break;
-
- const new_len = self.win.match(prev_pos, pos, len);
- if (new_len > len) {
- match = Token.initMatch(@intCast(distance), new_len);
- if (new_len >= self.level.nice) {
- // The match is good enough that we don't try to find a better one.
- return match;
- }
- len = new_len;
- }
- prev_pos = self.lookup.prev(prev_pos);
- }
-
- return match;
- }
-
- fn flushTokens(self: *Self, flush_opt: FlushOption) !void {
- // Pass tokens to the token writer
- try self.block_writer.write(self.tokens.tokens(), flush_opt == .final, self.win.tokensBuffer());
- // Stored block ensures byte alignment.
- // It has 3 bits (final, block_type) and then padding until byte boundary.
- // After that everything is aligned to the boundary in the stored block.
- // Empty stored block is Ob000 + (0-7) bits of padding + 0x00 0x00 0xFF 0xFF.
- // Last 4 bytes are byte aligned.
- if (flush_opt == .flush) {
- try self.block_writer.storedBlock("", false);
- }
- if (flush_opt != .none) {
- // Safe to call only when byte aligned or it is OK to add
- // padding bits (on last byte of the final block).
- try self.block_writer.flush();
- }
- // Reset internal tokens store.
- self.tokens.reset();
- // Notify win that tokens are flushed.
- self.win.flush();
- }
-
- // Slide win and if needed lookup tables.
- fn slide(self: *Self) void {
- const n = self.win.slide();
- self.lookup.slide(n);
- }
-
- /// Compresses as much data as possible, stops when the reader becomes
- /// empty. It will introduce some output latency (reading input without
- /// producing all output) because some data are still in internal
- /// buffers.
- ///
- /// It is up to the caller to call flush (if needed) or finish (required)
- /// when there is a need to output any pending data or to complete the stream.
- ///
- pub fn compress(self: *Self, reader: anytype) !void {
- while (true) {
- // Fill window from reader
- const buf = self.win.writable();
- if (buf.len == 0) {
- try self.tokenize(.none);
- self.slide();
- continue;
- }
- const n = try reader.readAll(buf);
- self.hasher.update(buf[0..n]);
- self.win.written(n);
- // Process window
- try self.tokenize(.none);
- // Exit when no more data in reader
- if (n < buf.len) break;
- }
- }
-
- /// Flushes internal buffers to the output writer. Outputs empty stored
- /// block to sync bit stream to the byte boundary, so that the
- /// decompressor can get all input data available so far.
- ///
- /// It is useful mainly in compressed network protocols, to ensure that
- /// the deflate bit stream can be used as a byte stream. May degrade
- /// compression, so it should be used only when necessary.
- ///
- /// Completes the current deflate block and follows it with an empty
- /// stored block that is three zero bits plus filler bits to the next
- /// byte, followed by four bytes (00 00 ff ff).
- ///
- pub fn flush(self: *Self) !void {
- try self.tokenize(.flush);
- }
-
- /// Completes the deflate bit stream by writing any pending data as the
- /// final deflate block. HAS to be called once all data is written to the
- /// compressor, as a signal that the next block has to have the final bit
- /// set.
- ///
- pub fn finish(self: *Self) !void {
- try self.tokenize(.final);
- try container.writeFooter(&self.hasher, self.wrt);
- }
-
- /// Use another writer while preserving history. Most probably flush
- /// should be called on the old writer before setting the new one.
- pub fn setWriter(self: *Self, new_writer: WriterType) void {
- self.block_writer.setWriter(new_writer);
- self.wrt = new_writer;
- }
-
- // Writer interface
-
- pub const Writer = io.GenericWriter(*Self, Error, write);
- pub const Error = BlockWriterType.Error;
-
- /// Write `input` of uncompressed data.
- /// See compress.
- pub fn write(self: *Self, input: []const u8) !usize {
- var fbs = io.fixedBufferStream(input);
- try self.compress(fbs.reader());
- return input.len;
- }
-
- pub fn writer(self: *Self) Writer {
- return .{ .context = self };
- }
- };
-}
-
-// Tokens store
-const Tokens = struct {
- list: [consts.deflate.tokens]Token = undefined,
- pos: usize = 0,
-
- fn add(self: *Tokens, t: Token) void {
- self.list[self.pos] = t;
- self.pos += 1;
- }
-
- fn full(self: *Tokens) bool {
- return self.pos == self.list.len;
- }
-
- fn reset(self: *Tokens) void {
- self.pos = 0;
- }
-
- fn tokens(self: *Tokens) []const Token {
- return self.list[0..self.pos];
- }
-};
-
-/// Creates huffman-only deflate blocks. Disables Lempel-Ziv match searching
-/// and only performs Huffman entropy encoding. Results in faster compression
-/// and much lower memory requirements, but bigger compressed sizes.
-pub const huffman = struct {
- pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
- var c = try huffman.compressor(container, writer);
- try c.compress(reader);
- try c.finish();
- }
-
- pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
- return SimpleCompressor(.huffman, container, WriterType);
- }
-
- pub fn compressor(comptime container: Container, writer: anytype) !huffman.Compressor(container, @TypeOf(writer)) {
- return try huffman.Compressor(container, @TypeOf(writer)).init(writer);
- }
-};
-
-/// Creates stored blocks only. Data is not compressed, only packed into
-/// deflate stored blocks. That adds 9 bytes of header for each block. Max
-/// stored block size is 64K. A block is emitted when flush or finish is called.
-pub const store = struct {
- pub fn compress(comptime container: Container, reader: anytype, writer: anytype) !void {
- var c = try store.compressor(container, writer);
- try c.compress(reader);
- try c.finish();
- }
-
- pub fn Compressor(comptime container: Container, comptime WriterType: type) type {
- return SimpleCompressor(.store, container, WriterType);
- }
-
- pub fn compressor(comptime container: Container, writer: anytype) !store.Compressor(container, @TypeOf(writer)) {
- return try store.Compressor(container, @TypeOf(writer)).init(writer);
- }
-};
-
-const SimpleCompressorKind = enum {
- huffman,
- store,
-};
-
-fn simpleCompressor(
- comptime kind: SimpleCompressorKind,
- comptime container: Container,
- writer: anytype,
-) !SimpleCompressor(kind, container, @TypeOf(writer)) {
- return try SimpleCompressor(kind, container, @TypeOf(writer)).init(writer);
-}
-
-fn SimpleCompressor(
- comptime kind: SimpleCompressorKind,
- comptime container: Container,
- comptime WriterType: type,
-) type {
- const BlockWriterType = BlockWriter(WriterType);
- return struct {
- buffer: [65535]u8 = undefined, // because store blocks are limited to 65535 bytes
- wp: usize = 0,
-
- wrt: WriterType,
- block_writer: BlockWriterType,
- hasher: container.Hasher() = .{},
-
- const Self = @This();
-
- pub fn init(wrt: WriterType) !Self {
- const self = Self{
- .wrt = wrt,
- .block_writer = BlockWriterType.init(wrt),
- };
- try container.writeHeader(self.wrt);
- return self;
- }
-
- pub fn flush(self: *Self) !void {
- try self.flushBuffer(false);
- try self.block_writer.storedBlock("", false);
- try self.block_writer.flush();
- }
-
- pub fn finish(self: *Self) !void {
- try self.flushBuffer(true);
- try self.block_writer.flush();
- try container.writeFooter(&self.hasher, self.wrt);
- }
-
- fn flushBuffer(self: *Self, final: bool) !void {
- const buf = self.buffer[0..self.wp];
- switch (kind) {
- .huffman => try self.block_writer.huffmanBlock(buf, final),
- .store => try self.block_writer.storedBlock(buf, final),
- }
- self.wp = 0;
- }
-
- // Writes all data from the input reader of uncompressed data.
- // It is up to the caller to call flush or finish when there is a
- // need to output the compressed blocks.
- pub fn compress(self: *Self, reader: anytype) !void {
- while (true) {
- // read from rdr into buffer
- const buf = self.buffer[self.wp..];
- if (buf.len == 0) {
- try self.flushBuffer(false);
- continue;
- }
- const n = try reader.readAll(buf);
- self.hasher.update(buf[0..n]);
- self.wp += n;
- if (n < buf.len) break; // no more data in reader
- }
- }
-
- // Writer interface
-
- pub const Writer = io.GenericWriter(*Self, Error, write);
- pub const Error = BlockWriterType.Error;
-
- // Write `input` of uncompressed data.
- pub fn write(self: *Self, input: []const u8) !usize {
- var fbs = io.fixedBufferStream(input);
- try self.compress(fbs.reader());
- return input.len;
- }
-
- pub fn writer(self: *Self) Writer {
- return .{ .context = self };
- }
- };
-}
-
-const builtin = @import("builtin");
-
-test "tokenization" {
- const L = Token.initLiteral;
- const M = Token.initMatch;
-
- const cases = [_]struct {
- data: []const u8,
- tokens: []const Token,
- }{
- .{
- .data = "Blah blah blah blah blah!",
- .tokens = &[_]Token{ L('B'), L('l'), L('a'), L('h'), L(' '), L('b'), M(5, 18), L('!') },
- },
- .{
- .data = "ABCDEABCD ABCDEABCD",
- .tokens = &[_]Token{
- L('A'), L('B'), L('C'), L('D'), L('E'), L('A'), L('B'), L('C'), L('D'), L(' '),
- L('A'), M(10, 8),
- },
- },
- };
-
- for (cases) |c| {
- inline for (Container.list) |container| { // for each wrapping
-
- var cw = io.countingWriter(io.null_writer);
- const cww = cw.writer();
- var df = try Deflate(container, @TypeOf(cww), TestTokenWriter).init(cww, .{});
-
- _ = try df.write(c.data);
- try df.flush();
-
- // df.token_writer.show();
- try expect(df.block_writer.pos == c.tokens.len); // number of tokens written
- try testing.expectEqualSlices(Token, df.block_writer.get(), c.tokens); // tokens match
-
- try testing.expectEqual(container.headerSize(), cw.bytes_written);
- try df.finish();
- try testing.expectEqual(container.size(), cw.bytes_written);
- }
- }
-}
-
-// Tests that tokens written are equal to expected token list.
-const TestTokenWriter = struct {
- const Self = @This();
-
- pos: usize = 0,
- actual: [128]Token = undefined,
-
- pub fn init(_: anytype) Self {
- return .{};
- }
- pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
- for (tokens) |t| {
- self.actual[self.pos] = t;
- self.pos += 1;
- }
- }
-
- pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
-
- pub fn get(self: *Self) []Token {
- return self.actual[0..self.pos];
- }
-
- pub fn show(self: *Self) void {
- print("\n", .{});
- for (self.get()) |t| {
- t.show();
- }
- }
-
- pub fn flush(_: *Self) !void {}
-};
-
-test "file tokenization" {
- const levels = [_]Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
- const cases = [_]struct {
- data: []const u8, // uncompressed content
- // expected number of tokens produced in deflate tokenization
- tokens_count: [levels.len]usize = .{0} ** levels.len,
- }{
- .{
- .data = @embedFile("testdata/rfc1951.txt"),
- .tokens_count = .{ 7675, 7672, 7599, 7594, 7598, 7599 },
- },
-
- .{
- .data = @embedFile("testdata/block_writer/huffman-null-max.input"),
- .tokens_count = .{ 257, 257, 257, 257, 257, 257 },
- },
- .{
- .data = @embedFile("testdata/block_writer/huffman-pi.input"),
- .tokens_count = .{ 2570, 2564, 2564, 2564, 2564, 2564 },
- },
- .{
- .data = @embedFile("testdata/block_writer/huffman-text.input"),
- .tokens_count = .{ 235, 234, 234, 234, 234, 234 },
- },
- .{
- .data = @embedFile("testdata/fuzz/roundtrip1.input"),
- .tokens_count = .{ 333, 331, 331, 331, 331, 331 },
- },
- .{
- .data = @embedFile("testdata/fuzz/roundtrip2.input"),
- .tokens_count = .{ 334, 334, 334, 334, 334, 334 },
- },
- };
-
- for (cases) |case| { // for each case
- const data = case.data;
-
- for (levels, 0..) |level, i| { // for each compression level
- var original = io.fixedBufferStream(data);
-
- // buffer for decompressed data
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
- const writer = al.writer();
-
- // create compressor
- const WriterType = @TypeOf(writer);
- const TokenWriter = TokenDecoder(@TypeOf(writer));
- var cmp = try Deflate(.raw, WriterType, TokenWriter).init(writer, .{ .level = level });
-
- // Stream uncompressed `original` data to the compressor. It will
- // produce tokens list and pass that list to the TokenDecoder. This
- // TokenDecoder uses CircularBuffer from inflate to convert list of
- // tokens back to the uncompressed stream.
- try cmp.compress(original.reader());
- try cmp.flush();
- const expected_count = case.tokens_count[i];
- const actual = cmp.block_writer.tokens_count;
- if (expected_count == 0) {
- print("actual token count {d}\n", .{actual});
- } else {
- try testing.expectEqual(expected_count, actual);
- }
-
- try testing.expectEqual(data.len, al.items.len);
- try testing.expectEqualSlices(u8, data, al.items);
- }
- }
-}
-
-fn TokenDecoder(comptime WriterType: type) type {
- return struct {
- const CircularBuffer = @import("CircularBuffer.zig");
- hist: CircularBuffer = .{},
- wrt: WriterType,
- tokens_count: usize = 0,
-
- const Self = @This();
-
- pub fn init(wrt: WriterType) Self {
- return .{ .wrt = wrt };
- }
-
- pub fn write(self: *Self, tokens: []const Token, _: bool, _: ?[]const u8) !void {
- self.tokens_count += tokens.len;
- for (tokens) |t| {
- switch (t.kind) {
- .literal => self.hist.write(t.literal()),
- .match => try self.hist.writeMatch(t.length(), t.distance()),
- }
- if (self.hist.free() < 285) try self.flushWin();
- }
- try self.flushWin();
- }
-
- pub fn storedBlock(_: *Self, _: []const u8, _: bool) !void {}
-
- fn flushWin(self: *Self) !void {
- while (true) {
- const buf = self.hist.read();
- if (buf.len == 0) break;
- try self.wrt.writeAll(buf);
- }
- }
-
- pub fn flush(_: *Self) !void {}
- };
-}
-
-test "store simple compressor" {
- const data = "Hello world!";
- const expected = [_]u8{
- 0x1, // block type 0, final bit set
- 0xc, 0x0, // len = 12
- 0xf3, 0xff, // ~len
- 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', //
- //0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x21,
- };
-
- var fbs = std.io.fixedBufferStream(data);
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
-
- var cmp = try store.compressor(.raw, al.writer());
- try cmp.compress(fbs.reader());
- try cmp.finish();
- try testing.expectEqualSlices(u8, &expected, al.items);
-
- fbs.reset();
- try al.resize(0);
-
- // huffman-only compressor will also emit a stored block for this small sample
- var hc = try huffman.compressor(.raw, al.writer());
- try hc.compress(fbs.reader());
- try hc.finish();
- try testing.expectEqualSlices(u8, &expected, al.items);
-}
lib/std/compress/flate/huffman_decoder.zig
@@ -1,302 +0,0 @@
-const std = @import("std");
-const testing = std.testing;
-
-pub const Symbol = packed struct {
- pub const Kind = enum(u2) {
- literal,
- end_of_block,
- match,
- };
-
- symbol: u8 = 0, // symbol from alphabet
- code_bits: u4 = 0, // number of bits in code 0-15
- kind: Kind = .literal,
-
- code: u16 = 0, // huffman code of the symbol
- next: u16 = 0, // pointer to the next symbol in linked list
- // it is safe to use 0 as the null pointer; when sorted, index 0 has the shortest code and fits into the lookup table
-
- // Sorting less than function.
- pub fn asc(_: void, a: Symbol, b: Symbol) bool {
- if (a.code_bits == b.code_bits) {
- if (a.kind == b.kind) {
- return a.symbol < b.symbol;
- }
- return @intFromEnum(a.kind) < @intFromEnum(b.kind);
- }
- return a.code_bits < b.code_bits;
- }
-};
-
-pub const LiteralDecoder = HuffmanDecoder(286, 15, 9);
-pub const DistanceDecoder = HuffmanDecoder(30, 15, 9);
-pub const CodegenDecoder = HuffmanDecoder(19, 7, 7);
-
-pub const Error = error{
- InvalidCode,
- OversubscribedHuffmanTree,
- IncompleteHuffmanTree,
- MissingEndOfBlockCode,
-};
-
-/// Creates huffman tree codes from a list of code lengths (in `generate`).
-///
-/// `find` then finds the symbol for given code bits. A code can be any length
-/// between 1 and 15 bits. When calling `find` we don't know how many bits
-/// will be used to find the symbol. The returned symbol has a code_bits field
-/// which defines how far we should advance in the bit stream.
-///
-/// A lookup table is used to map a 15 bit int to a symbol. The same symbol is
-/// written many times in this table; 32K places for (at most) 286 symbols.
-/// The small lookup table is an optimization for faster search.
-/// It is a variation of the algorithm explained in [zlib](https://github.com/madler/zlib/blob/643e17b7498d12ab8d15565662880579692f769d/doc/algorithm.txt#L92)
-/// with the difference that here we use statically allocated arrays.
-///
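-/// For example, with lookup_bits = 9 a 15-bit peeked code is indexed by its
-/// top 9 bits (code >> 6); symbols whose codes are longer than 9 bits are
-/// reached through the per-prefix linked list stored in `symbols`.
-///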
-fn HuffmanDecoder(
- comptime alphabet_size: u16,
- comptime max_code_bits: u4,
- comptime lookup_bits: u4,
-) type {
- const lookup_shift = max_code_bits - lookup_bits;
-
- return struct {
- // all symbols in alphabet, sorted by (code_bits, kind, symbol)
- symbols: [alphabet_size]Symbol = undefined,
- // lookup table code -> symbol
- lookup: [1 << lookup_bits]Symbol = undefined,
-
- const Self = @This();
-
- /// Generates symbols and lookup tables from list of code lens for each symbol.
- pub fn generate(self: *Self, lens: []const u4) !void {
- try checkCompleteness(lens);
-
- // init alphabet with code_bits
- for (self.symbols, 0..) |_, i| {
- const cb: u4 = if (i < lens.len) lens[i] else 0;
- self.symbols[i] = if (i < 256)
- .{ .kind = .literal, .symbol = @intCast(i), .code_bits = cb }
- else if (i == 256)
- .{ .kind = .end_of_block, .symbol = 0xff, .code_bits = cb }
- else
- .{ .kind = .match, .symbol = @intCast(i - 257), .code_bits = cb };
- }
- std.sort.heap(Symbol, &self.symbols, {}, Symbol.asc);
-
- // reset lookup table
- for (0..self.lookup.len) |i| {
- self.lookup[i] = .{};
- }
-
- // assign code to symbols
- // reference: https://youtu.be/9_YEGLe33NA?list=PLU4IQLU9e_OrY8oASHx0u3IXAL9TOdidm&t=2639
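- // Example: code lengths {2, 2, 3, 3} yield the canonical codes 00, 01,
- // 100, 101; stored left-aligned to max_code_bits, each step advances
- // `code` by 1 << (max_code_bits - code_bits).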
- var code: u16 = 0;
- var idx: u16 = 0;
- for (&self.symbols, 0..) |*sym, pos| {
- if (sym.code_bits == 0) continue; // skip unused
- sym.code = code;
-
- const next_code = code + (@as(u16, 1) << (max_code_bits - sym.code_bits));
- const next_idx = next_code >> lookup_shift;
-
- if (next_idx > self.lookup.len or idx >= self.lookup.len) break;
- if (sym.code_bits <= lookup_bits) {
- // fill small lookup table
- for (idx..next_idx) |j|
- self.lookup[j] = sym.*;
- } else {
- // insert into linked table starting at root
- const root = &self.lookup[idx];
- const root_next = root.next;
- root.next = @intCast(pos);
- sym.next = root_next;
- }
-
- idx = next_idx;
- code = next_code;
- }
- }
-
- /// Given a list of code lengths, check that it represents a canonical
- /// Huffman code for n symbols.
- ///
- /// Reference: https://github.com/madler/zlib/blob/5c42a230b7b468dff011f444161c0145b5efae59/contrib/puff/puff.c#L340
- fn checkCompleteness(lens: []const u4) !void {
- if (alphabet_size == 286)
- if (lens[256] == 0) return error.MissingEndOfBlockCode;
-
- var count = [_]u16{0} ** (@as(usize, max_code_bits) + 1);
- var max: usize = 0;
- for (lens) |n| {
- if (n == 0) continue;
- if (n > max) max = n;
- count[n] += 1;
- }
- if (max == 0) // empty tree
- return;
-
- // check for an over-subscribed or incomplete set of lengths
- var left: usize = 1; // one possible code of zero length
- for (1..count.len) |len| {
- left <<= 1; // one more bit, double codes left
- if (count[len] > left)
- return error.OversubscribedHuffmanTree;
- left -= count[len]; // deduct count from possible codes
- }
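- // Example: lens {1, 2, 3, 3} is complete (left: 2-1=1, 2-1=1, 2-2=0),
- // while lens {1, 1, 2} would already be oversubscribed at length 2.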
- if (left > 0) { // left > 0 means incomplete
- // incomplete code ok only for single length 1 code
- if (max_code_bits > 7 and max == count[0] + count[1]) return;
- return error.IncompleteHuffmanTree;
- }
- }
-
- /// Finds symbol for lookup table code.
- pub fn find(self: *Self, code: u16) !Symbol {
- // try to find in lookup table
- const idx = code >> lookup_shift;
- const sym = self.lookup[idx];
- if (sym.code_bits != 0) return sym;
- // if not found, use the linked list of symbols with the same prefix
- return self.findLinked(code, sym.next);
- }
-
- inline fn findLinked(self: *Self, code: u16, start: u16) !Symbol {
- var pos = start;
- while (pos > 0) {
- const sym = self.symbols[pos];
- const shift = max_code_bits - sym.code_bits;
- // compare code_bits number of upper bits
- if ((code ^ sym.code) >> shift == 0) return sym;
- pos = sym.next;
- }
- return error.InvalidCode;
- }
- };
-}
-
-test "init/find" {
- // example data from: https://youtu.be/SJPvNi4HrWQ?t=8423
- const code_lens = [_]u4{ 4, 3, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 3, 2 };
- var h: CodegenDecoder = .{};
- try h.generate(&code_lens);
-
- const expected = [_]struct {
- sym: Symbol,
- code: u16,
- }{
- .{
- .code = 0b00_00000,
- .sym = .{ .symbol = 3, .code_bits = 2 },
- },
- .{
- .code = 0b01_00000,
- .sym = .{ .symbol = 18, .code_bits = 2 },
- },
- .{
- .code = 0b100_0000,
- .sym = .{ .symbol = 1, .code_bits = 3 },
- },
- .{
- .code = 0b101_0000,
- .sym = .{ .symbol = 4, .code_bits = 3 },
- },
- .{
- .code = 0b110_0000,
- .sym = .{ .symbol = 17, .code_bits = 3 },
- },
- .{
- .code = 0b1110_000,
- .sym = .{ .symbol = 0, .code_bits = 4 },
- },
- .{
- .code = 0b1111_000,
- .sym = .{ .symbol = 16, .code_bits = 4 },
- },
- };
-
- // unused symbols
- for (0..12) |i| {
- try testing.expectEqual(0, h.symbols[i].code_bits);
- }
- // used, from index 12
- for (expected, 12..) |e, i| {
- try testing.expectEqual(e.sym.symbol, h.symbols[i].symbol);
- try testing.expectEqual(e.sym.code_bits, h.symbols[i].code_bits);
- const sym_from_code = try h.find(e.code);
- try testing.expectEqual(e.sym.symbol, sym_from_code.symbol);
- }
-
- // All possible codes for each symbol.
- // Lookup table has 128 elements, to cover all possible 7 bit codes.
- for (0b0000_000..0b0100_000) |c| // 0..32 (32)
- try testing.expectEqual(3, (try h.find(@intCast(c))).symbol);
-
- for (0b0100_000..0b1000_000) |c| // 32..64 (32)
- try testing.expectEqual(18, (try h.find(@intCast(c))).symbol);
-
- for (0b1000_000..0b1010_000) |c| // 64..80 (16)
- try testing.expectEqual(1, (try h.find(@intCast(c))).symbol);
-
- for (0b1010_000..0b1100_000) |c| // 80..96 (16)
- try testing.expectEqual(4, (try h.find(@intCast(c))).symbol);
-
- for (0b1100_000..0b1110_000) |c| // 96..112 (16)
- try testing.expectEqual(17, (try h.find(@intCast(c))).symbol);
-
- for (0b1110_000..0b1111_000) |c| // 112..120 (8)
- try testing.expectEqual(0, (try h.find(@intCast(c))).symbol);
-
- for (0b1111_000..0b1_0000_000) |c| // 120...128 (8)
- try testing.expectEqual(16, (try h.find(@intCast(c))).symbol);
-}
-
-test "encode/decode literals" {
- const LiteralEncoder = @import("huffman_encoder.zig").LiteralEncoder;
-
- for (1..286) |j| { // for all different number of codes
- var enc: LiteralEncoder = .{};
- // create frequencies
- var freq = [_]u16{0} ** 286;
- freq[256] = 1; // ensure we have end of block code
- for (&freq, 1..) |*f, i| {
- if (i % j == 0)
- f.* = @intCast(i);
- }
-
- // encoder from frequencies
- enc.generate(&freq, 15);
-
- // get code_lens from encoder
- var code_lens = [_]u4{0} ** 286;
- for (code_lens, 0..) |_, i| {
- code_lens[i] = @intCast(enc.codes[i].len);
- }
- // generate decoder from code lens
- var dec: LiteralDecoder = .{};
- try dec.generate(&code_lens);
-
- // expect decoder code to match original encoder code
- for (dec.symbols) |s| {
- if (s.code_bits == 0) continue;
- const c_code: u16 = @bitReverse(@as(u15, @intCast(s.code)));
- const symbol: u16 = switch (s.kind) {
- .literal => s.symbol,
- .end_of_block => 256,
- .match => @as(u16, s.symbol) + 257,
- };
-
- const c = enc.codes[symbol];
- try testing.expect(c.code == c_code);
- }
-
- // find each symbol by code
- for (enc.codes) |c| {
- if (c.len == 0) continue;
-
- const s_code: u15 = @bitReverse(@as(u15, @intCast(c.code)));
- const s = try dec.find(s_code);
- try testing.expect(s.code == s_code);
- try testing.expect(s.code_bits == c.len);
- }
- }
-}
lib/std/compress/flate/huffman_encoder.zig
@@ -1,536 +0,0 @@
-const std = @import("std");
-const assert = std.debug.assert;
-const math = std.math;
-const mem = std.mem;
-const sort = std.sort;
-const testing = std.testing;
-
-const consts = @import("consts.zig").huffman;
-
-const LiteralNode = struct {
- literal: u16,
- freq: u16,
-};
-
-// Describes the state of the constructed tree for a given depth.
-const LevelInfo = struct {
- // Our level, for better printing
- level: u32,
-
- // The frequency of the last node at this level
- last_freq: u32,
-
- // The frequency of the next character to add to this level
- next_char_freq: u32,
-
- // The frequency of the next pair (from level below) to add to this level.
- // Only valid if the "needed" value of the next lower level is 0.
- next_pair_freq: u32,
-
- // The number of chains remaining to generate for this level before moving
- // up to the next level
- needed: u32,
-};
-
-// hcode is a huffman code with a bit code and bit length.
-pub const HuffCode = struct {
- code: u16 = 0,
- len: u16 = 0,
-
- // set sets the code and length of an hcode.
- fn set(self: *HuffCode, code: u16, length: u16) void {
- self.len = length;
- self.code = code;
- }
-};
-
-pub fn HuffmanEncoder(comptime size: usize) type {
- return struct {
- codes: [size]HuffCode = undefined,
- // Reusable buffer with the longest possible frequency table.
- freq_cache: [consts.max_num_frequencies + 1]LiteralNode = undefined,
- bit_count: [17]u32 = undefined,
- lns: []LiteralNode = undefined, // sorted by literal, stored to avoid repeated allocation in generate
- lfs: []LiteralNode = undefined, // sorted by frequency, stored to avoid repeated allocation in generate
-
- const Self = @This();
-
- // Update this Huffman Code object to be the minimum code for the specified frequency count.
- //
- // freq An array of frequencies, in which frequency[i] gives the frequency of literal i.
- // max_bits The maximum number of bits to use for any literal.
- pub fn generate(self: *Self, freq: []u16, max_bits: u32) void {
- var list = self.freq_cache[0 .. freq.len + 1];
- // Number of non-zero literals
- var count: u32 = 0;
- // Set list to be the set of all non-zero literals and their frequencies
- for (freq, 0..) |f, i| {
- if (f != 0) {
- list[count] = LiteralNode{ .literal = @as(u16, @intCast(i)), .freq = f };
- count += 1;
- } else {
- list[count] = LiteralNode{ .literal = 0x00, .freq = 0 };
- self.codes[i].len = 0;
- }
- }
- list[freq.len] = LiteralNode{ .literal = 0x00, .freq = 0 };
-
- list = list[0..count];
- if (count <= 2) {
- // Handle the small cases here, because they are awkward for the general case code. With
- // two or fewer literals, everything has bit length 1.
- for (list, 0..) |node, i| {
- // "list" is in order of increasing literal value.
- self.codes[node.literal].set(@as(u16, @intCast(i)), 1);
- }
- return;
- }
- self.lfs = list;
- mem.sort(LiteralNode, self.lfs, {}, byFreq);
-
- // Get the number of literals for each bit count
- const bit_count = self.bitCounts(list, max_bits);
- // And do the assignment
- self.assignEncodingAndSize(bit_count, list);
- }
-
- pub fn bitLength(self: *Self, freq: []u16) u32 {
- var total: u32 = 0;
- for (freq, 0..) |f, i| {
- if (f != 0) {
- total += @as(u32, @intCast(f)) * @as(u32, @intCast(self.codes[i].len));
- }
- }
- return total;
- }
-
- // Return the number of literals assigned to each bit size in the Huffman encoding
- //
- // This method is only called when list.len >= 3
- // The cases of 0, 1, and 2 literals are handled by special case code.
- //
- // list: An array of the literals with non-zero frequencies
- // and their associated frequencies. The array is in order of increasing
- // frequency.
- //
- // max_bits: The maximum number of bits that should be used to encode any literal.
- // Must be less than 16.
- //
- // Returns an integer array in which array[i] indicates the number of literals
- // that should be encoded in i bits.
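- //
- // Note: this level-based, length-limited construction mirrors the one in
- // Go's compress/flate, from which this encoder appears to be ported.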
- fn bitCounts(self: *Self, list: []LiteralNode, max_bits_to_use: usize) []u32 {
- var max_bits = max_bits_to_use;
- const n = list.len;
- const max_bits_limit = 16;
-
- assert(max_bits < max_bits_limit);
-
- // The tree can't have greater depth than n - 1, no matter what. This
- // saves a little bit of work in some small cases
- max_bits = @min(max_bits, n - 1);
-
- // Create information about each of the levels.
- // A bogus "Level 0" whose sole purpose is so that
- // level1.prev.needed == 0. This makes level1.next_pair_freq
- // be a legitimate value that never gets chosen.
- var levels: [max_bits_limit]LevelInfo = mem.zeroes([max_bits_limit]LevelInfo);
- // leaf_counts[i] counts the number of literals at the left
- // of ancestors of the rightmost node at level i.
- // leaf_counts[i][j] is the number of literals at the left
- // of the level j ancestor.
- var leaf_counts: [max_bits_limit][max_bits_limit]u32 = mem.zeroes([max_bits_limit][max_bits_limit]u32);
-
- {
- var level = @as(u32, 1);
- while (level <= max_bits) : (level += 1) {
- // For every level, the first two items are the first two characters.
- // We initialize the levels as if we had already figured this out.
- levels[level] = LevelInfo{
- .level = level,
- .last_freq = list[1].freq,
- .next_char_freq = list[2].freq,
- .next_pair_freq = list[0].freq + list[1].freq,
- .needed = 0,
- };
- leaf_counts[level][level] = 2;
- if (level == 1) {
- levels[level].next_pair_freq = math.maxInt(i32);
- }
- }
- }
-
- // We need a total of 2*n - 2 items at top level and have already generated 2.
- levels[max_bits].needed = 2 * @as(u32, @intCast(n)) - 4;
-
- {
- var level = max_bits;
- while (true) {
- var l = &levels[level];
- if (l.next_pair_freq == math.maxInt(i32) and l.next_char_freq == math.maxInt(i32)) {
- // We've run out of both leaves and pairs.
- // End all calculations for this level.
- // To make sure we never come back to this level or any lower level,
- // set next_pair_freq impossibly large.
- l.needed = 0;
- levels[level + 1].next_pair_freq = math.maxInt(i32);
- level += 1;
- continue;
- }
-
- const prev_freq = l.last_freq;
- if (l.next_char_freq < l.next_pair_freq) {
- // The next item on this row is a leaf node.
- const next = leaf_counts[level][level] + 1;
- l.last_freq = l.next_char_freq;
- // Lower leaf_counts are the same as those of the previous node.
- leaf_counts[level][level] = next;
- if (next >= list.len) {
- l.next_char_freq = maxNode().freq;
- } else {
- l.next_char_freq = list[next].freq;
- }
- } else {
- // The next item on this row is a pair from the previous row.
- // next_pair_freq isn't valid until we generate two
- // more values in the level below
- l.last_freq = l.next_pair_freq;
- // Take leaf counts from the lower level, except counts[level] remains the same.
- @memcpy(leaf_counts[level][0..level], leaf_counts[level - 1][0..level]);
- levels[l.level - 1].needed = 2;
- }
-
- l.needed -= 1;
- if (l.needed == 0) {
- // We've done everything we need to do for this level.
- // Continue calculating one level up. Fill in next_pair_freq
- // of that level with the sum of the two nodes we've just calculated on
- // this level.
- if (l.level == max_bits) {
- // All done!
- break;
- }
- levels[l.level + 1].next_pair_freq = prev_freq + l.last_freq;
- level += 1;
- } else {
- // If we stole from below, move down temporarily to replenish it.
- while (levels[level - 1].needed > 0) {
- level -= 1;
- if (level == 0) {
- break;
- }
- }
- }
- }
- }
-
- // Something is wrong if, at the end, the top level hasn't used
- // all of the leaves.
- assert(leaf_counts[max_bits][max_bits] == n);
-
- var bit_count = self.bit_count[0 .. max_bits + 1];
- var bits: u32 = 1;
- const counts = &leaf_counts[max_bits];
- {
- var level = max_bits;
- while (level > 0) : (level -= 1) {
- // counts[level] gives the number of literals requiring at least "bits"
- // bits to encode.
- bit_count[bits] = counts[level] - counts[level - 1];
- bits += 1;
- if (level == 0) {
- break;
- }
- }
- }
- return bit_count;
- }
-
- // Look at the leaves and assign them a bit count and an encoding as specified
- // in RFC 1951 3.2.2
- fn assignEncodingAndSize(self: *Self, bit_count: []u32, list_arg: []LiteralNode) void {
- var code = @as(u16, 0);
- var list = list_arg;
-
- for (bit_count, 0..) |bits, n| {
- code <<= 1;
- if (n == 0 or bits == 0) {
- continue;
- }
- // The literals list[list.len-bits] .. list[list.len-1]
- // are encoded using "bits" bits, and get the values
- // code, code + 1, .... The code values are
- // assigned in literal order (not frequency order).
- const chunk = list[list.len - @as(u32, @intCast(bits)) ..];
-
- self.lns = chunk;
- mem.sort(LiteralNode, self.lns, {}, byLiteral);
-
- for (chunk) |node| {
- self.codes[node.literal] = HuffCode{
- .code = bitReverse(u16, code, @as(u5, @intCast(n))),
- .len = @as(u16, @intCast(n)),
- };
- code += 1;
- }
- list = list[0 .. list.len - @as(u32, @intCast(bits))];
- }
- }
- };
-}
-
-fn maxNode() LiteralNode {
- return LiteralNode{
- .literal = math.maxInt(u16),
- .freq = math.maxInt(u16),
- };
-}
-
-pub fn huffmanEncoder(comptime size: u32) HuffmanEncoder(size) {
- return .{};
-}
-
-pub const LiteralEncoder = HuffmanEncoder(consts.max_num_frequencies);
-pub const DistanceEncoder = HuffmanEncoder(consts.distance_code_count);
-pub const CodegenEncoder = HuffmanEncoder(19);
-
-// Generates a HuffmanCode corresponding to the fixed literal table
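-// (Per RFC 1951 3.2.6: literals 0-143 get 8-bit codes, 144-255 get 9 bits,
-// length codes 256-279 get 7 bits, and 280-287 get 8 bits.)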
-pub fn fixedLiteralEncoder() LiteralEncoder {
- var h: LiteralEncoder = undefined;
- var ch: u16 = 0;
-
- while (ch < consts.max_num_frequencies) : (ch += 1) {
- var bits: u16 = undefined;
- var size: u16 = undefined;
- switch (ch) {
- 0...143 => {
- // size 8, 00110000 .. 10111111
- bits = ch + 48;
- size = 8;
- },
- 144...255 => {
- // size 9, 110010000 .. 111111111
- bits = ch + 400 - 144;
- size = 9;
- },
- 256...279 => {
- // size 7, 0000000 .. 0010111
- bits = ch - 256;
- size = 7;
- },
- else => {
- // size 8, 11000000 .. 11000111
- bits = ch + 192 - 280;
- size = 8;
- },
- }
- h.codes[ch] = HuffCode{ .code = bitReverse(u16, bits, @as(u5, @intCast(size))), .len = size };
- }
- return h;
-}
-
-pub fn fixedDistanceEncoder() DistanceEncoder {
- var h: DistanceEncoder = undefined;
- for (h.codes, 0..) |_, ch| {
- h.codes[ch] = HuffCode{ .code = bitReverse(u16, @as(u16, @intCast(ch)), 5), .len = 5 };
- }
- return h;
-}
-
-pub fn huffmanDistanceEncoder() DistanceEncoder {
- var distance_freq = [1]u16{0} ** consts.distance_code_count;
- distance_freq[0] = 1;
- // huff_distance is a static distance encoder used for huffman only encoding.
- // It can be reused since we will not be encoding distance values.
- var h: DistanceEncoder = .{};
- h.generate(distance_freq[0..], 15);
- return h;
-}
-
-fn byLiteral(context: void, a: LiteralNode, b: LiteralNode) bool {
- _ = context;
- return a.literal < b.literal;
-}
-
-fn byFreq(context: void, a: LiteralNode, b: LiteralNode) bool {
- _ = context;
- if (a.freq == b.freq) {
- return a.literal < b.literal;
- }
- return a.freq < b.freq;
-}
-
-test "generate a Huffman code from an array of frequencies" {
- var freqs: [19]u16 = [_]u16{
- 8, // 0
- 1, // 1
- 1, // 2
- 2, // 3
- 5, // 4
- 10, // 5
- 9, // 6
- 1, // 7
- 0, // 8
- 0, // 9
- 0, // 10
- 0, // 11
- 0, // 12
- 0, // 13
- 0, // 14
- 0, // 15
- 1, // 16
- 3, // 17
- 5, // 18
- };
-
- var enc = huffmanEncoder(19);
- enc.generate(freqs[0..], 7);
-
- try testing.expectEqual(@as(u32, 141), enc.bitLength(freqs[0..]));
-
- try testing.expectEqual(@as(usize, 3), enc.codes[0].len);
- try testing.expectEqual(@as(usize, 6), enc.codes[1].len);
- try testing.expectEqual(@as(usize, 6), enc.codes[2].len);
- try testing.expectEqual(@as(usize, 5), enc.codes[3].len);
- try testing.expectEqual(@as(usize, 3), enc.codes[4].len);
- try testing.expectEqual(@as(usize, 2), enc.codes[5].len);
- try testing.expectEqual(@as(usize, 2), enc.codes[6].len);
- try testing.expectEqual(@as(usize, 6), enc.codes[7].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[8].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[9].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[10].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[11].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[12].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[13].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[14].len);
- try testing.expectEqual(@as(usize, 0), enc.codes[15].len);
- try testing.expectEqual(@as(usize, 6), enc.codes[16].len);
- try testing.expectEqual(@as(usize, 5), enc.codes[17].len);
- try testing.expectEqual(@as(usize, 3), enc.codes[18].len);
-
- try testing.expectEqual(@as(u16, 0x0), enc.codes[5].code);
- try testing.expectEqual(@as(u16, 0x2), enc.codes[6].code);
- try testing.expectEqual(@as(u16, 0x1), enc.codes[0].code);
- try testing.expectEqual(@as(u16, 0x5), enc.codes[4].code);
- try testing.expectEqual(@as(u16, 0x3), enc.codes[18].code);
- try testing.expectEqual(@as(u16, 0x7), enc.codes[3].code);
- try testing.expectEqual(@as(u16, 0x17), enc.codes[17].code);
- try testing.expectEqual(@as(u16, 0x0f), enc.codes[1].code);
- try testing.expectEqual(@as(u16, 0x2f), enc.codes[2].code);
- try testing.expectEqual(@as(u16, 0x1f), enc.codes[7].code);
- try testing.expectEqual(@as(u16, 0x3f), enc.codes[16].code);
-}
-
-test "generate a Huffman code for the fixed literal table specific to Deflate" {
- const enc = fixedLiteralEncoder();
- for (enc.codes) |c| {
- switch (c.len) {
- 7 => {
- const v = @bitReverse(@as(u7, @intCast(c.code)));
- try testing.expect(v <= 0b0010111);
- },
- 8 => {
- const v = @bitReverse(@as(u8, @intCast(c.code)));
- try testing.expect((v >= 0b00110000 and v <= 0b10111111) or
- (v >= 0b11000000 and v <= 0b11000111));
- },
- 9 => {
- const v = @bitReverse(@as(u9, @intCast(c.code)));
- try testing.expect(v >= 0b110010000 and v <= 0b111111111);
- },
- else => unreachable,
- }
- }
-}
-
-test "generate a Huffman code for the 30 possible relative distances (LZ77 distances) of Deflate" {
- const enc = fixedDistanceEncoder();
- for (enc.codes) |c| {
- const v = @bitReverse(@as(u5, @intCast(c.code)));
- try testing.expect(v <= 29);
- try testing.expect(c.len == 5);
- }
-}
-
-// Reverse bit-by-bit a N-bit code.
-fn bitReverse(comptime T: type, value: T, n: usize) T {
- const r = @bitReverse(value);
- return r >> @as(math.Log2Int(T), @intCast(@typeInfo(T).int.bits - n));
-}
-
-test bitReverse {
- const ReverseBitsTest = struct {
- in: u16,
- bit_count: u5,
- out: u16,
- };
-
- const reverse_bits_tests = [_]ReverseBitsTest{
- .{ .in = 1, .bit_count = 1, .out = 1 },
- .{ .in = 1, .bit_count = 2, .out = 2 },
- .{ .in = 1, .bit_count = 3, .out = 4 },
- .{ .in = 1, .bit_count = 4, .out = 8 },
- .{ .in = 1, .bit_count = 5, .out = 16 },
- .{ .in = 17, .bit_count = 5, .out = 17 },
- .{ .in = 257, .bit_count = 9, .out = 257 },
- .{ .in = 29, .bit_count = 5, .out = 23 },
- };
-
- for (reverse_bits_tests) |h| {
- const v = bitReverse(u16, h.in, h.bit_count);
- try std.testing.expectEqual(h.out, v);
- }
-}
-
-test "fixedLiteralEncoder codes" {
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
- var bw = std.io.bitWriter(.little, al.writer());
-
- const f = fixedLiteralEncoder();
- for (f.codes) |c| {
- try bw.writeBits(c.code, c.len);
- }
- try testing.expectEqualSlices(u8, &fixed_codes, al.items);
-}
-
-pub const fixed_codes = [_]u8{
- 0b00001100, 0b10001100, 0b01001100, 0b11001100, 0b00101100, 0b10101100, 0b01101100, 0b11101100,
- 0b00011100, 0b10011100, 0b01011100, 0b11011100, 0b00111100, 0b10111100, 0b01111100, 0b11111100,
- 0b00000010, 0b10000010, 0b01000010, 0b11000010, 0b00100010, 0b10100010, 0b01100010, 0b11100010,
- 0b00010010, 0b10010010, 0b01010010, 0b11010010, 0b00110010, 0b10110010, 0b01110010, 0b11110010,
- 0b00001010, 0b10001010, 0b01001010, 0b11001010, 0b00101010, 0b10101010, 0b01101010, 0b11101010,
- 0b00011010, 0b10011010, 0b01011010, 0b11011010, 0b00111010, 0b10111010, 0b01111010, 0b11111010,
- 0b00000110, 0b10000110, 0b01000110, 0b11000110, 0b00100110, 0b10100110, 0b01100110, 0b11100110,
- 0b00010110, 0b10010110, 0b01010110, 0b11010110, 0b00110110, 0b10110110, 0b01110110, 0b11110110,
- 0b00001110, 0b10001110, 0b01001110, 0b11001110, 0b00101110, 0b10101110, 0b01101110, 0b11101110,
- 0b00011110, 0b10011110, 0b01011110, 0b11011110, 0b00111110, 0b10111110, 0b01111110, 0b11111110,
- 0b00000001, 0b10000001, 0b01000001, 0b11000001, 0b00100001, 0b10100001, 0b01100001, 0b11100001,
- 0b00010001, 0b10010001, 0b01010001, 0b11010001, 0b00110001, 0b10110001, 0b01110001, 0b11110001,
- 0b00001001, 0b10001001, 0b01001001, 0b11001001, 0b00101001, 0b10101001, 0b01101001, 0b11101001,
- 0b00011001, 0b10011001, 0b01011001, 0b11011001, 0b00111001, 0b10111001, 0b01111001, 0b11111001,
- 0b00000101, 0b10000101, 0b01000101, 0b11000101, 0b00100101, 0b10100101, 0b01100101, 0b11100101,
- 0b00010101, 0b10010101, 0b01010101, 0b11010101, 0b00110101, 0b10110101, 0b01110101, 0b11110101,
- 0b00001101, 0b10001101, 0b01001101, 0b11001101, 0b00101101, 0b10101101, 0b01101101, 0b11101101,
- 0b00011101, 0b10011101, 0b01011101, 0b11011101, 0b00111101, 0b10111101, 0b01111101, 0b11111101,
- 0b00010011, 0b00100110, 0b01001110, 0b10011010, 0b00111100, 0b01100101, 0b11101010, 0b10110100,
- 0b11101001, 0b00110011, 0b01100110, 0b11001110, 0b10011010, 0b00111101, 0b01100111, 0b11101110,
- 0b10111100, 0b11111001, 0b00001011, 0b00010110, 0b00101110, 0b01011010, 0b10111100, 0b01100100,
- 0b11101001, 0b10110010, 0b11100101, 0b00101011, 0b01010110, 0b10101110, 0b01011010, 0b10111101,
- 0b01100110, 0b11101101, 0b10111010, 0b11110101, 0b00011011, 0b00110110, 0b01101110, 0b11011010,
- 0b10111100, 0b01100101, 0b11101011, 0b10110110, 0b11101101, 0b00111011, 0b01110110, 0b11101110,
- 0b11011010, 0b10111101, 0b01100111, 0b11101111, 0b10111110, 0b11111101, 0b00000111, 0b00001110,
- 0b00011110, 0b00111010, 0b01111100, 0b11100100, 0b11101000, 0b10110001, 0b11100011, 0b00100111,
- 0b01001110, 0b10011110, 0b00111010, 0b01111101, 0b11100110, 0b11101100, 0b10111001, 0b11110011,
- 0b00010111, 0b00101110, 0b01011110, 0b10111010, 0b01111100, 0b11100101, 0b11101010, 0b10110101,
- 0b11101011, 0b00110111, 0b01101110, 0b11011110, 0b10111010, 0b01111101, 0b11100111, 0b11101110,
- 0b10111101, 0b11111011, 0b00001111, 0b00011110, 0b00111110, 0b01111010, 0b11111100, 0b11100100,
- 0b11101001, 0b10110011, 0b11100111, 0b00101111, 0b01011110, 0b10111110, 0b01111010, 0b11111101,
- 0b11100110, 0b11101101, 0b10111011, 0b11110111, 0b00011111, 0b00111110, 0b01111110, 0b11111010,
- 0b11111100, 0b11100101, 0b11101011, 0b10110111, 0b11101111, 0b00111111, 0b01111110, 0b11111110,
- 0b11111010, 0b11111101, 0b11100111, 0b11101111, 0b10111111, 0b11111111, 0b00000000, 0b00100000,
- 0b00001000, 0b00001100, 0b10000001, 0b11000010, 0b11100000, 0b00001000, 0b00100100, 0b00001010,
- 0b10001101, 0b11000001, 0b11100010, 0b11110000, 0b00000100, 0b00100010, 0b10001001, 0b01001100,
- 0b10100001, 0b11010010, 0b11101000, 0b00000011, 0b10000011, 0b01000011, 0b11000011, 0b00100011,
- 0b10100011,
-};
lib/std/compress/flate/inflate.zig
@@ -1,570 +0,0 @@
-const std = @import("std");
-const assert = std.debug.assert;
-const testing = std.testing;
-
-const hfd = @import("huffman_decoder.zig");
-const BitReader = @import("bit_reader.zig").BitReader;
-const CircularBuffer = @import("CircularBuffer.zig");
-const Container = @import("container.zig").Container;
-const Token = @import("Token.zig");
-const codegen_order = @import("consts.zig").huffman.codegen_order;
-
-/// Decompresses deflate bit stream `reader` and writes uncompressed data to the
-/// `writer` stream.
-pub fn decompress(comptime container: Container, reader: anytype, writer: anytype) !void {
- var d = decompressor(container, reader);
- try d.decompress(writer);
-}
-
-/// Inflate decompressor for the reader type.
-pub fn decompressor(comptime container: Container, reader: anytype) Decompressor(container, @TypeOf(reader)) {
- return Decompressor(container, @TypeOf(reader)).init(reader);
-}
-
-pub fn Decompressor(comptime container: Container, comptime ReaderType: type) type {
- // zlib has a 4 byte footer; a lookahead of 4 bytes ensures that we will not overshoot it.
- // gzip has an 8 byte footer, so even 8 bytes of lookahead will not overshoot it.
- // Raw deflate has no footer to protect, so overshoot is harmless and we use the faster 8 byte lookahead.
- const lookahead: type = if (container == .zlib) u32 else u64;
- return Inflate(container, lookahead, ReaderType);
-}
-
-/// Inflate decompresses a deflate bit stream. Reads compressed data from the
-/// reader provided in init. Decompressed data is stored in the internal hist
-/// buffer and can be accessed through the iterator (`next`) or the reader
-/// interface.
-///
-/// Container defines header/footer wrapper around deflate bit stream. Can be
-/// gzip or zlib.
-///
-/// Deflate bit stream consists of multiple blocks. Block can be one of three types:
-/// * stored, non compressed, max 64k in size
-/// * fixed, huffman codes are predefined
-/// * dynamic, huffman code tables are encoded at the block start
-///
-/// The `step` function runs the decoder until the internal `hist` buffer is
-/// full. The client then needs to read that data in order to proceed with
-/// decoding.
-///
-/// Allocates 74.5K of internal buffers, most important are:
-/// * 64K for history (CircularBuffer)
-/// * ~10K huffman decoders (Literal and DistanceDecoder)
-///
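-/// Example usage (a minimal sketch; `file` stands in for any reader):
-///
-///     var inflate = decompressor(.gzip, file.reader());
-///     while (try inflate.next()) |buf| {
-///         // use decompressed bytes in buf
-///     }
-///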
-pub fn Inflate(comptime container: Container, comptime LookaheadType: type, comptime ReaderType: type) type {
- assert(LookaheadType == u32 or LookaheadType == u64);
- const BitReaderType = BitReader(LookaheadType, ReaderType);
-
- return struct {
- const F = BitReaderType.flag;
-
- bits: BitReaderType = .{},
- hist: CircularBuffer = .{},
- // Hasher which produces the checksum of uncompressed data for the gzip/zlib footer.
- hasher: container.Hasher() = .{},
-
- // dynamic block huffman code decoders
- lit_dec: hfd.LiteralDecoder = .{}, // literals
- dst_dec: hfd.DistanceDecoder = .{}, // distances
-
- // current read state
- bfinal: u1 = 0,
- block_type: u2 = 0b11,
- state: ReadState = .protocol_header,
-
- const ReadState = enum {
- protocol_header,
- block_header,
- block,
- protocol_footer,
- end,
- };
-
- const Self = @This();
-
- pub const Error = BitReaderType.Error || Container.Error || hfd.Error || error{
- InvalidCode,
- InvalidMatch,
- InvalidBlockType,
- WrongStoredBlockNlen,
- InvalidDynamicBlockHeader,
- };
-
- pub fn init(rt: ReaderType) Self {
- return .{ .bits = BitReaderType.init(rt) };
- }
-
- fn blockHeader(self: *Self) !void {
- self.bfinal = try self.bits.read(u1);
- self.block_type = try self.bits.read(u2);
- }
-
- fn storedBlock(self: *Self) !bool {
- self.bits.alignToByte(); // skip padding until byte boundary
- // everything after this is byte aligned in stored block
- var len = try self.bits.read(u16);
- const nlen = try self.bits.read(u16);
- if (len != ~nlen) return error.WrongStoredBlockNlen;
-
- while (len > 0) {
- const buf = self.hist.getWritable(len);
- try self.bits.readAll(buf);
- len -= @intCast(buf.len);
- }
- return true;
- }
-
- fn fixedBlock(self: *Self) !bool {
- while (!self.hist.full()) {
- const code = try self.bits.readFixedCode();
- switch (code) {
- 0...255 => self.hist.write(@intCast(code)),
- 256 => return true, // end of block
- 257...285 => try self.fixedDistanceCode(@intCast(code - 257)),
- else => return error.InvalidCode,
- }
- }
- return false;
- }
-
- // Handles fixed block non literal (length) code.
- // Length code is followed by 5 bits of distance code.
- fn fixedDistanceCode(self: *Self, code: u8) !void {
- try self.bits.fill(5 + 5 + 13);
- const length = try self.decodeLength(code);
- const distance = try self.decodeDistance(try self.bits.readF(u5, F.buffered | F.reverse));
- try self.hist.writeMatch(length, distance);
- }
-
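- // `code` below is the length symbol minus 257. Per RFC 1951, length
- // code 265 (code = 8 here), for example, has base length 11 and 1 extra
- // bit, covering lengths 11-12.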
- inline fn decodeLength(self: *Self, code: u8) !u16 {
- if (code > 28) return error.InvalidCode;
- const ml = Token.matchLength(code);
- return if (ml.extra_bits == 0) // 0 - 5 extra bits
- ml.base
- else
- ml.base + try self.bits.readN(ml.extra_bits, F.buffered);
- }
-
- fn decodeDistance(self: *Self, code: u8) !u16 {
- if (code > 29) return error.InvalidCode;
- const md = Token.matchDistance(code);
- return if (md.extra_bits == 0) // 0 - 13 extra bits
- md.base
- else
- md.base + try self.bits.readN(md.extra_bits, F.buffered);
- }
-
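- // Parses a dynamic block header (RFC 1951 3.2.7): code lengths for the
- // literal/length and distance tables are themselves Huffman coded, and
- // the code lengths for that "code length" alphabet are sent first, in
- // the fixed codegen_order.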
- fn dynamicBlockHeader(self: *Self) !void {
- const hlit: u16 = @as(u16, try self.bits.read(u5)) + 257; // number of ll code entries present - 257
- const hdist: u16 = @as(u16, try self.bits.read(u5)) + 1; // number of distance code entries - 1
- const hclen: u8 = @as(u8, try self.bits.read(u4)) + 4; // hclen + 4 code lengths are encoded
-
- if (hlit > 286 or hdist > 30)
- return error.InvalidDynamicBlockHeader;
-
- // lengths for code lengths
- var cl_lens = [_]u4{0} ** 19;
- for (0..hclen) |i| {
- cl_lens[codegen_order[i]] = try self.bits.read(u3);
- }
- var cl_dec: hfd.CodegenDecoder = .{};
- try cl_dec.generate(&cl_lens);
-
- // decoded code lengths
- var dec_lens = [_]u4{0} ** (286 + 30);
- var pos: usize = 0;
- while (pos < hlit + hdist) {
- const sym = try cl_dec.find(try self.bits.peekF(u7, F.reverse));
- try self.bits.shift(sym.code_bits);
- pos += try self.dynamicCodeLength(sym.symbol, &dec_lens, pos);
- }
- if (pos > hlit + hdist) {
- return error.InvalidDynamicBlockHeader;
- }
-
- // literal code lengths to literal decoder
- try self.lit_dec.generate(dec_lens[0..hlit]);
-
- // distance code lengths to distance decoder
- try self.dst_dec.generate(dec_lens[hlit .. hlit + hdist]);
- }
-
- // Decode code length symbol to code length. Writes decoded length into
- // lens slice starting at position pos. Returns number of positions
- // advanced.
- fn dynamicCodeLength(self: *Self, code: u16, lens: []u4, pos: usize) !usize {
- if (pos >= lens.len)
- return error.InvalidDynamicBlockHeader;
-
- switch (code) {
- 0...15 => {
- // Represent code lengths of 0 - 15
- lens[pos] = @intCast(code);
- return 1;
- },
- 16 => {
- // Copy the previous code length 3 - 6 times.
- // The next 2 bits indicate repeat length
- const n: u8 = @as(u8, try self.bits.read(u2)) + 3;
- if (pos == 0 or pos + n > lens.len)
- return error.InvalidDynamicBlockHeader;
- for (0..n) |i| {
- lens[pos + i] = lens[pos + i - 1];
- }
- return n;
- },
- // Repeat a code length of 0 for 3 - 10 times. (3 bits of length)
- 17 => return @as(u8, try self.bits.read(u3)) + 3,
- // Repeat a code length of 0 for 11 - 138 times (7 bits of length)
- 18 => return @as(u8, try self.bits.read(u7)) + 11,
- else => return error.InvalidDynamicBlockHeader,
- }
- }
-
- // In larger archives most blocks are usually dynamic, so decompression
- // performance depends on this function.
- fn dynamicBlock(self: *Self) !bool {
- // Hot path loop!
- while (!self.hist.full()) {
- try self.bits.fill(15); // optimization so other bit reads can be buffered (avoiding one `if` in hot path)
- const sym = try self.decodeSymbol(&self.lit_dec);
-
- switch (sym.kind) {
- .literal => self.hist.write(sym.symbol),
- .match => { // Decode match backreference <length, distance>
- // fill so we can use buffered reads
- if (LookaheadType == u32)
- try self.bits.fill(5 + 15)
- else
- try self.bits.fill(5 + 15 + 13);
- const length = try self.decodeLength(sym.symbol);
- const dsm = try self.decodeSymbol(&self.dst_dec);
- if (LookaheadType == u32) try self.bits.fill(13);
- const distance = try self.decodeDistance(dsm.symbol);
- try self.hist.writeMatch(length, distance);
- },
- .end_of_block => return true,
- }
- }
- return false;
- }
-
- // Peek 15 bits from the bit reader (the maximum code length is 15
- // bits). Use the decoder to find the symbol for that code; the symbol
- // tells us how many bits were used. Shift the bit reader by that many
- // bits and return the symbol.
- fn decodeSymbol(self: *Self, decoder: anytype) !hfd.Symbol {
- const sym = try decoder.find(try self.bits.peekF(u15, F.buffered | F.reverse));
- try self.bits.shift(sym.code_bits);
- return sym;
- }
-
- fn step(self: *Self) !void {
- switch (self.state) {
- .protocol_header => {
- try container.parseHeader(&self.bits);
- self.state = .block_header;
- },
- .block_header => {
- try self.blockHeader();
- self.state = .block;
- if (self.block_type == 2) try self.dynamicBlockHeader();
- },
- .block => {
- const done = switch (self.block_type) {
- 0 => try self.storedBlock(),
- 1 => try self.fixedBlock(),
- 2 => try self.dynamicBlock(),
- else => return error.InvalidBlockType,
- };
- if (done) {
- self.state = if (self.bfinal == 1) .protocol_footer else .block_header;
- }
- },
- .protocol_footer => {
- self.bits.alignToByte();
- try container.parseFooter(&self.hasher, &self.bits);
- self.state = .end;
- },
- .end => {},
- }
- }
-
- /// Replaces the inner reader with new reader.
- pub fn setReader(self: *Self, new_reader: ReaderType) void {
- self.bits.forward_reader = new_reader;
- if (self.state == .end or self.state == .protocol_footer) {
- self.state = .protocol_header;
- }
- }
-
- // Reads all compressed data from the internal reader and outputs plain
- // (uncompressed) data to the provided writer.
- pub fn decompress(self: *Self, writer: anytype) !void {
- while (try self.next()) |buf| {
- try writer.writeAll(buf);
- }
- }
-
- /// Returns the number of bytes that have been read from the internal
- /// reader but not yet consumed by the decompressor.
- pub fn unreadBytes(self: Self) usize {
- // There can be no error here: the denominator is not zero, and
- // overflow is not possible since the type is unsigned.
- return std.math.divCeil(usize, self.bits.nbits, 8) catch unreachable;
- }
-
- // Iterator interface
-
- /// Can be used in an iterator-like loop without memcpy to another buffer:
- /// while (try inflate.next()) |buf| { ... }
- pub fn next(self: *Self) Error!?[]const u8 {
- const out = try self.get(0);
- if (out.len == 0) return null;
- return out;
- }
-
- /// Returns decompressed data from the internal sliding window buffer.
- /// The returned buffer can be any length between 0 and `limit` bytes; 0
- /// returned bytes means end of stream has been reached. With limit=0 it
- /// returns as much data as it can. It will never be more than 65536
- /// bytes, which is the size of the internal buffer.
- pub fn get(self: *Self, limit: usize) Error![]const u8 {
- while (true) {
- const out = self.hist.readAtMost(limit);
- if (out.len > 0) {
- self.hasher.update(out);
- return out;
- }
- if (self.state == .end) return out;
- try self.step();
- }
- }
-
- // Reader interface
-
- pub const Reader = std.io.GenericReader(*Self, Error, read);
-
- /// Returns the number of bytes read. It may be less than buffer.len.
- /// If the number of bytes read is 0, it means end of stream.
- /// End of stream is not an error condition.
- pub fn read(self: *Self, buffer: []u8) Error!usize {
- if (buffer.len == 0) return 0;
- const out = try self.get(buffer.len);
- @memcpy(buffer[0..out.len], out);
- return out.len;
- }
-
- pub fn reader(self: *Self) Reader {
- return .{ .context = self };
- }
- };
-}
-
-test "decompress" {
- const cases = [_]struct {
- in: []const u8,
- out: []const u8,
- }{
- // non compressed block (type 0)
- .{
- .in = &[_]u8{
- 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit, type 0, len, nlen
- 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
- },
- .out = "Hello world\n",
- },
- // fixed code block (type 1)
- .{
- .in = &[_]u8{
- 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
- 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
- },
- .out = "Hello world\n",
- },
- // dynamic block (type 2)
- .{
- .in = &[_]u8{
- 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
- 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
- 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
- },
- .out = "ABCDEABCD ABCDEABCD",
- },
- };
- for (cases) |c| {
- var fb = std.io.fixedBufferStream(c.in);
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
-
- try decompress(.raw, fb.reader(), al.writer());
- try testing.expectEqualStrings(c.out, al.items);
- }
-}
-
-test "gzip decompress" {
- const cases = [_]struct {
- in: []const u8,
- out: []const u8,
- }{
- // non compressed block (type 0)
- .{
- .in = &[_]u8{
- 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
- 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit, type 0, len, nlen
- 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
- 0xd5, 0xe0, 0x39, 0xb7, // gzip footer: checksum
- 0x0c, 0x00, 0x00, 0x00, // gzip footer: size
- },
- .out = "Hello world\n",
- },
- // fixed code block (type 1)
- .{
- .in = &[_]u8{
- 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x03, // gzip header (10 bytes)
- 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, // deflate data block type 1
- 0x2f, 0xca, 0x49, 0xe1, 0x02, 0x00,
- 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
- },
- .out = "Hello world\n",
- },
- // dynamic block (type 2)
- .{
- .in = &[_]u8{
- 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, // gzip header (10 bytes)
- 0x3d, 0xc6, 0x39, 0x11, 0x00, 0x00, 0x0c, 0x02, // deflate data block type 2
- 0x30, 0x2b, 0xb5, 0x52, 0x1e, 0xff, 0x96, 0x38,
- 0x16, 0x96, 0x5c, 0x1e, 0x94, 0xcb, 0x6d, 0x01,
- 0x17, 0x1c, 0x39, 0xb4, 0x13, 0x00, 0x00, 0x00, // gzip footer (chksum, len)
- },
- .out = "ABCDEABCD ABCDEABCD",
- },
- // gzip header with name
- .{
- .in = &[_]u8{
- 0x1f, 0x8b, 0x08, 0x08, 0xe5, 0x70, 0xb1, 0x65, 0x00, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x2e,
- 0x74, 0x78, 0x74, 0x00, 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0xe1,
- 0x02, 0x00, 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00,
- },
- .out = "Hello world\n",
- },
- };
- for (cases) |c| {
- var fb = std.io.fixedBufferStream(c.in);
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
-
- try decompress(.gzip, fb.reader(), al.writer());
- try testing.expectEqualStrings(c.out, al.items);
- }
-}
-
-test "zlib decompress" {
- const cases = [_]struct {
- in: []const u8,
- out: []const u8,
- }{
- // non compressed block (type 0)
- .{
- .in = &[_]u8{
- 0x78, 0b10_0_11100, // zlib header (2 bytes)
- 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit, type 0, len, nlen
- 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
- 0x1c, 0xf2, 0x04, 0x47, // zlib footer: checksum
- },
- .out = "Hello world\n",
- },
- };
- for (cases) |c| {
- var fb = std.io.fixedBufferStream(c.in);
- var al = std.ArrayList(u8).init(testing.allocator);
- defer al.deinit();
-
- try decompress(.zlib, fb.reader(), al.writer());
- try testing.expectEqualStrings(c.out, al.items);
- }
-}
-
-test "fuzzing tests" {
- const cases = [_]struct {
- input: []const u8,
- out: []const u8 = "",
- err: ?anyerror = null,
- }{
- .{ .input = "deflate-stream", .out = @embedFile("testdata/fuzz/deflate-stream.expect") }, // 0
- .{ .input = "empty-distance-alphabet01" },
- .{ .input = "empty-distance-alphabet02" },
- .{ .input = "end-of-stream", .err = error.EndOfStream },
- .{ .input = "invalid-distance", .err = error.InvalidMatch },
- .{ .input = "invalid-tree01", .err = error.IncompleteHuffmanTree }, // 5
- .{ .input = "invalid-tree02", .err = error.IncompleteHuffmanTree },
- .{ .input = "invalid-tree03", .err = error.IncompleteHuffmanTree },
- .{ .input = "lengths-overflow", .err = error.InvalidDynamicBlockHeader },
- .{ .input = "out-of-codes", .err = error.InvalidCode },
- .{ .input = "puff01", .err = error.WrongStoredBlockNlen }, // 10
- .{ .input = "puff02", .err = error.EndOfStream },
- .{ .input = "puff03", .out = &[_]u8{0xa} },
- .{ .input = "puff04", .err = error.InvalidCode },
- .{ .input = "puff05", .err = error.EndOfStream },
- .{ .input = "puff06", .err = error.EndOfStream },
- .{ .input = "puff08", .err = error.InvalidCode },
- .{ .input = "puff09", .out = "P" },
- .{ .input = "puff10", .err = error.InvalidCode },
- .{ .input = "puff11", .err = error.InvalidMatch },
- .{ .input = "puff12", .err = error.InvalidDynamicBlockHeader }, // 20
- .{ .input = "puff13", .err = error.IncompleteHuffmanTree },
- .{ .input = "puff14", .err = error.EndOfStream },
- .{ .input = "puff15", .err = error.IncompleteHuffmanTree },
- .{ .input = "puff16", .err = error.InvalidDynamicBlockHeader },
- .{ .input = "puff17", .err = error.MissingEndOfBlockCode }, // 25
- .{ .input = "fuzz1", .err = error.InvalidDynamicBlockHeader },
- .{ .input = "fuzz2", .err = error.InvalidDynamicBlockHeader },
- .{ .input = "fuzz3", .err = error.InvalidMatch },
- .{ .input = "fuzz4", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff18", .err = error.OversubscribedHuffmanTree }, // 30
- .{ .input = "puff19", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff20", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff21", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff22", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff23", .err = error.OversubscribedHuffmanTree }, // 35
- .{ .input = "puff24", .err = error.IncompleteHuffmanTree },
- .{ .input = "puff25", .err = error.OversubscribedHuffmanTree },
- .{ .input = "puff26", .err = error.InvalidDynamicBlockHeader },
- .{ .input = "puff27", .err = error.InvalidDynamicBlockHeader },
- };
-
- inline for (cases, 0..) |c, case_no| {
- var in = std.io.fixedBufferStream(@embedFile("testdata/fuzz/" ++ c.input ++ ".input"));
- var out = std.ArrayList(u8).init(testing.allocator);
- defer out.deinit();
- errdefer std.debug.print("test case failed {}\n", .{case_no});
-
- if (c.err) |expected_err| {
- try testing.expectError(expected_err, decompress(.raw, in.reader(), out.writer()));
- } else {
- try decompress(.raw, in.reader(), out.writer());
- try testing.expectEqualStrings(c.out, out.items);
- }
- }
-}
-
-test "bug 18966" {
- const input = @embedFile("testdata/fuzz/bug_18966.input");
- const expect = @embedFile("testdata/fuzz/bug_18966.expect");
-
- var in = std.io.fixedBufferStream(input);
- var out = std.ArrayList(u8).init(testing.allocator);
- defer out.deinit();
-
- try decompress(.gzip, in.reader(), out.writer());
- try testing.expectEqualStrings(expect, out.items);
-}
-
-test "bug 19895" {
- const input = &[_]u8{
- 0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate stored block header: final bit, type 0, len, nlen
- 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a, // non compressed data
- };
- var in = std.io.fixedBufferStream(input);
- var decomp = decompressor(.raw, in.reader());
- var buf: [0]u8 = undefined;
- try testing.expectEqual(0, try decomp.read(&buf));
-}
lib/std/compress/flate/Lookup.zig
@@ -5,22 +5,22 @@
const std = @import("std");
const testing = std.testing;
const expect = testing.expect;
-const consts = @import("consts.zig");
+const flate = @import("../flate.zig");
-const Self = @This();
+const Lookup = @This();
const prime4 = 0x9E3779B1; // 4 bytes prime number 2654435761
-const chain_len = 2 * consts.history.len;
+const chain_len = 2 * flate.history_len;
// Maps hash => first position
-head: [consts.lookup.len]u16 = [_]u16{0} ** consts.lookup.len,
+head: [flate.lookup.len]u16 = [_]u16{0} ** flate.lookup.len,
// Maps position => previous positions for the same hash value
chain: [chain_len]u16 = [_]u16{0} ** (chain_len),
// Calculates hash of the 4 bytes from data.
// Inserts `pos` position of that hash in the lookup tables.
// Returns previous location with the same hash value.
-pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
+pub fn add(self: *Lookup, data: []const u8, pos: u16) u16 {
if (data.len < 4) return 0;
const h = hash(data[0..4]);
return self.set(h, pos);
@@ -28,11 +28,11 @@ pub fn add(self: *Self, data: []const u8, pos: u16) u16 {
// Returns previous location with the same hash value given the current
// position.
-pub fn prev(self: *Self, pos: u16) u16 {
+pub fn prev(self: *Lookup, pos: u16) u16 {
return self.chain[pos];
}
-fn set(self: *Self, h: u32, pos: u16) u16 {
+fn set(self: *Lookup, h: u32, pos: u16) u16 {
const p = self.head[h];
self.head[h] = pos;
self.chain[pos] = p;
@@ -40,7 +40,7 @@ fn set(self: *Self, h: u32, pos: u16) u16 {
}
// Slide all positions in head and chain down by `n`
-pub fn slide(self: *Self, n: u16) void {
+pub fn slide(self: *Lookup, n: u16) void {
for (&self.head) |*v| {
v.* -|= n;
}
@@ -52,8 +52,8 @@ pub fn slide(self: *Self, n: u16) void {
// Add `len` 4-byte hashes from `data` into the lookup.
// Position of the first byte is `pos`.
-pub fn bulkAdd(self: *Self, data: []const u8, len: u16, pos: u16) void {
- if (len == 0 or data.len < consts.match.min_length) {
+pub fn bulkAdd(self: *Lookup, data: []const u8, len: u16, pos: u16) void {
+ if (len == 0 or data.len < flate.match.min_length) {
return;
}
var hb =
@@ -80,7 +80,7 @@ fn hash(b: *const [4]u8) u32 {
}
fn hashu(v: u32) u32 {
- return @intCast((v *% prime4) >> consts.lookup.shift);
+ return @intCast((v *% prime4) >> flate.lookup.shift);
}
test add {
@@ -91,7 +91,7 @@ test add {
0x01, 0x02, 0x03,
};
- var h: Self = .{};
+ var h: Lookup = .{};
for (data, 0..) |_, i| {
const p = h.add(data[i..], @intCast(i));
if (i >= 8 and i < 24) {
@@ -101,7 +101,7 @@ test add {
}
}
- const v = Self.hash(data[2 .. 2 + 4]);
+ const v = Lookup.hash(data[2 .. 2 + 4]);
try expect(h.head[v] == 2 + 16);
try expect(h.chain[2 + 16] == 2 + 8);
try expect(h.chain[2 + 8] == 2);
@@ -111,13 +111,13 @@ test bulkAdd {
const data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
// one by one
- var h: Self = .{};
+ var h: Lookup = .{};
for (data, 0..) |_, i| {
_ = h.add(data[i..], @intCast(i));
}
// in bulk
- var bh: Self = .{};
+ var bh: Lookup = .{};
bh.bulkAdd(data, data.len, 0);
try testing.expectEqualSlices(u16, &h.head, &bh.head);
lib/std/compress/flate/SlidingWindow.zig
@@ -1,160 +0,0 @@
-//! Used in deflate (compression); holds uncompressed data from which Tokens
-//! are produced. In combination with Lookup it is used to find matches in the
-//! history data.
-//!
-const std = @import("std");
-const consts = @import("consts.zig");
-
-const expect = testing.expect;
-const assert = std.debug.assert;
-const testing = std.testing;
-
-const hist_len = consts.history.len;
-const buffer_len = 2 * hist_len;
-const min_lookahead = consts.match.min_length + consts.match.max_length;
-const max_rp = buffer_len - min_lookahead;
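-
-// Buffer layout: bytes before rp are history, already turned into tokens;
-// rp..wp is the lookahead still to be matched. Once rp reaches max_rp,
-// `slide` drops the oldest hist_len bytes to make room.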
-
-const Self = @This();
-
-buffer: [buffer_len]u8 = undefined,
-wp: usize = 0, // write position
-rp: usize = 0, // read position
-fp: isize = 0, // last flush position, tokens are built from fp..rp
-
-/// Returns number of bytes written, or 0 if buffer is full and need to slide.
-pub fn write(self: *Self, buf: []const u8) usize {
- if (self.rp >= max_rp) return 0; // need to slide
-
- const n = @min(buf.len, buffer_len - self.wp);
- @memcpy(self.buffer[self.wp .. self.wp + n], buf[0..n]);
- self.wp += n;
- return n;
-}
-
-/// Slides the buffer down by hist_len.
-/// Drops old history; preserves between hist_len - min_lookahead and hist_len
-/// bytes of data. Returns the number of bytes that remain in the buffer.
-pub fn slide(self: *Self) u16 {
- assert(self.rp >= max_rp and self.wp >= self.rp);
- const n = self.wp - hist_len;
- @memcpy(self.buffer[0..n], self.buffer[hist_len..self.wp]);
- self.rp -= hist_len;
- self.wp -= hist_len;
- self.fp -= hist_len;
- return @intCast(n);
-}
-
-/// Data from the current position (read position). This part of the buffer is
-/// not yet converted to tokens.
-fn lookahead(self: *Self) []const u8 {
- assert(self.wp >= self.rp);
- return self.buffer[self.rp..self.wp];
-}
-
-/// Returns part of the lookahead buffer. If should_flush is set, no lookahead
-/// is preserved; otherwise enough data for the longest match is preserved.
-/// Returns null if there is not enough data.
-pub fn activeLookahead(self: *Self, should_flush: bool) ?[]const u8 {
- const min: usize = if (should_flush) 0 else min_lookahead;
- const lh = self.lookahead();
- return if (lh.len > min) lh else null;
-}
-
-/// Advances read position, shrinks lookahead.
-pub fn advance(self: *Self, n: u16) void {
- assert(self.wp >= self.rp + n);
- self.rp += n;
-}
-
-/// Returns writable part of the buffer, where new uncompressed data can be
-/// written.
-pub fn writable(self: *Self) []u8 {
- return self.buffer[self.wp..];
-}
-
-/// Notification of what part of writable buffer is filled with data.
-pub fn written(self: *Self, n: usize) void {
- self.wp += n;
-}
-
-/// Finds match length between previous and current position.
-/// Used in hot path!
-pub fn match(self: *Self, prev_pos: u16, curr_pos: u16, min_len: u16) u16 {
- const max_len: usize = @min(self.wp - curr_pos, consts.match.max_length);
- // lookahead buffers from previous and current positions
- const prev_lh = self.buffer[prev_pos..][0..max_len];
- const curr_lh = self.buffer[curr_pos..][0..max_len];
-
- // If we already have a match (min_len > 0), first test the byte just
- // past the previous length (prev_lh[min_len] != curr_lh[min_len]) and
- // then walk back from that position to zero. Those positions are more
- // likely to differ than the first bytes.
- var i: usize = min_len;
- if (i > 0) {
- if (max_len <= i) return 0;
- while (true) {
- if (prev_lh[i] != curr_lh[i]) return 0;
- if (i == 0) break;
- i -= 1;
- }
- i = min_len;
- }
- while (i < max_len) : (i += 1)
- if (prev_lh[i] != curr_lh[i]) break;
- return if (i >= consts.match.min_length) @intCast(i) else 0;
-}
-
-/// Current position of non-compressed data. Data before rp are already converted
-/// to tokens.
-pub fn pos(self: *Self) u16 {
- return @intCast(self.rp);
-}
-
-/// Notification that token list is cleared.
-pub fn flush(self: *Self) void {
- self.fp = @intCast(self.rp);
-}
-
-/// Part of the buffer since the last flush, or null if there was a slide in
-/// between (so fp became negative).
-pub fn tokensBuffer(self: *Self) ?[]const u8 {
- assert(self.fp <= self.rp);
- if (self.fp < 0) return null;
- return self.buffer[@intCast(self.fp)..self.rp];
-}
-
-test match {
- const data = "Blah blah blah blah blah!";
- var win: Self = .{};
- try expect(win.write(data) == data.len);
- try expect(win.wp == data.len);
- try expect(win.rp == 0);
-
- // length between l symbols
- try expect(win.match(1, 6, 0) == 18);
- try expect(win.match(1, 11, 0) == 13);
- try expect(win.match(1, 16, 0) == 8);
- try expect(win.match(1, 21, 0) == 0);
-
- // position 15 = "blah blah!"
- // position 20 = "blah!"
- try expect(win.match(15, 20, 0) == 4);
- try expect(win.match(15, 20, 3) == 4);
- try expect(win.match(15, 20, 4) == 0);
-}
-
-test slide {
- var win: Self = .{};
- win.wp = Self.buffer_len - 11;
- win.rp = Self.buffer_len - 111;
- win.buffer[win.rp] = 0xab;
- try expect(win.lookahead().len == 100);
- try expect(win.tokensBuffer().?.len == win.rp);
-
- const n = win.slide();
- try expect(n == 32757);
- try expect(win.buffer[win.rp] == 0xab);
- try expect(win.rp == Self.hist_len - 111);
- try expect(win.wp == Self.hist_len - 11);
- try expect(win.lookahead().len == 100);
- try expect(win.tokensBuffer() == null);
-}
lib/std/compress/flate/Token.zig
@@ -6,7 +6,7 @@ const std = @import("std");
const assert = std.debug.assert;
const print = std.debug.print;
const expect = std.testing.expect;
-const consts = @import("consts.zig").match;
+const match = std.compress.flate.match;
const Token = @This();
@@ -26,11 +26,11 @@ pub fn literal(t: Token) u8 {
}
pub fn distance(t: Token) u16 {
- return @as(u16, t.dist) + consts.min_distance;
+ return @as(u16, t.dist) + match.min_distance;
}
pub fn length(t: Token) u16 {
- return @as(u16, t.len_lit) + consts.base_length;
+ return @as(u16, t.len_lit) + match.base_length;
}
pub fn initLiteral(lit: u8) Token {
@@ -40,12 +40,12 @@ pub fn initLiteral(lit: u8) Token {
// distance range 1 - 32768, stored in dist as 0 - 32767 (u15)
// length range 3 - 258, stored in len_lit as 0 - 255 (u8)
pub fn initMatch(dist: u16, len: u16) Token {
- assert(len >= consts.min_length and len <= consts.max_length);
- assert(dist >= consts.min_distance and dist <= consts.max_distance);
+ assert(len >= match.min_length and len <= match.max_length);
+ assert(dist >= match.min_distance and dist <= match.max_distance);
return .{
.kind = .match,
- .dist = @intCast(dist - consts.min_distance),
- .len_lit = @intCast(len - consts.base_length),
+ .dist = @intCast(dist - match.min_distance),
+ .len_lit = @intCast(len - match.base_length),
};
}
lib/std/compress/flate.zig
@@ -1,94 +1,189 @@
-/// Deflate is a lossless data compression file format that uses a combination
-/// of LZ77 and Huffman coding.
-pub const deflate = @import("flate/deflate.zig");
-
-/// Inflate is the decoding process that takes a Deflate bitstream for
-/// decompression and correctly produces the original full-size data or file.
-pub const inflate = @import("flate/inflate.zig");
-
-/// Decompress compressed data from reader and write plain data to the writer.
-pub fn decompress(reader: anytype, writer: anytype) !void {
- try inflate.decompress(.raw, reader, writer);
-}
+const builtin = @import("builtin");
+const std = @import("../std.zig");
+const testing = std.testing;
+const Writer = std.io.Writer;
+
+/// Container of the deflate bit stream body. The container adds a header
+/// before the deflate bit stream and a footer after it. It can be gzip, zlib
+/// or raw (no header, no footer, just the raw bit stream).
+///
+/// The zlib format is defined in rfc 1950. The header is 2 bytes and the
+/// footer is a 4 byte Adler-32 checksum.
+///
+/// The gzip format is defined in rfc 1952. The header is 10+ bytes and the
+/// footer is a 4 byte CRC-32 checksum followed by 4 bytes of uncompressed
+/// data length.
+///
+/// rfc 1950: https://datatracker.ietf.org/doc/html/rfc1950#page-4
+/// rfc 1952: https://datatracker.ietf.org/doc/html/rfc1952#page-5
+pub const Container = enum {
+ raw, // no header or footer
+ gzip, // gzip header and footer
+ zlib, // zlib header and footer
+
+ pub fn size(w: Container) usize {
+ return headerSize(w) + footerSize(w);
+ }
-/// Decompressor type
-pub fn Decompressor(comptime ReaderType: type) type {
- return inflate.Decompressor(.raw, ReaderType);
-}
+ pub fn headerSize(w: Container) usize {
+ return header(w).len;
+ }
-/// Create Decompressor which will read compressed data from reader.
-pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
- return inflate.decompressor(.raw, reader);
-}
+ pub fn footerSize(w: Container) usize {
+ return switch (w) {
+ .gzip => 8,
+ .zlib => 4,
+ .raw => 0,
+ };
+ }
-/// Compression level, trades between speed and compression size.
-pub const Options = deflate.Options;
+ pub const list = [_]Container{ .raw, .gzip, .zlib };
-/// Compress plain data from reader and write compressed data to the writer.
-pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
- try deflate.compress(.raw, reader, writer, options);
-}
+ pub const Error = error{
+ BadGzipHeader,
+ BadZlibHeader,
+ WrongGzipChecksum,
+ WrongGzipSize,
+ WrongZlibChecksum,
+ };
-/// Compressor type
-pub fn Compressor(comptime WriterType: type) type {
- return deflate.Compressor(.raw, WriterType);
-}
+ pub fn header(container: Container) []const u8 {
+ return switch (container) {
+ // GZIP 10 byte header (https://datatracker.ietf.org/doc/html/rfc1952#page-5):
+ // - ID1 (IDentification 1), always 0x1f
+ // - ID2 (IDentification 2), always 0x8b
+ // - CM (Compression Method), always 8 = deflate
+ // - FLG (Flags), all set to 0
+ // - 4 bytes, MTIME (Modification time), not used, all set to zero
+ // - XFL (eXtra FLags), all set to zero
+ // - OS (Operating System), 03 = Unix
+ .gzip => &[_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 },
+ // ZLIB has a two-byte header (https://datatracker.ietf.org/doc/html/rfc1950#page-4):
+ // 1st byte:
+            // - The high four bits are the CINFO (compression info), which is 7 for the default deflate window size.
+            // - The low four bits are the CM (compression method), which is 8 for deflate.
+            // 2nd byte:
+            // - The high two bits are the FLEVEL (compression level). Values are: 0=fastest, 1=fast, 2=default, 3=best.
+            // - The next bit, FDICT, is set if a dictionary is given.
+            // - The low five bits, FCHECK, form a mod-31 checksum over both header bytes.
+ //
+ // CINFO = 7, CM = 8, FLEVEL = 0b10, FDICT = 0, FCHECK = 0b11100
+ .zlib => &[_]u8{ 0x78, 0b10_0_11100 },
+ .raw => &.{},
+ };
+ }
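    // Editor's note: a minimal sketch (not part of this commit) checking the
    // zlib header bytes above. RFC 1950 requires the first two header bytes,
    // read as a big-endian u16, to be a multiple of 31; 0x78 0x9C gives
    // 0x789C = 30876 = 31 * 996.
    test "zlib header is a multiple of 31" {
        const h = header(.zlib);
        const cmf_flg = @as(u16, h[0]) * 256 + @as(u16, h[1]);
        try std.testing.expectEqual(@as(u16, 0), cmf_flg % 31);
    }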
-/// Create Compressor which outputs compressed data to the writer.
-pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
- return try deflate.compressor(.raw, writer, options);
-}
+ pub const Hasher = union(Container) {
+ raw: void,
+ gzip: struct {
+ crc: std.hash.Crc32 = .init(),
+ count: usize = 0,
+ },
+ zlib: std.hash.Adler32,
+
+        pub fn init(container: Container) Hasher {
+            return switch (container) {
+ .gzip => .{ .gzip = .{} },
+ .zlib => .{ .zlib = .init() },
+ .raw => .raw,
+ };
+ }
-/// Huffman only compression. Without Lempel-Ziv match searching. Faster
-/// compression, less memory requirements but bigger compressed sizes.
-pub const huffman = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.huffman.compress(.raw, reader, writer);
- }
+ pub fn container(h: Hasher) Container {
+ return h;
+ }
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.huffman.Compressor(.raw, WriterType);
- }
+        pub fn update(h: *Hasher, buf: []const u8) void {
+            switch (h.*) {
+                .raw => {},
+                .gzip => |*gzip| {
+                    gzip.crc.update(buf);
+                    gzip.count += buf.len;
+                },
+                .zlib => |*zlib| zlib.update(buf),
+            }
+        }
- pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
- return deflate.huffman.compressor(.raw, writer);
- }
+ pub fn writeFooter(hasher: *Hasher, writer: *Writer) Writer.Error!void {
+ var bits: [4]u8 = undefined;
+ switch (hasher.*) {
+ .gzip => |*gzip| {
+ // GZIP 8 bytes footer
+ // - 4 bytes, CRC32 (CRC-32)
+ // - 4 bytes, ISIZE (Input SIZE) - size of the original (uncompressed) input data modulo 2^32
+                    std.mem.writeInt(u32, &bits, gzip.crc.final(), .little);
+ try writer.writeAll(&bits);
+
+                    std.mem.writeInt(u32, &bits, @truncate(gzip.count), .little);
+ try writer.writeAll(&bits);
+ },
+ .zlib => |*zlib| {
+ // ZLIB (RFC 1950) is big-endian, unlike GZIP (RFC 1952).
+ // 4 bytes of ADLER32 (Adler-32 checksum)
+ // Checksum value of the uncompressed data (excluding any
+ // dictionary data) computed according to Adler-32
+ // algorithm.
+                    std.mem.writeInt(u32, &bits, zlib.adler, .big);
+ try writer.writeAll(&bits);
+ },
+ .raw => {},
+ }
+ }
+ };
};
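// Editor's note: an illustrative usage sketch of Container.Hasher, not part of
// this commit. It checksums the uncompressed bytes and then writes the
// container footer into a fixed Writer.
test "Hasher gzip footer sketch" {
    var hasher: Container.Hasher = .init(.gzip);
    hasher.update("hello");
    var buf: [8]u8 = undefined;
    var w: Writer = .fixed(&buf);
    try hasher.writeFooter(&w);
    // gzip footer: 4 bytes of CRC-32, then 4 bytes of ISIZE (5 for "hello").
    try std.testing.expectEqual(@as(usize, 8), w.end);
    try std.testing.expectEqual(@as(u32, 5), std.mem.readInt(u32, buf[4..8], .little));
}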
-// No compression store only. Compressed size is slightly bigger than plain.
-pub const store = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.store.compress(.raw, reader, writer);
- }
+/// When decompressing, the output buffer is used as the history window, so a
+/// buffer smaller than this may fail to decompress streams that were
+/// compressed with a larger window.
+pub const max_window_len = 1 << 16;
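// Editor's note: an illustrative sketch, not part of this commit. Passing a
// buffer of at least `max_window_len` to Decompress provides enough history
// for any well-formed stream; the tiny stored block below needs no history at
// all, so the buffer size here is only demonstrative.
test "decompress with a full-size window buffer" {
    // Final stored block: BFINAL=1/BTYPE=00, LEN=2, NLEN=~2, then "hi".
    const compressed = [_]u8{ 0x01, 0x02, 0x00, 0xfd, 0xff, 'h', 'i' };
    var window: [max_window_len]u8 = undefined;
    var in: std.io.Reader = .fixed(&compressed);
    var d: Decompress = .init(&in, .raw, &window);
    var out: [8]u8 = undefined;
    const n = try d.reader.readSliceShort(&out);
    try std.testing.expectEqual(@as(usize, 2), n);
    try std.testing.expectEqualStrings("hi", out[0..n]);
}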
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.store.Compressor(.raw, WriterType);
- }
+/// Deflate is a lossless data compression file format that uses a combination
+/// of LZ77 and Huffman coding.
+pub const Compress = @import("flate/Compress.zig");
- pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
- return deflate.store.compressor(.raw, writer);
- }
-};
+/// Inflate is the decoding process that takes a Deflate bitstream for
+/// decompression and correctly produces the original full-size data or file.
+pub const Decompress = @import("flate/Decompress.zig");
-/// Container defines header/footer around deflate bit stream. Gzip and zlib
-/// compression algorithms are containers around deflate bit stream body.
-const Container = @import("flate/container.zig").Container;
-const std = @import("std");
-const testing = std.testing;
-const fixedBufferStream = std.io.fixedBufferStream;
-const print = std.debug.print;
-const builtin = @import("builtin");
+/// Huffman only compression. Without Lempel-Ziv match searching. Faster
+/// compression, less memory requirements but bigger compressed sizes.
+pub const huffman = struct {
+ // The odd order in which the codegen code sizes are written.
+ pub const codegen_order = [_]u32{ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
+ // The number of codegen codes.
+ pub const codegen_code_count = 19;
+
+ // The largest distance code.
+ pub const distance_code_count = 30;
+
+ // Maximum number of literals.
+ pub const max_num_lit = 286;
+
+ // Max number of frequencies used for a Huffman Code
+ // Possible lengths are codegen_code_count (19), distance_code_count (30) and max_num_lit (286).
+ // The largest of these is max_num_lit.
+ pub const max_num_frequencies = max_num_lit;
+
+ // Biggest block size for uncompressed block.
+ pub const max_store_block_size = 65535;
+ // The special code used to mark the end of a block.
+ pub const end_block_marker = 256;
+};
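// Editor's note: a small consistency check, not part of this commit. The
// codegen_order permutation lists each of the codegen_code_count codes exactly
// once, so its length must equal codegen_code_count.
test "codegen_order covers all codegen codes" {
    try std.testing.expectEqual(@as(usize, huffman.codegen_code_count), huffman.codegen_order.len);
}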
test {
- _ = deflate;
- _ = inflate;
+ _ = Compress;
+ _ = Decompress;
}
test "compress/decompress" {
+ const print = std.debug.print;
var cmp_buf: [64 * 1024]u8 = undefined; // compressed data buffer
var dcm_buf: [64 * 1024]u8 = undefined; // decompressed data buffer
- const levels = [_]deflate.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
+ const levels = [_]Compress.Level{ .level_4, .level_5, .level_6, .level_7, .level_8, .level_9 };
const cases = [_]struct {
data: []const u8, // uncompressed content
// compressed data sizes per level 4-9
@@ -135,28 +230,34 @@ test "compress/decompress" {
// compress original stream to compressed stream
{
- var original = fixedBufferStream(data);
- var compressed = fixedBufferStream(&cmp_buf);
- try deflate.compress(container, original.reader(), compressed.writer(), .{ .level = level });
+ var original: std.io.Reader = .fixed(data);
+ var compressed: Writer = .fixed(&cmp_buf);
+            var compress: Compress = .init(&original, &.{}, .{ .container = container, .level = level });
+ const n = try compress.reader.streamRemaining(&compressed);
if (compressed_size == 0) {
if (container == .gzip)
print("case {d} gzip level {} compressed size: {d}\n", .{ case_no, level, compressed.pos });
- compressed_size = compressed.pos;
+ compressed_size = compressed.end;
}
- try testing.expectEqual(compressed_size, compressed.pos);
+ try testing.expectEqual(compressed_size, n);
+ try testing.expectEqual(compressed_size, compressed.end);
}
// decompress compressed stream to decompressed stream
{
- var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
- var decompressed = fixedBufferStream(&dcm_buf);
- try inflate.decompress(container, compressed.reader(), decompressed.writer());
- try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ var compressed: std.io.Reader = .fixed(cmp_buf[0..compressed_size]);
+ var decompressed: Writer = .fixed(&dcm_buf);
+ var decompress: Decompress = .init(&compressed, container, &.{});
+ _ = try decompress.reader.streamRemaining(&decompressed);
+ try testing.expectEqualSlices(u8, data, decompressed.buffered());
}
// compressor writer interface
{
- var compressed = fixedBufferStream(&cmp_buf);
- var cmp = try deflate.compressor(container, compressed.writer(), .{ .level = level });
+ var compressed: Writer = .fixed(&cmp_buf);
+ var cmp = try Compress.init(&compressed, &.{}, .{
+ .level = level,
+ .container = container,
+ });
var cmp_wrt = cmp.writer();
try cmp_wrt.writeAll(data);
try cmp.finish();
@@ -165,10 +266,9 @@ test "compress/decompress" {
}
// decompressor reader interface
{
- var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
- var dcm = inflate.decompressor(container, compressed.reader());
- var dcm_rdr = dcm.reader();
- const n = try dcm_rdr.readAll(&dcm_buf);
+ var compressed: std.io.Reader = .fixed(cmp_buf[0..compressed_size]);
+ var decompress: Decompress = .init(&compressed, container, &.{});
+ const n = try decompress.reader.readSliceShort(&dcm_buf);
try testing.expectEqual(data.len, n);
try testing.expectEqualSlices(u8, data, dcm_buf[0..n]);
}
@@ -184,9 +284,9 @@ test "compress/decompress" {
// compress original stream to compressed stream
{
- var original = fixedBufferStream(data);
- var compressed = fixedBufferStream(&cmp_buf);
- var cmp = try deflate.huffman.compressor(container, compressed.writer());
+ var original: std.io.Reader = .fixed(data);
+ var compressed: Writer = .fixed(&cmp_buf);
+ var cmp = try Compress.Huffman.init(container, &compressed);
-        try cmp.compress(original.reader());
+        try cmp.compress(&original);
try cmp.finish();
if (compressed_size == 0) {
@@ -198,10 +298,11 @@ test "compress/decompress" {
}
// decompress compressed stream to decompressed stream
{
- var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
- var decompressed = fixedBufferStream(&dcm_buf);
- try inflate.decompress(container, compressed.reader(), decompressed.writer());
- try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ var compressed: std.io.Reader = .fixed(cmp_buf[0..compressed_size]);
+ var decompress: Decompress = .init(&compressed, container, &.{});
+ var decompressed: Writer = .fixed(&dcm_buf);
+ _ = try decompress.reader.streamRemaining(&decompressed);
+ try testing.expectEqualSlices(u8, data, decompressed.buffered());
}
}
}
@@ -216,9 +317,9 @@ test "compress/decompress" {
// compress original stream to compressed stream
{
- var original = fixedBufferStream(data);
- var compressed = fixedBufferStream(&cmp_buf);
- var cmp = try deflate.store.compressor(container, compressed.writer());
+ var original: std.io.Reader = .fixed(data);
+ var compressed: Writer = .fixed(&cmp_buf);
+ var cmp = try Compress.SimpleCompressor(.store, container).init(&compressed);
-        try cmp.compress(original.reader());
+        try cmp.compress(&original);
try cmp.finish();
if (compressed_size == 0) {
@@ -231,23 +332,25 @@ test "compress/decompress" {
}
// decompress compressed stream to decompressed stream
{
- var compressed = fixedBufferStream(cmp_buf[0..compressed_size]);
- var decompressed = fixedBufferStream(&dcm_buf);
- try inflate.decompress(container, compressed.reader(), decompressed.writer());
- try testing.expectEqualSlices(u8, data, decompressed.getWritten());
+ var compressed: std.io.Reader = .fixed(cmp_buf[0..compressed_size]);
+ var decompress: Decompress = .init(&compressed, container, &.{});
+ var decompressed: Writer = .fixed(&dcm_buf);
+ _ = try decompress.reader.streamRemaining(&decompressed);
+ try testing.expectEqualSlices(u8, data, decompressed.buffered());
}
}
}
}
}
-fn testDecompress(comptime container: Container, compressed: []const u8, expected_plain: []const u8) !void {
- var in = fixedBufferStream(compressed);
- var out = std.ArrayList(u8).init(testing.allocator);
- defer out.deinit();
+fn testDecompress(container: Container, compressed: []const u8, expected_plain: []const u8) !void {
+ var in: std.io.Reader = .fixed(compressed);
+ var aw: std.io.Writer.Allocating = .init(testing.allocator);
+ defer aw.deinit();
- try inflate.decompress(container, in.reader(), out.writer());
- try testing.expectEqualSlices(u8, expected_plain, out.items);
+ var decompress: Decompress = .init(&in, container, &.{});
+ _ = try decompress.reader.streamRemaining(&aw.writer);
+    try testing.expectEqualSlices(u8, expected_plain, aw.writer.buffered());
}
test "don't read past deflate stream's end" {
@@ -352,126 +455,186 @@ test "gzip header" {
}
test "public interface" {
- const plain_data = [_]u8{ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a };
+ const plain_data_buf = [_]u8{ 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0a };
// deflate final stored block, header + plain (stored) data
const deflate_block = [_]u8{
0b0000_0001, 0b0000_1100, 0x00, 0b1111_0011, 0xff, // deflate fixed buffer header len, nlen
- } ++ plain_data;
-
- // gzip header/footer + deflate block
- const gzip_data =
- [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 } ++ // gzip header (10 bytes)
- deflate_block ++
- [_]u8{ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00 }; // gzip footer checksum (4 byte), size (4 bytes)
-
- // zlib header/footer + deflate block
- const zlib_data = [_]u8{ 0x78, 0b10_0_11100 } ++ // zlib header (2 bytes)}
- deflate_block ++
- [_]u8{ 0x1c, 0xf2, 0x04, 0x47 }; // zlib footer: checksum
-
- const gzip = @import("gzip.zig");
- const zlib = @import("zlib.zig");
- const flate = @This();
-
- try testInterface(gzip, &gzip_data, &plain_data);
- try testInterface(zlib, &zlib_data, &plain_data);
- try testInterface(flate, &deflate_block, &plain_data);
-}
+ } ++ plain_data_buf;
+
+ const plain_data: []const u8 = &plain_data_buf;
+ const gzip_data: []const u8 = &deflate_block;
+
+ //// gzip header/footer + deflate block
+ //const gzip_data =
+ // [_]u8{ 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 } ++ // gzip header (10 bytes)
+ // deflate_block ++
+ // [_]u8{ 0xd5, 0xe0, 0x39, 0xb7, 0x0c, 0x00, 0x00, 0x00 }; // gzip footer checksum (4 byte), size (4 bytes)
+
+ //// zlib header/footer + deflate block
+ //const zlib_data = [_]u8{ 0x78, 0b10_0_11100 } ++ // zlib header (2 bytes)}
+ // deflate_block ++
+ // [_]u8{ 0x1c, 0xf2, 0x04, 0x47 }; // zlib footer: checksum
+
+ // TODO
+ //const gzip = @import("gzip.zig");
+ //const zlib = @import("zlib.zig");
-fn testInterface(comptime pkg: type, gzip_data: []const u8, plain_data: []const u8) !void {
var buffer1: [64]u8 = undefined;
var buffer2: [64]u8 = undefined;
- var compressed = fixedBufferStream(&buffer1);
- var plain = fixedBufferStream(&buffer2);
+ // TODO These used to be functions, need to migrate the tests
+ const decompress = void;
+ const compress = void;
+ const store = void;
// decompress
{
- var in = fixedBufferStream(gzip_data);
- try pkg.decompress(in.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var plain: Writer = .fixed(&buffer2);
+
+ var in: std.io.Reader = .fixed(gzip_data);
+ try decompress(&in, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
- plain.reset();
- compressed.reset();
// compress/decompress
{
- var in = fixedBufferStream(plain_data);
- try pkg.compress(in.reader(), compressed.writer(), .{});
- compressed.reset();
- try pkg.decompress(compressed.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ try compress(&in, &compressed, .{});
+
+ var r: std.io.Reader = .fixed(&buffer1);
+ try decompress(&r, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
- plain.reset();
- compressed.reset();
// compressor/decompressor
{
- var in = fixedBufferStream(plain_data);
- var cmp = try pkg.compressor(compressed.writer(), .{});
- try cmp.compress(in.reader());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ var cmp = try Compress(&compressed, .{});
+ try cmp.compress(&in);
try cmp.finish();
- compressed.reset();
- var dcp = pkg.decompressor(compressed.reader());
- try dcp.decompress(plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var r: std.io.Reader = .fixed(&buffer1);
+ var dcp = Decompress(&r);
+ try dcp.decompress(&plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
- plain.reset();
- compressed.reset();
// huffman
{
// huffman compress/decompress
{
- var in = fixedBufferStream(plain_data);
- try pkg.huffman.compress(in.reader(), compressed.writer());
- compressed.reset();
- try pkg.decompress(compressed.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ try huffman.compress(&in, &compressed);
+
+ var r: std.io.Reader = .fixed(&buffer1);
+ try decompress(&r, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
- plain.reset();
- compressed.reset();
// huffman compressor/decompressor
{
- var in = fixedBufferStream(plain_data);
- var cmp = try pkg.huffman.compressor(compressed.writer());
- try cmp.compress(in.reader());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ var cmp = try huffman.Compressor(&compressed);
+ try cmp.compress(&in);
try cmp.finish();
- compressed.reset();
- try pkg.decompress(compressed.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var r: std.io.Reader = .fixed(&buffer1);
+ try decompress(&r, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
}
- plain.reset();
- compressed.reset();
// store
{
// store compress/decompress
{
- var in = fixedBufferStream(plain_data);
- try pkg.store.compress(in.reader(), compressed.writer());
- compressed.reset();
- try pkg.decompress(compressed.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ try store.compress(&in, &compressed);
+
+ var r: std.io.Reader = .fixed(&buffer1);
+ try decompress(&r, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
- plain.reset();
- compressed.reset();
// store compressor/decompressor
{
- var in = fixedBufferStream(plain_data);
- var cmp = try pkg.store.compressor(compressed.writer());
- try cmp.compress(in.reader());
+ var plain: Writer = .fixed(&buffer2);
+ var compressed: Writer = .fixed(&buffer1);
+
+ var in: std.io.Reader = .fixed(plain_data);
+ var cmp = try store.compressor(&compressed);
+ try cmp.compress(&in);
try cmp.finish();
- compressed.reset();
- try pkg.decompress(compressed.reader(), plain.writer());
- try testing.expectEqualSlices(u8, plain_data, plain.getWritten());
+ var r: std.io.Reader = .fixed(&buffer1);
+ try decompress(&r, &plain);
+ try testing.expectEqualSlices(u8, plain_data, plain.buffered());
}
}
}
+
+pub const match = struct {
+ pub const base_length = 3; // smallest match length per the RFC section 3.2.5
+ pub const min_length = 4; // min length used in this algorithm
+ pub const max_length = 258;
+
+ pub const min_distance = 1;
+ pub const max_distance = 32768;
+};
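// Editor's note: an illustrative round trip, not part of this commit. Token
// (flate/Token.zig above) stores lengths biased by match.base_length and
// distances biased by match.min_distance; decoding recovers the originals.
test "match constants round trip through Token" {
    const Token = @import("flate/Token.zig");
    const t = Token.initMatch(match.min_distance, match.min_length);
    try std.testing.expectEqual(@as(u16, match.min_distance), t.distance());
    try std.testing.expectEqual(@as(u16, match.min_length), t.length());
}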
+
+pub const history_len = match.max_distance;
+
+pub const lookup = struct {
+ pub const bits = 15;
+ pub const len = 1 << bits;
+ pub const shift = 32 - bits;
+};
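// Editor's note: an illustrative sketch, not part of this commit, of how a
// table with `lookup.len` slots is typically indexed: shifting a 32-bit hash
// right by `lookup.shift` keeps only its top `lookup.bits` bits, which always
// fit in the table.
test "lookup shift yields an in-range index" {
    const h: u32 = 0xdeadbeef; // any 32-bit hash value
    const idx = h >> lookup.shift;
    try std.testing.expect(idx < lookup.len);
}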
+
+test "zlib should not overshoot" {
+ // Compressed zlib data with extra 4 bytes at the end.
+ const data = [_]u8{
+ 0x78, 0x9c, 0x73, 0xce, 0x2f, 0xa8, 0x2c, 0xca, 0x4c, 0xcf, 0x28, 0x51, 0x08, 0xcf, 0xcc, 0xc9,
+ 0x49, 0xcd, 0x55, 0x28, 0x4b, 0xcc, 0x53, 0x08, 0x4e, 0xce, 0x48, 0xcc, 0xcc, 0xd6, 0x51, 0x08,
+ 0xce, 0xcc, 0x4b, 0x4f, 0x2c, 0xc8, 0x2f, 0x4a, 0x55, 0x30, 0xb4, 0xb4, 0x34, 0xd5, 0xb5, 0x34,
+ 0x03, 0x00, 0x8b, 0x61, 0x0f, 0xa4, 0x52, 0x5a, 0x94, 0x12,
+ };
+
+    var stream: std.io.Reader = .fixed(&data);
+
+    var dcp: Decompress = .init(&stream, .zlib, &.{});
+    var out: [128]u8 = undefined;
+
+    // Decompress
+    var n = try dcp.reader.readSliceShort(out[0..]);
+
+    // Expected decompressed data
+    try std.testing.expectEqual(46, n);
+    try std.testing.expectEqualStrings("Copyright Willem van Schaik, Singapore 1995-96", out[0..n]);
+
+    // The decompressor must not overshoot the underlying reader; it leaves
+    // the stream positioned at the end of the compressed data chunk.
+    try std.testing.expectEqual(data.len - 4, stream.seek);
+
+    // The 4 bytes after the compressed chunk are still available in the reader.
+    n = try stream.readSliceShort(out[0..]);
+    try std.testing.expectEqual(4, n);
+    try std.testing.expectEqualSlices(u8, data[data.len - 4 .. data.len], out[0..n]);
+}
lib/std/compress/gzip.zig
@@ -1,66 +0,0 @@
-const deflate = @import("flate/deflate.zig");
-const inflate = @import("flate/inflate.zig");
-
-/// Decompress compressed data from reader and write plain data to the writer.
-pub fn decompress(reader: anytype, writer: anytype) !void {
- try inflate.decompress(.gzip, reader, writer);
-}
-
-/// Decompressor type
-pub fn Decompressor(comptime ReaderType: type) type {
- return inflate.Decompressor(.gzip, ReaderType);
-}
-
-/// Create Decompressor which will read compressed data from reader.
-pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
- return inflate.decompressor(.gzip, reader);
-}
-
-/// Compression level, trades between speed and compression size.
-pub const Options = deflate.Options;
-
-/// Compress plain data from reader and write compressed data to the writer.
-pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
- try deflate.compress(.gzip, reader, writer, options);
-}
-
-/// Compressor type
-pub fn Compressor(comptime WriterType: type) type {
- return deflate.Compressor(.gzip, WriterType);
-}
-
-/// Create Compressor which outputs compressed data to the writer.
-pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
- return try deflate.compressor(.gzip, writer, options);
-}
-
-/// Huffman only compression. Without Lempel-Ziv match searching. Faster
-/// compression, less memory requirements but bigger compressed sizes.
-pub const huffman = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.huffman.compress(.gzip, reader, writer);
- }
-
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.huffman.Compressor(.gzip, WriterType);
- }
-
- pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
- return deflate.huffman.compressor(.gzip, writer);
- }
-};
-
-// No compression store only. Compressed size is slightly bigger than plain.
-pub const store = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.store.compress(.gzip, reader, writer);
- }
-
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.store.Compressor(.gzip, WriterType);
- }
-
- pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
- return deflate.store.compressor(.gzip, writer);
- }
-};
lib/std/compress/zlib.zig
@@ -1,101 +0,0 @@
-const deflate = @import("flate/deflate.zig");
-const inflate = @import("flate/inflate.zig");
-
-/// Decompress compressed data from reader and write plain data to the writer.
-pub fn decompress(reader: anytype, writer: anytype) !void {
- try inflate.decompress(.zlib, reader, writer);
-}
-
-/// Decompressor type
-pub fn Decompressor(comptime ReaderType: type) type {
- return inflate.Decompressor(.zlib, ReaderType);
-}
-
-/// Create Decompressor which will read compressed data from reader.
-pub fn decompressor(reader: anytype) Decompressor(@TypeOf(reader)) {
- return inflate.decompressor(.zlib, reader);
-}
-
-/// Compression level, trades between speed and compression size.
-pub const Options = deflate.Options;
-
-/// Compress plain data from reader and write compressed data to the writer.
-pub fn compress(reader: anytype, writer: anytype, options: Options) !void {
- try deflate.compress(.zlib, reader, writer, options);
-}
-
-/// Compressor type
-pub fn Compressor(comptime WriterType: type) type {
- return deflate.Compressor(.zlib, WriterType);
-}
-
-/// Create Compressor which outputs compressed data to the writer.
-pub fn compressor(writer: anytype, options: Options) !Compressor(@TypeOf(writer)) {
- return try deflate.compressor(.zlib, writer, options);
-}
-
-/// Huffman only compression. Without Lempel-Ziv match searching. Faster
-/// compression, less memory requirements but bigger compressed sizes.
-pub const huffman = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.huffman.compress(.zlib, reader, writer);
- }
-
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.huffman.Compressor(.zlib, WriterType);
- }
-
- pub fn compressor(writer: anytype) !huffman.Compressor(@TypeOf(writer)) {
- return deflate.huffman.compressor(.zlib, writer);
- }
-};
-
-// No compression store only. Compressed size is slightly bigger than plain.
-pub const store = struct {
- pub fn compress(reader: anytype, writer: anytype) !void {
- try deflate.store.compress(.zlib, reader, writer);
- }
-
- pub fn Compressor(comptime WriterType: type) type {
- return deflate.store.Compressor(.zlib, WriterType);
- }
-
- pub fn compressor(writer: anytype) !store.Compressor(@TypeOf(writer)) {
- return deflate.store.compressor(.zlib, writer);
- }
-};
-
-test "should not overshoot" {
- const std = @import("std");
-
- // Compressed zlib data with extra 4 bytes at the end.
- const data = [_]u8{
- 0x78, 0x9c, 0x73, 0xce, 0x2f, 0xa8, 0x2c, 0xca, 0x4c, 0xcf, 0x28, 0x51, 0x08, 0xcf, 0xcc, 0xc9,
- 0x49, 0xcd, 0x55, 0x28, 0x4b, 0xcc, 0x53, 0x08, 0x4e, 0xce, 0x48, 0xcc, 0xcc, 0xd6, 0x51, 0x08,
- 0xce, 0xcc, 0x4b, 0x4f, 0x2c, 0xc8, 0x2f, 0x4a, 0x55, 0x30, 0xb4, 0xb4, 0x34, 0xd5, 0xb5, 0x34,
- 0x03, 0x00, 0x8b, 0x61, 0x0f, 0xa4, 0x52, 0x5a, 0x94, 0x12,
- };
-
- var stream = std.io.fixedBufferStream(data[0..]);
- const reader = stream.reader();
-
- var dcp = decompressor(reader);
- var out: [128]u8 = undefined;
-
- // Decompress
- var n = try dcp.reader().readAll(out[0..]);
-
- // Expected decompressed data
- try std.testing.expectEqual(46, n);
- try std.testing.expectEqualStrings("Copyright Willem van Schaik, Singapore 1995-96", out[0..n]);
-
- // Decompressor don't overshoot underlying reader.
- // It is leaving it at the end of compressed data chunk.
- try std.testing.expectEqual(data.len - 4, stream.getPos());
- try std.testing.expectEqual(0, dcp.unreadBytes());
-
- // 4 bytes after compressed chunk are available in reader.
- n = try reader.readAll(out[0..]);
- try std.testing.expectEqual(n, 4);
- try std.testing.expectEqualSlices(u8, data[data.len - 4 .. data.len], out[0..n]);
-}
lib/std/debug/Dwarf.zig
@@ -2235,18 +2235,14 @@ pub const ElfModule = struct {
const section_bytes = try chopSlice(mapped_mem, shdr.sh_offset, shdr.sh_size);
sections[section_index.?] = if ((shdr.sh_flags & elf.SHF_COMPRESSED) > 0) blk: {
- var section_stream = std.io.fixedBufferStream(section_bytes);
- const section_reader = section_stream.reader();
- const chdr = section_reader.readStruct(elf.Chdr) catch continue;
+ var section_reader: std.Io.Reader = .fixed(section_bytes);
+ const chdr = section_reader.takeStruct(elf.Chdr, endian) catch continue;
if (chdr.ch_type != .ZLIB) continue;
- var zlib_stream = std.compress.zlib.decompressor(section_reader);
-
- const decompressed_section = try gpa.alloc(u8, chdr.ch_size);
+            var zlib_stream: std.compress.flate.Decompress = .init(&section_reader, .zlib, &.{});
+ const decompressed_section = zlib_stream.reader.allocRemaining(gpa, .unlimited) catch continue;
errdefer gpa.free(decompressed_section);
-
- const read = zlib_stream.reader().readAll(decompressed_section) catch continue;
- assert(read == decompressed_section.len);
+ assert(chdr.ch_size == decompressed_section.len);
break :blk .{
.data = decompressed_section,
lib/std/http/Client.zig
@@ -405,13 +405,8 @@ pub const RequestTransfer = union(enum) {
/// The decompressor for response messages.
pub const Compression = union(enum) {
- pub const DeflateDecompressor = std.compress.zlib.Decompressor(Request.TransferReader);
- pub const GzipDecompressor = std.compress.gzip.Decompressor(Request.TransferReader);
- // https://github.com/ziglang/zig/issues/18937
- //pub const ZstdDecompressor = std.compress.zstd.DecompressStream(Request.TransferReader, .{});
-
- deflate: DeflateDecompressor,
- gzip: GzipDecompressor,
+ deflate: std.compress.flate.Decompress,
+ gzip: std.compress.flate.Decompress,
// https://github.com/ziglang/zig/issues/18937
//zstd: ZstdDecompressor,
none: void,
lib/std/http/Server.zig
@@ -130,8 +130,8 @@ pub const Request = struct {
-    pub const DeflateDecompressor = std.compress.zlib.Decompressor(std.io.AnyReader);
-    pub const GzipDecompressor = std.compress.gzip.Decompressor(std.io.AnyReader);
- deflate: DeflateDecompressor,
- gzip: GzipDecompressor,
+ deflate: std.compress.flate.Decompress,
+ gzip: std.compress.flate.Decompress,
zstd: std.compress.zstd.Decompress,
none: void,
};
lib/std/compress.zig
@@ -1,8 +1,7 @@
//! Compression algorithms.
+/// Handles the gzip and zlib container formats as well.
pub const flate = @import("compress/flate.zig");
-pub const gzip = @import("compress/gzip.zig");
-pub const zlib = @import("compress/zlib.zig");
pub const lzma = @import("compress/lzma.zig");
pub const lzma2 = @import("compress/lzma2.zig");
pub const xz = @import("compress/xz.zig");
@@ -14,6 +13,4 @@ test {
_ = lzma2;
_ = xz;
_ = zstd;
- _ = gzip;
- _ = zlib;
}
lib/std/zip.zig
@@ -5,11 +5,10 @@
const builtin = @import("builtin");
const std = @import("std");
-const testing = std.testing;
-
-pub const testutil = @import("zip/test.zig");
-const File = testutil.File;
-const FileStore = testutil.FileStore;
+const File = std.fs.File;
+const is_le = builtin.target.cpu.arch.endian() == .little;
+const Writer = std.io.Writer;
+const Reader = std.io.Reader;
pub const CompressionMethod = enum(u16) {
store = 0,
@@ -95,102 +94,116 @@ pub const EndRecord = extern struct {
central_directory_size: u32 align(1),
central_directory_offset: u32 align(1),
comment_len: u16 align(1),
+
pub fn need_zip64(self: EndRecord) bool {
return isMaxInt(self.record_count_disk) or
isMaxInt(self.record_count_total) or
isMaxInt(self.central_directory_size) or
isMaxInt(self.central_directory_offset);
}
-};
-/// Find and return the end record for the given seekable zip stream.
-/// Note that `seekable_stream` must be an instance of `std.io.SeekableStream` and
-/// its context must also have a `.reader()` method that returns an instance of
-/// `std.io.GenericReader`.
-pub fn findEndRecord(seekable_stream: anytype, stream_len: u64) !EndRecord {
- var buf: [@sizeOf(EndRecord) + std.math.maxInt(u16)]u8 = undefined;
- const record_len_max = @min(stream_len, buf.len);
- var loaded_len: u32 = 0;
-
- var comment_len: u16 = 0;
- while (true) {
- const record_len: u32 = @as(u32, comment_len) + @sizeOf(EndRecord);
- if (record_len > record_len_max)
- return error.ZipNoEndRecord;
-
- if (record_len > loaded_len) {
- const new_loaded_len = @min(loaded_len + 300, record_len_max);
- const read_len = new_loaded_len - loaded_len;
-
- try seekable_stream.seekTo(stream_len - @as(u64, new_loaded_len));
- const read_buf: []u8 = buf[buf.len - new_loaded_len ..][0..read_len];
- const len = try (if (@TypeOf(seekable_stream.context) == std.fs.File) seekable_stream.context.deprecatedReader() else seekable_stream.context.reader()).readAll(read_buf);
- if (len != read_len)
- return error.ZipTruncated;
- loaded_len = new_loaded_len;
- }
+ pub const FindBufferError = error{ ZipNoEndRecord, ZipTruncated };
- const record_bytes = buf[buf.len - record_len ..][0..@sizeOf(EndRecord)];
- if (std.mem.eql(u8, record_bytes[0..4], &end_record_sig) and
- std.mem.readInt(u16, record_bytes[20..22], .little) == comment_len)
- {
- const record: *align(1) EndRecord = @ptrCast(record_bytes.ptr);
- if (builtin.target.cpu.arch.endian() != .little) {
- std.mem.byteSwapAllFields(@TypeOf(record.*), record);
+ /// TODO audit this logic
+ pub fn findBuffer(buffer: []const u8) FindBufferError!EndRecord {
+ const pos = std.mem.lastIndexOf(u8, buffer, &end_record_sig) orelse return error.ZipNoEndRecord;
+        if (pos + @sizeOf(EndRecord) > buffer.len) return error.ZipTruncated;
+        const record_ptr: *align(1) const EndRecord = @ptrCast(buffer[pos..][0..@sizeOf(EndRecord)]);
+ var record = record_ptr.*;
+ if (!is_le) std.mem.byteSwapAllFields(EndRecord, &record);
+ return record;
+ }
+
+ pub const FindFileError = File.GetEndPosError || File.SeekError || File.ReadError || error{
+ ZipNoEndRecord,
+ EndOfStream,
+ };
+
+ pub fn findFile(fr: *File.Reader) FindFileError!EndRecord {
+ const end_pos = try fr.getSize();
+
+ var buf: [@sizeOf(EndRecord) + std.math.maxInt(u16)]u8 = undefined;
+ const record_len_max = @min(end_pos, buf.len);
+ var loaded_len: u32 = 0;
+ var comment_len: u16 = 0;
+ while (true) {
+ const record_len: u32 = @as(u32, comment_len) + @sizeOf(EndRecord);
+ if (record_len > record_len_max)
+ return error.ZipNoEndRecord;
+
+ if (record_len > loaded_len) {
+ const new_loaded_len = @min(loaded_len + 300, record_len_max);
+ const read_len = new_loaded_len - loaded_len;
+
+ try fr.seekTo(end_pos - @as(u64, new_loaded_len));
+ const read_buf: []u8 = buf[buf.len - new_loaded_len ..][0..read_len];
+                fr.interface.readSlice(read_buf) catch |err| switch (err) {
+                    error.ReadFailed => return fr.err.?,
+                    error.EndOfStream => return error.EndOfStream,
+                };
+ loaded_len = new_loaded_len;
+ }
+
+ const record_bytes = buf[buf.len - record_len ..][0..@sizeOf(EndRecord)];
+ if (std.mem.eql(u8, record_bytes[0..4], &end_record_sig) and
+ std.mem.readInt(u16, record_bytes[20..22], .little) == comment_len)
+ {
+ const record: *align(1) EndRecord = @ptrCast(record_bytes.ptr);
+ if (!is_le) std.mem.byteSwapAllFields(EndRecord, record);
+ return record.*;
}
- return record.*;
+
+ if (comment_len == std.math.maxInt(u16))
+ return error.ZipNoEndRecord;
+ comment_len += 1;
}
+ }
+};
- if (comment_len == std.math.maxInt(u16))
- return error.ZipNoEndRecord;
- comment_len += 1;
+pub const Decompress = struct {
+ interface: Reader,
+ state: union {
+ inflate: std.compress.flate.Decompress,
+ store: *Reader,
+ },
+
+    pub fn init(reader: *Reader, method: CompressionMethod, buffer: []u8) Decompress {
+ return switch (method) {
+ .store => .{
+ .state = .{ .store = reader },
+ .interface = .{
+ .vtable = &.{ .stream = streamStore },
+ .buffer = buffer,
+ .end = 0,
+ .seek = 0,
+ },
+ },
+ .deflate => .{
+                .state = .{ .inflate = .init(reader, .raw, &.{}) },
+ .interface = .{
+ .vtable = &.{ .stream = streamDeflate },
+ .buffer = buffer,
+ .end = 0,
+ .seek = 0,
+ },
+ },
+ else => unreachable,
+ };
}
-}
-/// Decompresses the given data from `reader` into `writer`. Stops early if more
-/// than `uncompressed_size` bytes are processed and verifies that exactly that
-/// number of bytes are decompressed. Returns the CRC-32 of the uncompressed data.
-/// `writer` can be anything with a `writeAll(self: *Self, chunk: []const u8) anyerror!void` method.
-pub fn decompress(
- method: CompressionMethod,
- uncompressed_size: u64,
- reader: anytype,
- writer: anytype,
-) !u32 {
- var hash = std.hash.Crc32.init();
-
- var total_uncompressed: u64 = 0;
- switch (method) {
- .store => {
- var buf: [4096]u8 = undefined;
- while (true) {
- const len = try reader.read(&buf);
- if (len == 0) break;
- try writer.writeAll(buf[0..len]);
- hash.update(buf[0..len]);
- total_uncompressed += @intCast(len);
- }
- },
- .deflate => {
- var br = std.io.bufferedReader(reader);
- var decompressor = std.compress.flate.decompressor(br.reader());
- while (try decompressor.next()) |chunk| {
- try writer.writeAll(chunk);
- hash.update(chunk);
- total_uncompressed += @intCast(chunk.len);
- if (total_uncompressed > uncompressed_size)
- return error.ZipUncompressSizeTooSmall;
- }
- if (br.end != br.start)
- return error.ZipDeflateTruncated;
- },
- _ => return error.UnsupportedCompressionMethod,
+ fn streamStore(r: *Reader, w: *Writer, limit: std.io.Limit) Reader.StreamError!usize {
+ const d: *Decompress = @fieldParentPtr("interface", r);
+        return d.state.store.stream(w, limit);
}
- if (total_uncompressed != uncompressed_size)
- return error.ZipUncompressSizeMismatch;
- return hash.final();
-}
+ fn streamDeflate(r: *Reader, w: *Writer, limit: std.io.Limit) Reader.StreamError!usize {
+ const d: *Decompress = @fieldParentPtr("interface", r);
+        return d.state.inflate.reader.stream(w, limit);
+ }
+};
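// Editor's note: an illustrative sketch, not part of this commit. A zip
// entry's compressed bytes can be streamed through Decompress like any other
// std.io.Reader; `file_reader`, `window`, and `out` are hypothetical locals:
//
//     var d = Decompress.init(&file_reader, .deflate, &window);
//     const n = try d.interface.readSliceShort(&out);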
fn isBadFilename(filename: []const u8) bool {
if (filename.len == 0 or filename[0] == '/')
@@ -253,319 +266,332 @@ fn readZip64FileExtents(comptime T: type, header: T, extents: *FileExtents, data
}
}
-pub fn Iterator(comptime SeekableStream: type) type {
- return struct {
- stream: SeekableStream,
+pub const Iterator = struct {
+ input: *File.Reader,
- cd_record_count: u64,
- cd_zip_offset: u64,
- cd_size: u64,
+ cd_record_count: u64,
+ cd_zip_offset: u64,
+ cd_size: u64,
- cd_record_index: u64 = 0,
- cd_record_offset: u64 = 0,
+ cd_record_index: u64 = 0,
+ cd_record_offset: u64 = 0,
- const Self = @This();
+ pub fn init(input: *File.Reader) !Iterator {
+ const end_record = try EndRecord.findFile(input);
- pub fn init(stream: SeekableStream) !Self {
- const stream_len = try stream.getEndPos();
+ if (!isMaxInt(end_record.record_count_disk) and end_record.record_count_disk > end_record.record_count_total)
+ return error.ZipDiskRecordCountTooLarge;
- const end_record = try findEndRecord(stream, stream_len);
-
- if (!isMaxInt(end_record.record_count_disk) and end_record.record_count_disk > end_record.record_count_total)
- return error.ZipDiskRecordCountTooLarge;
-
- if (end_record.disk_number != 0 or end_record.central_directory_disk_number != 0)
- return error.ZipMultiDiskUnsupported;
+ if (end_record.disk_number != 0 or end_record.central_directory_disk_number != 0)
+ return error.ZipMultiDiskUnsupported;
- {
- const counts_valid = !isMaxInt(end_record.record_count_disk) and !isMaxInt(end_record.record_count_total);
- if (counts_valid and end_record.record_count_disk != end_record.record_count_total)
- return error.ZipMultiDiskUnsupported;
- }
-
- var result = Self{
- .stream = stream,
- .cd_record_count = end_record.record_count_total,
- .cd_zip_offset = end_record.central_directory_offset,
- .cd_size = end_record.central_directory_size,
- };
- if (!end_record.need_zip64()) return result;
-
- const locator_end_offset: u64 = @as(u64, end_record.comment_len) + @sizeOf(EndRecord) + @sizeOf(EndLocator64);
- if (locator_end_offset > stream_len)
- return error.ZipTruncated;
- try stream.seekTo(stream_len - locator_end_offset);
- const locator = try (if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()).readStructEndian(EndLocator64, .little);
- if (!std.mem.eql(u8, &locator.signature, &end_locator64_sig))
- return error.ZipBadLocatorSig;
- if (locator.zip64_disk_count != 0)
- return error.ZipUnsupportedZip64DiskCount;
- if (locator.total_disk_count != 1)
+ {
+ const counts_valid = !isMaxInt(end_record.record_count_disk) and !isMaxInt(end_record.record_count_total);
+ if (counts_valid and end_record.record_count_disk != end_record.record_count_total)
return error.ZipMultiDiskUnsupported;
+ }
- try stream.seekTo(locator.record_file_offset);
-
- const record64 = try (if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()).readStructEndian(EndRecord64, .little);
-
- if (!std.mem.eql(u8, &record64.signature, &end_record64_sig))
- return error.ZipBadEndRecord64Sig;
-
- if (record64.end_record_size < @sizeOf(EndRecord64) - 12)
- return error.ZipEndRecord64SizeTooSmall;
- if (record64.end_record_size > @sizeOf(EndRecord64) - 12)
- return error.ZipEndRecord64UnhandledExtraData;
+ var result: Iterator = .{
+ .input = input,
+ .cd_record_count = end_record.record_count_total,
+ .cd_zip_offset = end_record.central_directory_offset,
+ .cd_size = end_record.central_directory_size,
+ };
+ if (!end_record.need_zip64()) return result;
- if (record64.version_needed_to_extract > 45)
- return error.ZipUnsupportedVersion;
+ const locator_end_offset: u64 = @as(u64, end_record.comment_len) + @sizeOf(EndRecord) + @sizeOf(EndLocator64);
+ const stream_len = try input.getSize();
- {
- const is_multidisk = record64.disk_number != 0 or
- record64.central_directory_disk_number != 0 or
- record64.record_count_disk != record64.record_count_total;
- if (is_multidisk)
- return error.ZipMultiDiskUnsupported;
- }
+ if (locator_end_offset > stream_len)
+ return error.ZipTruncated;
+ try input.seekTo(stream_len - locator_end_offset);
+ const locator = input.interface.takeStructEndian(EndLocator64, .little) catch |err| switch (err) {
+ error.ReadFailed => return input.err.?,
+ error.EndOfStream => return error.EndOfStream,
+ };
+ if (!std.mem.eql(u8, &locator.signature, &end_locator64_sig))
+ return error.ZipBadLocatorSig;
+ if (locator.zip64_disk_count != 0)
+ return error.ZipUnsupportedZip64DiskCount;
+ if (locator.total_disk_count != 1)
+ return error.ZipMultiDiskUnsupported;
+
+ try input.seekTo(locator.record_file_offset);
+
+ const record64 = input.interface.takeStructEndian(EndRecord64, .little) catch |err| switch (err) {
+ error.ReadFailed => return input.err.?,
+ error.EndOfStream => return error.EndOfStream,
+ };
- if (isMaxInt(end_record.record_count_total)) {
- result.cd_record_count = record64.record_count_total;
- } else if (end_record.record_count_total != record64.record_count_total)
- return error.Zip64RecordCountTotalMismatch;
+ if (!std.mem.eql(u8, &record64.signature, &end_record64_sig))
+ return error.ZipBadEndRecord64Sig;
- if (isMaxInt(end_record.central_directory_offset)) {
- result.cd_zip_offset = record64.central_directory_offset;
- } else if (end_record.central_directory_offset != record64.central_directory_offset)
- return error.Zip64CentralDirectoryOffsetMismatch;
+ if (record64.end_record_size < @sizeOf(EndRecord64) - 12)
+ return error.ZipEndRecord64SizeTooSmall;
+ if (record64.end_record_size > @sizeOf(EndRecord64) - 12)
+ return error.ZipEndRecord64UnhandledExtraData;
- if (isMaxInt(end_record.central_directory_size)) {
- result.cd_size = record64.central_directory_size;
- } else if (end_record.central_directory_size != record64.central_directory_size)
- return error.Zip64CentralDirectorySizeMismatch;
+ if (record64.version_needed_to_extract > 45)
+ return error.ZipUnsupportedVersion;
- return result;
+ {
+ const is_multidisk = record64.disk_number != 0 or
+ record64.central_directory_disk_number != 0 or
+ record64.record_count_disk != record64.record_count_total;
+ if (is_multidisk)
+ return error.ZipMultiDiskUnsupported;
}
- pub fn next(self: *Self) !?Entry {
- if (self.cd_record_index == self.cd_record_count) {
- if (self.cd_record_offset != self.cd_size)
- return if (self.cd_size > self.cd_record_offset)
- error.ZipCdOversized
- else
- error.ZipCdUndersized;
+ if (isMaxInt(end_record.record_count_total)) {
+ result.cd_record_count = record64.record_count_total;
+ } else if (end_record.record_count_total != record64.record_count_total)
+ return error.Zip64RecordCountTotalMismatch;
- return null;
- }
+ if (isMaxInt(end_record.central_directory_offset)) {
+ result.cd_zip_offset = record64.central_directory_offset;
+ } else if (end_record.central_directory_offset != record64.central_directory_offset)
+ return error.Zip64CentralDirectoryOffsetMismatch;
- const header_zip_offset = self.cd_zip_offset + self.cd_record_offset;
- try self.stream.seekTo(header_zip_offset);
- const header = try (if (@TypeOf(self.stream.context) == std.fs.File) self.stream.context.deprecatedReader() else self.stream.context.reader()).readStructEndian(CentralDirectoryFileHeader, .little);
-            if (!std.mem.eql(u8, &header.signature, &central_file_header_sig))
- return error.ZipBadCdOffset;
+ if (isMaxInt(end_record.central_directory_size)) {
+ result.cd_size = record64.central_directory_size;
+ } else if (end_record.central_directory_size != record64.central_directory_size)
+ return error.Zip64CentralDirectorySizeMismatch;
- self.cd_record_index += 1;
- self.cd_record_offset += @sizeOf(CentralDirectoryFileHeader) + header.filename_len + header.extra_len + header.comment_len;
+ return result;
+ }
- // Note: checking the version_needed_to_extract doesn't seem to be helpful, i.e. the zip file
- // at https://github.com/ninja-build/ninja/releases/download/v1.12.0/ninja-linux.zip
- // has an undocumented version 788 but extracts just fine.
+ pub fn next(self: *Iterator) !?Entry {
+ if (self.cd_record_index == self.cd_record_count) {
+ if (self.cd_record_offset != self.cd_size)
+ return if (self.cd_size > self.cd_record_offset)
+ error.ZipCdOversized
+ else
+ error.ZipCdUndersized;
- if (header.flags.encrypted)
- return error.ZipEncryptionUnsupported;
- // TODO: check/verify more flags
- if (header.disk_number != 0)
- return error.ZipMultiDiskUnsupported;
+ return null;
+ }
- var extents: FileExtents = .{
- .uncompressed_size = header.uncompressed_size,
- .compressed_size = header.compressed_size,
- .local_file_header_offset = header.local_file_header_offset,
- };
+ const header_zip_offset = self.cd_zip_offset + self.cd_record_offset;
+ const input = self.input;
+ try input.seekTo(header_zip_offset);
+ const header = input.interface.takeStructEndian(CentralDirectoryFileHeader, .little) catch |err| switch (err) {
+ error.ReadFailed => return input.err.?,
+ error.EndOfStream => return error.EndOfStream,
+ };
+        if (!std.mem.eql(u8, &header.signature, &central_file_header_sig))
+ return error.ZipBadCdOffset;
+
+ self.cd_record_index += 1;
+ self.cd_record_offset += @sizeOf(CentralDirectoryFileHeader) + header.filename_len + header.extra_len + header.comment_len;
+
+ // Note: checking the version_needed_to_extract doesn't seem to be helpful, i.e. the zip file
+ // at https://github.com/ninja-build/ninja/releases/download/v1.12.0/ninja-linux.zip
+ // has an undocumented version 788 but extracts just fine.
+
+ if (header.flags.encrypted)
+ return error.ZipEncryptionUnsupported;
+ // TODO: check/verify more flags
+ if (header.disk_number != 0)
+ return error.ZipMultiDiskUnsupported;
+
+ var extents: FileExtents = .{
+ .uncompressed_size = header.uncompressed_size,
+ .compressed_size = header.compressed_size,
+ .local_file_header_offset = header.local_file_header_offset,
+ };
- if (header.extra_len > 0) {
- var extra_buf: [std.math.maxInt(u16)]u8 = undefined;
- const extra = extra_buf[0..header.extra_len];
+ if (header.extra_len > 0) {
+ var extra_buf: [std.math.maxInt(u16)]u8 = undefined;
+ const extra = extra_buf[0..header.extra_len];
- {
- try self.stream.seekTo(header_zip_offset + @sizeOf(CentralDirectoryFileHeader) + header.filename_len);
- const len = try (if (@TypeOf(self.stream.context) == std.fs.File) self.stream.context.deprecatedReader() else self.stream.context.reader()).readAll(extra);
- if (len != extra.len)
- return error.ZipTruncated;
- }
+ try input.seekTo(header_zip_offset + @sizeOf(CentralDirectoryFileHeader) + header.filename_len);
+ input.interface.readSlice(extra) catch |err| switch (err) {
+ error.ReadFailed => return input.err.?,
+ error.EndOfStream => return error.EndOfStream,
+ };
- var extra_offset: usize = 0;
- while (extra_offset + 4 <= extra.len) {
- const header_id = std.mem.readInt(u16, extra[extra_offset..][0..2], .little);
- const data_size = std.mem.readInt(u16, extra[extra_offset..][2..4], .little);
- const end = extra_offset + 4 + data_size;
- if (end > extra.len)
- return error.ZipBadExtraFieldSize;
- const data = extra[extra_offset + 4 .. end];
- switch (@as(ExtraHeader, @enumFromInt(header_id))) {
- .zip64_info => try readZip64FileExtents(CentralDirectoryFileHeader, header, &extents, data),
- else => {}, // ignore
- }
- extra_offset = end;
+ var extra_offset: usize = 0;
+ while (extra_offset + 4 <= extra.len) {
+ const header_id = std.mem.readInt(u16, extra[extra_offset..][0..2], .little);
+ const data_size = std.mem.readInt(u16, extra[extra_offset..][2..4], .little);
+ const end = extra_offset + 4 + data_size;
+ if (end > extra.len)
+ return error.ZipBadExtraFieldSize;
+ const data = extra[extra_offset + 4 .. end];
+ switch (@as(ExtraHeader, @enumFromInt(header_id))) {
+ .zip64_info => try readZip64FileExtents(CentralDirectoryFileHeader, header, &extents, data),
+ else => {}, // ignore
}
+ extra_offset = end;
}
-
- return .{
- .version_needed_to_extract = header.version_needed_to_extract,
- .flags = header.flags,
- .compression_method = header.compression_method,
- .last_modification_time = header.last_modification_time,
- .last_modification_date = header.last_modification_date,
- .header_zip_offset = header_zip_offset,
- .crc32 = header.crc32,
- .filename_len = header.filename_len,
- .compressed_size = extents.compressed_size,
- .uncompressed_size = extents.uncompressed_size,
- .file_offset = extents.local_file_header_offset,
- };
}
- pub const Entry = struct {
- version_needed_to_extract: u16,
- flags: GeneralPurposeFlags,
- compression_method: CompressionMethod,
- last_modification_time: u16,
- last_modification_date: u16,
- header_zip_offset: u64,
- crc32: u32,
- filename_len: u32,
- compressed_size: u64,
- uncompressed_size: u64,
- file_offset: u64,
-
- pub fn extract(
- self: Entry,
- stream: SeekableStream,
- options: ExtractOptions,
- filename_buf: []u8,
- dest: std.fs.Dir,
- ) !u32 {
- if (filename_buf.len < self.filename_len)
- return error.ZipInsufficientBuffer;
- const filename = filename_buf[0..self.filename_len];
+ return .{
+ .version_needed_to_extract = header.version_needed_to_extract,
+ .flags = header.flags,
+ .compression_method = header.compression_method,
+ .last_modification_time = header.last_modification_time,
+ .last_modification_date = header.last_modification_date,
+ .header_zip_offset = header_zip_offset,
+ .crc32 = header.crc32,
+ .filename_len = header.filename_len,
+ .compressed_size = extents.compressed_size,
+ .uncompressed_size = extents.uncompressed_size,
+ .file_offset = extents.local_file_header_offset,
+ };
+ }
+ pub const Entry = struct {
+ version_needed_to_extract: u16,
+ flags: GeneralPurposeFlags,
+ compression_method: CompressionMethod,
+ last_modification_time: u16,
+ last_modification_date: u16,
+ header_zip_offset: u64,
+ crc32: u32,
+ filename_len: u32,
+ compressed_size: u64,
+ uncompressed_size: u64,
+ file_offset: u64,
+
+ pub fn extract(
+ self: Entry,
+ stream: *File.Reader,
+ options: ExtractOptions,
+ filename_buf: []u8,
+ dest: std.fs.Dir,
+ ) !u32 {
+ if (filename_buf.len < self.filename_len)
+ return error.ZipInsufficientBuffer;
+ switch (self.compression_method) {
+ .store, .deflate => {},
+ else => return error.UnsupportedCompressionMethod,
+ }
+ const filename = filename_buf[0..self.filename_len];
+ {
try stream.seekTo(self.header_zip_offset + @sizeOf(CentralDirectoryFileHeader));
+ try stream.interface.readSlice(filename);
+ }
- {
- const len = try (if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()).readAll(filename);
- if (len != filename.len)
- return error.ZipBadFileOffset;
- }
- const local_data_header_offset: u64 = local_data_header_offset: {
- const local_header = blk: {
- try stream.seekTo(self.file_offset);
- break :blk try (if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()).readStructEndian(LocalFileHeader, .little);
- };
- if (!std.mem.eql(u8, &local_header.signature, &local_file_header_sig))
- return error.ZipBadFileOffset;
- if (local_header.version_needed_to_extract != self.version_needed_to_extract)
- return error.ZipMismatchVersionNeeded;
- if (local_header.last_modification_time != self.last_modification_time)
- return error.ZipMismatchModTime;
- if (local_header.last_modification_date != self.last_modification_date)
- return error.ZipMismatchModDate;
-
- if (@as(u16, @bitCast(local_header.flags)) != @as(u16, @bitCast(self.flags)))
- return error.ZipMismatchFlags;
- if (local_header.crc32 != 0 and local_header.crc32 != self.crc32)
- return error.ZipMismatchCrc32;
- var extents: FileExtents = .{
- .uncompressed_size = local_header.uncompressed_size,
- .compressed_size = local_header.compressed_size,
- .local_file_header_offset = 0,
- };
- if (local_header.extra_len > 0) {
- var extra_buf: [std.math.maxInt(u16)]u8 = undefined;
- const extra = extra_buf[0..local_header.extra_len];
+ const local_data_header_offset: u64 = local_data_header_offset: {
+ const local_header = blk: {
+ try stream.seekTo(self.file_offset);
+ break :blk try stream.interface.takeStructEndian(LocalFileHeader, .little);
+ };
+ if (!std.mem.eql(u8, &local_header.signature, &local_file_header_sig))
+ return error.ZipBadFileOffset;
+ if (local_header.version_needed_to_extract != self.version_needed_to_extract)
+ return error.ZipMismatchVersionNeeded;
+ if (local_header.last_modification_time != self.last_modification_time)
+ return error.ZipMismatchModTime;
+ if (local_header.last_modification_date != self.last_modification_date)
+ return error.ZipMismatchModDate;
+
+ if (@as(u16, @bitCast(local_header.flags)) != @as(u16, @bitCast(self.flags)))
+ return error.ZipMismatchFlags;
+ if (local_header.crc32 != 0 and local_header.crc32 != self.crc32)
+ return error.ZipMismatchCrc32;
+ var extents: FileExtents = .{
+ .uncompressed_size = local_header.uncompressed_size,
+ .compressed_size = local_header.compressed_size,
+ .local_file_header_offset = 0,
+ };
+ if (local_header.extra_len > 0) {
+ var extra_buf: [std.math.maxInt(u16)]u8 = undefined;
+ const extra = extra_buf[0..local_header.extra_len];
-
- {
- try stream.seekTo(self.file_offset + @sizeOf(LocalFileHeader) + local_header.filename_len);
- const len = try (if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()).readAll(extra);
- if (len != extra.len)
- return error.ZipTruncated;
- }
+ {
+ try stream.seekTo(self.file_offset + @sizeOf(LocalFileHeader) + local_header.filename_len);
+ try stream.interface.readSlice(extra);
+ }
- var extra_offset: usize = 0;
- while (extra_offset + 4 <= local_header.extra_len) {
- const header_id = std.mem.readInt(u16, extra[extra_offset..][0..2], .little);
- const data_size = std.mem.readInt(u16, extra[extra_offset..][2..4], .little);
- const end = extra_offset + 4 + data_size;
- if (end > local_header.extra_len)
- return error.ZipBadExtraFieldSize;
- const data = extra[extra_offset + 4 .. end];
- switch (@as(ExtraHeader, @enumFromInt(header_id))) {
- .zip64_info => try readZip64FileExtents(LocalFileHeader, local_header, &extents, data),
- else => {}, // ignore
- }
- extra_offset = end;
- }
- }
+ var extra_offset: usize = 0;
+ while (extra_offset + 4 <= local_header.extra_len) {
+ const header_id = std.mem.readInt(u16, extra[extra_offset..][0..2], .little);
+ const data_size = std.mem.readInt(u16, extra[extra_offset..][2..4], .little);
+ const end = extra_offset + 4 + data_size;
+ if (end > local_header.extra_len)
+ return error.ZipBadExtraFieldSize;
+ const data = extra[extra_offset + 4 .. end];
+ switch (@as(ExtraHeader, @enumFromInt(header_id))) {
+ .zip64_info => try readZip64FileExtents(LocalFileHeader, local_header, &extents, data),
+ else => {}, // ignore
+ }
+ extra_offset = end;
+ }
+ }
- if (extents.compressed_size != 0 and
- extents.compressed_size != self.compressed_size)
- return error.ZipMismatchCompLen;
- if (extents.uncompressed_size != 0 and
- extents.uncompressed_size != self.uncompressed_size)
- return error.ZipMismatchUncompLen;
+ if (extents.compressed_size != 0 and
+ extents.compressed_size != self.compressed_size)
+ return error.ZipMismatchCompLen;
+ if (extents.uncompressed_size != 0 and
+ extents.uncompressed_size != self.uncompressed_size)
+ return error.ZipMismatchUncompLen;
- if (local_header.filename_len != self.filename_len)
- return error.ZipMismatchFilenameLen;
+ if (local_header.filename_len != self.filename_len)
+ return error.ZipMismatchFilenameLen;
- break :local_data_header_offset @as(u64, local_header.filename_len) +
- @as(u64, local_header.extra_len);
- };
+ break :local_data_header_offset @as(u64, local_header.filename_len) +
+ @as(u64, local_header.extra_len);
+ };
- if (isBadFilename(filename))
- return error.ZipBadFilename;
+ if (isBadFilename(filename))
+ return error.ZipBadFilename;
- if (options.allow_backslashes) {
- std.mem.replaceScalar(u8, filename, '\\', '/');
- } else {
- if (std.mem.indexOfScalar(u8, filename, '\\')) |_|
- return error.ZipFilenameHasBackslash;
- }
+ if (options.allow_backslashes) {
+ std.mem.replaceScalar(u8, filename, '\\', '/');
+ } else {
+ if (std.mem.indexOfScalar(u8, filename, '\\')) |_|
+ return error.ZipFilenameHasBackslash;
+ }
- // All entries that end in '/' are directories
- if (filename[filename.len - 1] == '/') {
- if (self.uncompressed_size != 0)
- return error.ZipBadDirectorySize;
- try dest.makePath(filename[0 .. filename.len - 1]);
- return std.hash.Crc32.hash(&.{});
- }
+ // All entries that end in '/' are directories
+ if (filename[filename.len - 1] == '/') {
+ if (self.uncompressed_size != 0)
+ return error.ZipBadDirectorySize;
+ try dest.makePath(filename[0 .. filename.len - 1]);
+ return std.hash.Crc32.hash(&.{});
+ }
- const out_file = blk: {
- if (std.fs.path.dirname(filename)) |dirname| {
- var parent_dir = try dest.makeOpenPath(dirname, .{});
- defer parent_dir.close();
+ const out_file = blk: {
+ if (std.fs.path.dirname(filename)) |dirname| {
+ var parent_dir = try dest.makeOpenPath(dirname, .{});
+ defer parent_dir.close();
- const basename = std.fs.path.basename(filename);
- break :blk try parent_dir.createFile(basename, .{ .exclusive = true });
- }
- break :blk try dest.createFile(filename, .{ .exclusive = true });
- };
- defer out_file.close();
- const local_data_file_offset: u64 =
- @as(u64, self.file_offset) +
- @as(u64, @sizeOf(LocalFileHeader)) +
- local_data_header_offset;
- try stream.seekTo(local_data_file_offset);
- var limited_reader = std.io.limitedReader((if (@TypeOf(stream.context) == std.fs.File) stream.context.deprecatedReader() else stream.context.reader()), self.compressed_size);
- const crc = try decompress(
- self.compression_method,
- self.uncompressed_size,
- limited_reader.reader(),
- out_file.deprecatedWriter(),
- );
- if (limited_reader.bytes_left != 0)
- return error.ZipDecompressTruncated;
- return crc;
- }
- };
+ const basename = std.fs.path.basename(filename);
+ break :blk try parent_dir.createFile(basename, .{ .exclusive = true });
+ }
+ break :blk try dest.createFile(filename, .{ .exclusive = true });
+ };
+ defer out_file.close();
+ var file_writer = out_file.writer();
+ var file_bw = file_writer.writer(&.{});
+ const local_data_file_offset: u64 =
+ @as(u64, self.file_offset) +
+ @as(u64, @sizeOf(LocalFileHeader)) +
+ local_data_header_offset;
+ try stream.seekTo(local_data_file_offset);
+ var limited_file_reader = stream.interface.limited(.limited(self.compressed_size));
+ var file_read_buffer: [1000]u8 = undefined;
+ var decompress_read_buffer: [1000]u8 = undefined;
+ var limited_br = limited_file_reader.reader().buffered(&file_read_buffer);
+ var decompress: Decompress = undefined;
+ var decompress_br = decompress.readable(&limited_br, self.compression_method, &decompress_read_buffer);
+ const start_out = file_bw.count;
+ var hash_writer = file_bw.hashed(std.hash.Crc32.init());
+ var hash_bw = hash_writer.writer(&.{});
+ decompress_br.readAll(&hash_bw, .limited(self.uncompressed_size)) catch |err| switch (err) {
+ error.ReadFailed => return stream.err.?,
+ error.WriteFailed => return file_writer.err.?,
+ error.EndOfStream => return error.ZipDecompressTruncated,
+ };
+ if (limited_file_reader.remaining.nonzero()) return error.ZipDecompressTruncated;
+ const written = file_bw.count - start_out;
+ if (written != self.uncompressed_size) return error.ZipUncompressSizeMismatch;
+ return hash_writer.hasher.final();
+ }
- };
-}
+ };
+};
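
The rewritten extract path above streams the decompressor straight into a CRC-hashing writer, so the checksum falls out of the same pass that writes the file to disk. A minimal sketch of the equivalent check using only std.hash.Crc32, which the diff already relies on; the helper name and the in-memory buffer are illustrative assumptions, not part of this commit:

const std = @import("std");

/// Illustrative helper: verify a fully extracted buffer against an entry's
/// expected CRC-32, mirroring the comparison extract's caller performs.
fn checkCrc32(expected: u32, contents: []const u8) error{ZipCrcMismatch}!void {
    if (std.hash.Crc32.hash(contents) != expected) return error.ZipCrcMismatch;
}

test checkCrc32 {
    // The CRC-32 of empty input is 0, which is why directory entries above
    // return std.hash.Crc32.hash(&.{}).
    try checkCrc32(std.hash.Crc32.hash(&.{}), "");
}
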
// returns true if `filename` starts with `root` followed by a forward slash
fn filenameInRoot(filename: []const u8, root: []const u8) bool {
@@ -614,17 +640,13 @@ pub const ExtractOptions = struct {
diagnostics: ?*Diagnostics = null,
};
-/// Extract the zipped files inside `seekable_stream` to the given `dest` directory.
-/// Note that `seekable_stream` must be an instance of `std.io.SeekableStream` and
-/// its context must also have a `.reader()` method that returns an instance of
-/// `std.io.GenericReader`.
-pub fn extract(dest: std.fs.Dir, seekable_stream: anytype, options: ExtractOptions) !void {
- const SeekableStream = @TypeOf(seekable_stream);
- var iter = try Iterator(SeekableStream).init(seekable_stream);
+/// Extract the zipped files inside `fr` to the given `dest` directory.
+pub fn extract(dest: std.fs.Dir, fr: *File.Reader, options: ExtractOptions) !void {
+ var iter = try Iterator.init(fr);
var filename_buf: [std.fs.max_path_bytes]u8 = undefined;
while (try iter.next()) |entry| {
- const crc32 = try entry.extract(seekable_stream, options, &filename_buf, dest);
+ const crc32 = try entry.extract(fr, options, &filename_buf, dest);
if (crc32 != entry.crc32)
return error.ZipCrcMismatch;
if (options.diagnostics) |d| {
@@ -633,173 +655,6 @@ pub fn extract(dest: std.fs.Dir, seekable_stream: anytype, options: ExtractOptio
}
}
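
With the generic seekable-stream plumbing gone, callers hand extract a positional std.fs.File.Reader directly. A minimal caller sketch of the new signature, assuming the post-rework File.reader(buffer) API; the archive path, buffer size, and helper name are illustrative, not part of this commit:

const std = @import("std");

/// Illustrative helper: extract "archive.zip" from the cwd into `dest`.
fn extractArchive(dest: std.fs.Dir) !void {
    const file = try std.fs.cwd().openFile("archive.zip", .{});
    defer file.close();
    var read_buffer: [4096]u8 = undefined;
    // The File.Reader owns the buffer and tracks the seek position
    // that extract uses to jump between central-directory entries.
    var file_reader = file.reader(&read_buffer);
    try std.zip.extract(dest, &file_reader, .{});
}
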
-fn testZip(options: ExtractOptions, comptime files: []const File, write_opt: testutil.WriteZipOptions) !void {
- var store: [files.len]FileStore = undefined;
- try testZipWithStore(options, files, write_opt, &store);
-}
-fn testZipWithStore(
- options: ExtractOptions,
- test_files: []const File,
- write_opt: testutil.WriteZipOptions,
- store: []FileStore,
-) !void {
- var zip_buf: [4096]u8 = undefined;
- var fbs = try testutil.makeZipWithStore(&zip_buf, test_files, write_opt, store);
-
- var tmp = testing.tmpDir(.{ .no_follow = true });
- defer tmp.cleanup();
- try extract(tmp.dir, fbs.seekableStream(), options);
- try testutil.expectFiles(test_files, tmp.dir, .{});
-}
-fn testZipError(expected_error: anyerror, file: File, options: ExtractOptions) !void {
- var zip_buf: [4096]u8 = undefined;
- var store: [1]FileStore = undefined;
- var fbs = try testutil.makeZipWithStore(&zip_buf, &[_]File{file}, .{}, &store);
- var tmp = testing.tmpDir(.{ .no_follow = true });
- defer tmp.cleanup();
- try testing.expectError(expected_error, extract(tmp.dir, fbs.seekableStream(), options));
-}
-
-test "zip one file" {
- try testZip(.{}, &[_]File{
- .{ .name = "onefile.txt", .content = "Just a single file\n", .compression = .store },
- }, .{});
-}
-test "zip multiple files" {
- try testZip(.{ .allow_backslashes = true }, &[_]File{
- .{ .name = "foo", .content = "a foo file\n", .compression = .store },
- .{ .name = "subdir/bar", .content = "bar is this right?\nanother newline\n", .compression = .store },
- .{ .name = "subdir\\whoa", .content = "you can do backslashes", .compression = .store },
- .{ .name = "subdir/another/baz", .content = "bazzy mc bazzerson", .compression = .store },
- }, .{});
-}
-test "zip deflated" {
- try testZip(.{}, &[_]File{
- .{ .name = "deflateme", .content = "This is a deflated file.\nIt should be smaller in the Zip file1\n", .compression = .deflate },
- // TODO: re-enable this if/when we add support for deflate64
- //.{ .name = "deflateme64", .content = "The 64k version of deflate!\n", .compression = .deflate64 },
- .{ .name = "raw", .content = "Not all files need to be deflated in the same Zip.\n", .compression = .store },
- }, .{});
-}
-test "zip verify filenames" {
- // no empty filenames
- try testZipError(error.ZipBadFilename, .{ .name = "", .content = "", .compression = .store }, .{});
- // no absolute paths
- try testZipError(error.ZipBadFilename, .{ .name = "/", .content = "", .compression = .store }, .{});
- try testZipError(error.ZipBadFilename, .{ .name = "/foo", .content = "", .compression = .store }, .{});
- try testZipError(error.ZipBadFilename, .{ .name = "/foo/bar", .content = "", .compression = .store }, .{});
- // no '..' components
- try testZipError(error.ZipBadFilename, .{ .name = "..", .content = "", .compression = .store }, .{});
- try testZipError(error.ZipBadFilename, .{ .name = "foo/..", .content = "", .compression = .store }, .{});
- try testZipError(error.ZipBadFilename, .{ .name = "foo/bar/..", .content = "", .compression = .store }, .{});
- try testZipError(error.ZipBadFilename, .{ .name = "foo/bar/../", .content = "", .compression = .store }, .{});
- // no backslashes
- try testZipError(error.ZipFilenameHasBackslash, .{ .name = "foo\\bar", .content = "", .compression = .store }, .{});
-}
-
-test "zip64" {
- const test_files = [_]File{
- .{ .name = "fram", .content = "fram foo fro fraba", .compression = .store },
- .{ .name = "subdir/barro", .content = "aljdk;jal;jfd;lajkf", .compression = .store },
- };
-
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .record_count_disk = std.math.maxInt(u16), // trigger zip64
- },
- });
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .record_count_total = std.math.maxInt(u16), // trigger zip64
- },
- });
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .record_count_disk = std.math.maxInt(u16), // trigger zip64
- .record_count_total = std.math.maxInt(u16), // trigger zip64
- },
- });
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .central_directory_size = std.math.maxInt(u32), // trigger zip64
- },
- });
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .central_directory_offset = std.math.maxInt(u32), // trigger zip64
- },
- });
- try testZip(.{}, &test_files, .{
- .end = .{
- .zip64 = .{},
- .central_directory_offset = std.math.maxInt(u32), // trigger zip64
- },
- .local_header = .{
- .zip64 = .{ // trigger local header zip64
- .data_size = 16,
- },
- .compressed_size = std.math.maxInt(u32),
- .uncompressed_size = std.math.maxInt(u32),
- .extra_len = 20,
- },
- });
-}
-
-test "bad zip files" {
- var tmp = testing.tmpDir(.{ .no_follow = true });
- defer tmp.cleanup();
- var zip_buf: [4096]u8 = undefined;
-
- const file_a = [_]File{.{ .name = "a", .content = "", .compression = .store }};
-
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .sig = [_]u8{ 1, 2, 3, 4 } } });
- try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .comment_len = 1 } });
- try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .comment = "a", .comment_len = 0 } });
- try testing.expectError(error.ZipNoEndRecord, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .disk_number = 1 } });
- try testing.expectError(error.ZipMultiDiskUnsupported, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .central_directory_disk_number = 1 } });
- try testing.expectError(error.ZipMultiDiskUnsupported, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .record_count_disk = 1 } });
- try testing.expectError(error.ZipDiskRecordCountTooLarge, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &.{}, .{ .end = .{ .central_directory_size = 1 } });
- try testing.expectError(error.ZipCdOversized, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &file_a, .{ .end = .{ .central_directory_size = 0 } });
- try testing.expectError(error.ZipCdUndersized, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &file_a, .{ .end = .{ .central_directory_offset = 0 } });
- try testing.expectError(error.ZipBadCdOffset, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
- {
- var fbs = try testutil.makeZip(&zip_buf, &file_a, .{
- .end = .{
- .zip64 = .{ .locator_sig = [_]u8{ 1, 2, 3, 4 } },
- .central_directory_size = std.math.maxInt(u32), // trigger 64
- },
- });
- try testing.expectError(error.ZipBadLocatorSig, extract(tmp.dir, fbs.seekableStream(), .{}));
- }
+test {
+ _ = @import("zip/test.zig");
}