const Decompress = @This();
const std = @import("../../std.zig");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const Crc32 = std.hash.Crc32;
const Crc64 = std.hash.crc.Crc64Xz;
const Sha256 = std.crypto.hash.sha2.Sha256;
const lzma2 = std.compress.lzma2;
const Writer = std.Io.Writer;
const Reader = std.Io.Reader;
const assert = std.debug.assert;

/// Underlying compressed data stream to pull bytes from.
input: *Reader,
/// Uncompressed bytes output by this stream implementation.
reader: Reader,
gpa: Allocator,
check: Check,
block_count: usize,
err: ?Error,

pub const Error = error{
    ReadFailed,
    OutOfMemory,
    CorruptInput,
    EndOfStream,
    WrongChecksum,
    Unsupported,
    Overflow,
    InvalidRangeCode,
    DecompressedSizeMismatch,
    CompressedSizeMismatch,
};

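/// Integrity check applied to the uncompressed data of each block, as declared
/// by the Stream Flags. These are the check IDs defined by the xz format; all
/// other values are reserved and rejected during decompression.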
pub const Check = enum(u4) {
    none = 0x00,
    crc32 = 0x01,
    crc64 = 0x04,
    sha256 = 0x0A,
    _,
};

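/// In-memory layout of the two Stream Flags bytes from the Stream Header: the
/// first byte must be zero, the low nibble of the second byte selects the
/// integrity check, and the high nibble is reserved.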
pub const StreamFlags = packed struct(u16) {
    null: u8 = 0,
    check: Check,
    reserved: u4 = 0,
};

pub const InitError = error{
    NotXzStream,
    WrongChecksum,
};

/// An xz stream is a series of LZMA2 blocks, each of which may specify a
/// dictionary size anywhere from 4 KiB to 4 GiB. This API therefore allocates
/// the dictionary dynamically, as needed.
pub fn init(
    input: *Reader,
    gpa: Allocator,
    /// Decompress takes ownership of this buffer and resizes it with `gpa`.
    buffer: []u8,
) !Decompress {
    const magic = try input.takeArray(6);
    if (!std.mem.eql(u8, magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 }))
        return error.NotXzStream;

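    // The Stream Header ends with a CRC32 that covers only the two Stream
    // Flags bytes; peek them first so the same bytes can also be parsed below.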
    const computed_checksum = Crc32.hash(try input.peek(@sizeOf(StreamFlags)));
    const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable;
    const declared_checksum = try input.takeInt(u32, .little);
    if (computed_checksum != declared_checksum) return error.WrongChecksum;

    return .{
        .input = input,
        .reader = .{
            .vtable = &.{
                .stream = stream,
                .readVec = readVec,
                .discard = discard,
            },
            .buffer = buffer,
            .seek = 0,
            .end = 0,
        },
        .gpa = gpa,
        .check = stream_flags.check,
        .block_count = 0,
        .err = null,
    };
}

/// Reclaim ownership of the buffer passed to `init`.
pub fn takeBuffer(d: *Decompress) []u8 {
    const buffer = d.reader.buffer;
    d.reader.buffer = &.{};
    return buffer;
}

pub fn deinit(d: *Decompress) void {
    const gpa = d.gpa;
    gpa.free(d.reader.buffer);
    d.* = undefined;
}
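
// A minimal usage sketch in test form, not exercising real compressed data:
// it only checks that `init` rejects input that does not start with the xz
// stream magic. It assumes `std.Io.Reader.fixed` and the usual `std.testing`
// helpers; the test itself is illustrative, not part of the original file.
test "init rejects a stream without the xz magic" {
    var input: Reader = .fixed("definitely not xz");
    try std.testing.expectError(
        error.NotXzStream,
        init(&input, std.testing.allocator, &.{}),
    );
}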

fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize {
    _ = data;
    return readIndirect(r);
}

fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize {
    _ = w;
    _ = limit;
    return readIndirect(r);
}

fn discard(r: *Reader, limit: std.Io.Limit) Reader.Error!usize {
    const d: *Decompress = @alignCast(@fieldParentPtr("reader", r));
    _ = d;
    _ = limit;
    @panic("TODO");
}

/// Shared implementation behind the vtable callbacks: decompressed bytes are
/// appended directly to `reader.buffer` rather than to the caller-provided
/// destination, so this always reports 0 bytes streamed directly.
fn readIndirect(r: *Reader) Reader.Error!usize {
    const d: *Decompress = @alignCast(@fieldParentPtr("reader", r));
    const gpa = d.gpa;
    const input = d.input;

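    // Hand the reader's buffer to a growable Allocating writer so that
    // decompressed bytes are appended in place; the defer puts the (possibly
    // reallocated) buffer back before returning.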
    var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer);
    allocating.writer.end = r.end;
    defer {
        r.buffer = allocating.writer.buffer;
        r.end = allocating.writer.end;
    }

    if (d.err != null) return error.ReadFailed;
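    // A block_count of maxInt(usize) is the sentinel set below once the stream
    // footer has been validated; everything after that is end of stream.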
    if (d.block_count == std.math.maxInt(usize)) return error.EndOfStream;

    readBlock(input, &allocating) catch |err| switch (err) {
        error.WriteFailed => {
            d.err = error.OutOfMemory;
            return error.ReadFailed;
        },
        error.SuccessfulEndOfStream => {
            finish(d) catch |finish_err| {
                d.err = finish_err;
                return error.ReadFailed;
            };
            d.block_count = std.math.maxInt(usize);
            return error.EndOfStream;
        },
        else => |e| {
            d.err = e;
            return error.ReadFailed;
        },
    };
    switch (d.check) {
        .none => {},
        .crc32 => {
            const declared_checksum = try input.takeInt(u32, .little);
            // TODO
            //const hash_a = Crc32.hash(unpacked_bytes);
            //if (hash_a != hash_b) return error.WrongChecksum;
            _ = declared_checksum;
        },
        .crc64 => {
            const declared_checksum = try input.takeInt(u64, .little);
            // TODO
            //const hash_a = Crc64.hash(unpacked_bytes);
            //if (hash_a != hash_b) return error.WrongChecksum;
            _ = declared_checksum;
        },
        .sha256 => {
            const declared_hash = try input.take(Sha256.digest_length);
            // TODO
            //var hash_a: [Sha256.digest_length]u8 = undefined;
            //Sha256.hash(unpacked_bytes, &hash_a, .{});
            //if (!std.mem.eql(u8, &hash_a, &hash_b))
            //    return error.WrongChecksum;
            _ = declared_hash;
        },
        else => {
            d.err = error.Unsupported;
            return error.ReadFailed;
        },
    }
    d.block_count += 1;
    return 0;
}

fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void {
    var packed_size: ?u64 = null;
    var unpacked_size: ?u64 = null;

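    // Block Header layout: a size byte (where 0x00 instead marks the start of
    // the Index), block flags, optional compressed and uncompressed sizes,
    // filter flags, zero padding up to the declared size, and a CRC32 over
    // everything before the CRC itself.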
    const header_size = h: {
        // Parse the block header out of the peek buffer so that the raw bytes
        // stay available for the CRC32 computation below.
        const first_byte: usize = try input.peekByte();
        // A first byte of zero is the Index Indicator: there are no more blocks.
        if (first_byte == 0) return error.SuccessfulEndOfStream;

        const declared_header_size = first_byte * 4;
        try input.fill(declared_header_size);
        const header_seek_start = input.seek;
        input.toss(1);

        const Flags = packed struct(u8) {
            last_filter_index: u2,
            reserved: u4,
            has_packed_size: bool,
            has_unpacked_size: bool,
        };
        const flags = try input.takeStruct(Flags, .little);

        const filter_count = @as(u3, flags.last_filter_index) + 1;
        if (filter_count > 1) return error.Unsupported;

        if (flags.has_packed_size) packed_size = try input.takeLeb128(u64);
        if (flags.has_unpacked_size) unpacked_size = try input.takeLeb128(u64);

        const FilterId = enum(u64) {
            lzma2 = 0x21,
            _,
        };

        const filter_id: FilterId = @enumFromInt(try input.takeLeb128(u64));
        if (filter_id != .lzma2) return error.Unsupported;

        const properties_size = try input.takeLeb128(u64);
        if (properties_size != 1) return error.CorruptInput;
        // TODO: use filter properties
        _ = try input.takeByte();

        const actual_header_size = input.seek - header_seek_start;
        if (actual_header_size > declared_header_size) return error.CorruptInput;
        const remaining_bytes = declared_header_size - actual_header_size;
        for (0..remaining_bytes) |_| {
            if (try input.takeByte() != 0) return error.CorruptInput;
        }

        const header_slice = input.buffer[header_seek_start..][0..declared_header_size];
        const computed_checksum = Crc32.hash(header_slice);
        const declared_checksum = try input.takeInt(u32, .little);
        if (computed_checksum != declared_checksum) return error.WrongChecksum;
        break :h declared_header_size;
    };

    // Compressed Data

    var lzma2_decode = try lzma2.Decode.init(allocating.allocator);
    defer lzma2_decode.deinit(allocating.allocator);
    const before_size = allocating.writer.end;
    const packed_bytes_read = try lzma2_decode.decompress(input, allocating);
    const unpacked_bytes = allocating.writer.end - before_size;

    if (packed_size) |s| {
        if (s != packed_bytes_read) return error.CorruptInput;
    }

    if (unpacked_size) |s| {
        if (s != unpacked_bytes) return error.CorruptInput;
    }

    // Block Padding
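    // The compressed data is followed by 0-3 zero bytes so that the block
    // stays aligned to a multiple of four bytes.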
    const block_counter = header_size + packed_bytes_read;
    const padding = try input.take(@intCast((4 - (block_counter % 4)) % 4));
    for (padding) |byte| {
        if (byte != 0) return error.CorruptInput;
    }
}

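/// Called once the Index Indicator byte has been peeked: validates the Index
/// (record count, one record per block, padding, CRC32) and then the Stream
/// Footer (CRC32, Backward Size, Stream Flags, and the "YZ" footer magic).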
fn finish(d: *Decompress) !void {
    const input = d.input;
    const index_size = blk: {
        // Assume that we already peeked a zero in readBlock().
        assert(input.buffered()[0] == 0);
        var input_counter: u64 = 1;
        var checksum: Crc32 = .init();
        checksum.update(&.{0});
        input.toss(1);

        const record_count = try countLeb128(input, u64, &input_counter, &checksum);
        if (record_count != d.block_count)
            return error.CorruptInput;

        for (0..@intCast(record_count)) |_| {
            // TODO: validate records
            _ = try countLeb128(input, u64, &input_counter, &checksum);
            _ = try countLeb128(input, u64, &input_counter, &checksum);
        }

        const padding = try input.take(@intCast((4 - (input_counter % 4)) % 4));
        for (padding) |byte| {
            if (byte != 0) return error.CorruptInput;
        }
        checksum.update(padding);

        const declared_checksum = try input.takeInt(u32, .little);
        const computed_checksum = checksum.final();
        if (computed_checksum != declared_checksum) return error.WrongChecksum;

        break :blk input_counter + padding.len + 4;
    };

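    // Stream Footer: a CRC32 over the Backward Size and Stream Flags fields,
    // the Backward Size itself (which must equal the size of the Index just
    // read), the Stream Flags, and the two-byte footer magic "YZ".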
    const declared_checksum = try input.takeInt(u32, .little);
    const computed_checksum = Crc32.hash(try input.peek(4 + @sizeOf(StreamFlags)));
    if (declared_checksum != computed_checksum) return error.WrongChecksum;
    const backward_size = (@as(u64, try input.takeInt(u32, .little)) + 1) * 4;
    if (backward_size != index_size) return error.CorruptInput;
    input.toss(@sizeOf(StreamFlags));
    if (!std.mem.eql(u8, try input.takeArray(2), &.{ 'Y', 'Z' }))
        return error.CorruptInput;
}

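/// Reads one variable-length integer (the multibyte encoding used by the xz
/// Index, equivalent to unsigned LEB128), feeding the exact encoded bytes into
/// `hasher` and adding their count to `counter`.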
fn countLeb128(reader: *Reader, comptime T: type, counter: *u64, hasher: *Crc32) !T {
    // Buffer the worst case up front (a 64-bit value takes at most 10 LEB128
    // bytes) so decoding cannot trigger a refill that would invalidate `start`.
    try reader.fill(10);
    const start = reader.seek;
    const result = try reader.takeLeb128(T);
    const read_slice = reader.buffer[start..reader.seek];
    hasher.update(read_slice);
    counter.* += read_slice.len;
    return result;
}