const Decompress = @This();
const std = @import("../../std.zig");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const Crc32 = std.hash.Crc32;
const Crc64 = std.hash.crc.Crc64Xz;
const Sha256 = std.crypto.hash.sha2.Sha256;
const lzma2 = std.compress.lzma2;
const Writer = std.Io.Writer;
const Reader = std.Io.Reader;
const assert = std.debug.assert;

/// Underlying compressed data stream to pull bytes from.
input: *Reader,
/// Uncompressed bytes output by this stream implementation.
reader: Reader,
gpa: Allocator,
check: Check,
block_count: usize,
err: ?Error,

pub const Error = error{
    ReadFailed,
    OutOfMemory,
    CorruptInput,
    EndOfStream,
    WrongChecksum,
    Unsupported,
    Overflow,
    InvalidRangeCode,
    DecompressedSizeMismatch,
    CompressedSizeMismatch,
};

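/// Integrity-check type declared in the Stream Flags and appended after each
/// block's compressed data. The values are the check IDs defined by the .xz
/// file format; unnamed values are valid on the wire but unsupported here.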
pub const Check = enum(u4) {
    none = 0x00,
    crc32 = 0x01,
    crc64 = 0x04,
    sha256 = 0x0A,
    _,
};

pub const StreamFlags = packed struct(u16) {
    /// Must be zero; `null` is a Zig keyword, hence the quoted field name.
    @"null": u8 = 0,
    check: Check,
    reserved: u4 = 0,
};

pub const InitError = error{
    NotXzStream,
    WrongChecksum,
};

/// XZ uses a series of LZMA2 blocks which each specify a dictionary size
/// anywhere from 4K to 4G. Thus, this API dynamically allocates the dictionary
/// as-needed.
pub fn init(
    input: *Reader,
    gpa: Allocator,
    /// Decompress takes ownership of this buffer and resizes it with `gpa`.
    buffer: []u8,
) !Decompress {
    const magic = try input.takeArray(6);
    if (!std.mem.eql(u8, magic, &.{ 0xFD, '7', 'z', 'X', 'Z', 0x00 }))
        return error.NotXzStream;

    const computed_checksum = Crc32.hash(try input.peek(@sizeOf(StreamFlags)));
    const stream_flags = input.takeStruct(StreamFlags, .little) catch unreachable;
    const stored_hash = try input.takeInt(u32, .little);
    if (computed_checksum != stored_hash) return error.WrongChecksum;

    return .{
        .input = input,
        .reader = .{
            .vtable = &.{
                .stream = stream,
                .readVec = readVec,
                .discard = discard,
            },
            .buffer = buffer,
            .seek = 0,
            .end = 0,
        },
        .gpa = gpa,
        .check = stream_flags.check,
        .block_count = 0,
        .err = null,
    };
}

/// Reclaim ownership of the buffer passed to `init`.
pub fn takeBuffer(d: *Decompress) []u8 {
    const buffer = d.reader.buffer;
    d.reader.buffer = &.{};
    return buffer;
}

pub fn deinit(d: *Decompress) void {
    const gpa = d.gpa;
    gpa.free(d.reader.buffer);
    d.* = undefined;
}

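// Minimal usage sketch, not part of this file's API: drain a whole .xz
// stream from `input` into `out`. The function name, the empty initial
// buffer, and the choice of `Reader.streamRemaining` as the pump are
// illustrative assumptions, not requirements of `Decompress`.
fn exampleDecompressAll(input: *Reader, gpa: Allocator, out: *Writer) !void {
    var d = try Decompress.init(input, gpa, &.{});
    defer d.deinit();
    // Forward decompressed bytes from `d.reader` into `out` until the
    // stream index and footer have been validated and the end is reached.
    _ = try d.reader.streamRemaining(out);
}
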
fn readVec(r: *Reader, data: [][]u8) Reader.Error!usize {
    _ = data;
    return readIndirect(r);
}

fn stream(r: *Reader, w: *Writer, limit: std.Io.Limit) Reader.StreamError!usize {
    _ = w;
    _ = limit;
    return readIndirect(r);
}

fn discard(r: *Reader, limit: std.Io.Limit) Reader.Error!usize {
    const d: *Decompress = @alignCast(@fieldParentPtr("reader", r));
    _ = d;
    _ = limit;
    @panic("TODO");
}

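/// Shared implementation behind `readVec` and `stream`: decompresses one
/// block directly into `r.buffer` (growing it via `gpa`) and returns 0,
/// letting the caller's `Reader` machinery serve bytes from the buffer.
/// `block_count == maxInt(usize)` is the sentinel for a finished stream.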
fn readIndirect(r: *Reader) Reader.Error!usize {
    const d: *Decompress = @alignCast(@fieldParentPtr("reader", r));
    const gpa = d.gpa;
    const input = d.input;

    var allocating = Writer.Allocating.initOwnedSlice(gpa, r.buffer);
    allocating.writer.end = r.end;
    defer {
        r.buffer = allocating.writer.buffer;
        r.end = allocating.writer.end;
    }

    if (d.err != null) return error.ReadFailed;
    if (d.block_count == std.math.maxInt(usize)) return error.EndOfStream;

    readBlock(input, &allocating) catch |err| switch (err) {
        error.WriteFailed => {
            d.err = error.OutOfMemory;
            return error.ReadFailed;
        },
        error.SuccessfulEndOfStream => {
            finish(d) catch |finish_err| {
                d.err = finish_err;
                return error.ReadFailed;
            };
            d.block_count = std.math.maxInt(usize);
            return error.EndOfStream;
        },
        else => |e| {
            d.err = e;
            return error.ReadFailed;
        },
    };
    switch (d.check) {
        .none => {},
        .crc32 => {
            const declared_checksum = try input.takeInt(u32, .little);
            // TODO
            //const hash_a = Crc32.hash(unpacked_bytes);
            //if (hash_a != hash_b) return error.WrongChecksum;
            _ = declared_checksum;
        },
        .crc64 => {
            const declared_checksum = try input.takeInt(u64, .little);
            // TODO
            //const hash_a = Crc64.hash(unpacked_bytes);
            //if (hash_a != hash_b) return error.WrongChecksum;
            _ = declared_checksum;
        },
        .sha256 => {
            const declared_hash = try input.take(Sha256.digest_length);
            // TODO
            //var hash_a: [Sha256.digest_length]u8 = undefined;
            //Sha256.hash(unpacked_bytes, &hash_a, .{});
            //if (!std.mem.eql(u8, &hash_a, &hash_b))
            //    return error.WrongChecksum;
            _ = declared_hash;
        },
        else => {
            d.err = error.Unsupported;
            return error.ReadFailed;
        },
    }
    d.block_count += 1;
    return 0;
}

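/// Decodes a single Block: the Block Header (size byte, flags, optional
/// packed/unpacked sizes, filter chain, zero padding, CRC32), the LZMA2
/// compressed data, and the Block Padding that re-aligns the stream to four
/// bytes. Seeing the Index indicator byte (0x00) in place of a Block Header
/// returns `error.SuccessfulEndOfStream`.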
fn readBlock(input: *Reader, allocating: *Writer.Allocating) !void {
    var packed_size: ?u64 = null;
    var unpacked_size: ?u64 = null;

    const header_size = h: {
        // Read the block header via peeking so that we can hash the whole thing too.
        const first_byte: usize = try input.peekByte();
        if (first_byte == 0) return error.SuccessfulEndOfStream;

        const declared_header_size = first_byte * 4;
        try input.fill(declared_header_size);
        const header_seek_start = input.seek;
        input.toss(1);

        const Flags = packed struct(u8) {
            last_filter_index: u2,
            reserved: u4,
            has_packed_size: bool,
            has_unpacked_size: bool,
        };
        const flags = try input.takeStruct(Flags, .little);

        const filter_count = @as(u3, flags.last_filter_index) + 1;
        if (filter_count > 1) return error.Unsupported;

        if (flags.has_packed_size) packed_size = try input.takeLeb128(u64);
        if (flags.has_unpacked_size) unpacked_size = try input.takeLeb128(u64);

        const FilterId = enum(u64) {
            lzma2 = 0x21,
            _,
        };

        const filter_id: FilterId = @enumFromInt(try input.takeLeb128(u64));
        if (filter_id != .lzma2) return error.Unsupported;

        const properties_size = try input.takeLeb128(u64);
        if (properties_size != 1) return error.CorruptInput;
        // TODO: use filter properties
        _ = try input.takeByte();

        const actual_header_size = input.seek - header_seek_start;
        if (actual_header_size > declared_header_size) return error.CorruptInput;
        const remaining_bytes = declared_header_size - actual_header_size;
        for (0..remaining_bytes) |_| {
            if (try input.takeByte() != 0) return error.CorruptInput;
        }

        const header_slice = input.buffer[header_seek_start..][0..declared_header_size];
        const computed_checksum = Crc32.hash(header_slice);
        const declared_checksum = try input.takeInt(u32, .little);
        if (computed_checksum != declared_checksum) return error.WrongChecksum;
        break :h declared_header_size;
    };

    // Compressed Data

    var lzma2_decode = try lzma2.Decode.init(allocating.allocator);
    defer lzma2_decode.deinit(allocating.allocator);
    const before_size = allocating.writer.end;
    const packed_bytes_read = try lzma2_decode.decompress(input, allocating);
    const unpacked_bytes = allocating.writer.end - before_size;

    if (packed_size) |s| {
        if (s != packed_bytes_read) return error.CorruptInput;
    }

    if (unpacked_size) |s| {
        if (s != unpacked_bytes) return error.CorruptInput;
    }

    // Block Padding
    const block_counter = header_size + packed_bytes_read;
    const padding = try input.take(@intCast((4 - (block_counter % 4)) % 4));
    for (padding) |byte| {
        if (byte != 0) return error.CorruptInput;
    }
}

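/// Validates the end of the stream: the Index (block count, one
/// unpadded-size/uncompressed-size record pair per block, padding, and
/// CRC32) followed by the Stream Footer (CRC32, Backward Size, Stream
/// Flags, and the "YZ" footer magic).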
fn finish(d: *Decompress) !void {
    const input = d.input;
    const index_size = blk: {
        // Assume that we already peeked a zero in readBlock().
        assert(input.buffered()[0] == 0);
        var input_counter: u64 = 1;
        var checksum: Crc32 = .init();
        checksum.update(&.{0});
        input.toss(1);

        const record_count = try countLeb128(input, u64, &input_counter, &checksum);
        if (record_count != d.block_count)
            return error.CorruptInput;

        for (0..@intCast(record_count)) |_| {
            // TODO: validate records
            _ = try countLeb128(input, u64, &input_counter, &checksum);
            _ = try countLeb128(input, u64, &input_counter, &checksum);
        }

        const padding = try input.take(@intCast((4 - (input_counter % 4)) % 4));
        for (padding) |byte| {
            if (byte != 0) return error.CorruptInput;
        }
        checksum.update(padding);

        const declared_checksum = try input.takeInt(u32, .little);
        const computed_checksum = checksum.final();
        if (computed_checksum != declared_checksum) return error.WrongChecksum;

        break :blk input_counter + padding.len + 4;
    };

    const declared_checksum = try input.takeInt(u32, .little);
    const computed_checksum = Crc32.hash(try input.peek(4 + @sizeOf(StreamFlags)));
    if (declared_checksum != computed_checksum) return error.WrongChecksum;
    const backward_size = (@as(u64, try input.takeInt(u32, .little)) + 1) * 4;
    if (backward_size != index_size) return error.CorruptInput;
    input.toss(@sizeOf(StreamFlags));
    if (!std.mem.eql(u8, try input.takeArray(2), &.{ 'Y', 'Z' }))
        return error.CorruptInput;
}

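/// Takes a LEB128-encoded integer from `reader`, feeding the consumed bytes
/// into `hasher` and adding their count to `counter`.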
fn countLeb128(reader: *Reader, comptime T: type, counter: *u64, hasher: *Crc32) !T {
    // A LEB128 u64 spans at most 10 bytes (the .xz format itself caps
    // multibyte integers at 9); buffer them up front so that the `seek`
    // arithmetic below cannot be invalidated by a mid-parse refill.
    try reader.fill(10);
    const start = reader.seek;
    const result = try reader.takeLeb128(T);
    const read_slice = reader.buffer[start..reader.seek];
    hasher.update(read_slice);
    counter.* += read_slice.len;
    return result;
}
319}