Commit 3158dc424e

LeRoyce Pearson <leroycepearson@geemili.xyz>
2020-03-05 08:07:17
Partially implement cache hash API in zig
1 parent 0ecdbdb
Changed files (2)
lib/std/cache_hash.zig
@@ -0,0 +1,329 @@
+const Blake3 = @import("crypto.zig").Blake3;
+const fs = @import("fs.zig");
+const File = fs.File;
+const base64 = @import("base64.zig");
+const ArrayList = @import("array_list.zig").ArrayList;
+const debug = @import("debug.zig");
+const testing = @import("testing.zig");
+const mem = @import("mem.zig");
+const fmt = @import("fmt.zig");
+const Allocator = mem.Allocator;
+const Buffer = @import("buffer.zig").Buffer;
+const os = @import("os.zig");
+
+const base64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+const base64_pad_char = '=';
+const encoder = base64.Base64Encoder.init(base64_alphabet, base64_pad_char);
+const decoder = base64.Base64Decoder.init(base64_alphabet, base64_pad_char);
+const BIN_DIGEST_LEN = 32;
+
+pub const CacheHashFile = struct {
+    path: ?[]const u8,
+    stat: fs.File.Stat,
+    file_handle: os.fd_t,
+    bin_digest: [BIN_DIGEST_LEN]u8,
+    contents: ?[]const u8,
+
+    pub fn deinit(self: *@This(), alloc: *Allocator) void {
+        if (self.path) |owned_slice| {
+            alloc.free(owned_slice);
+            self.path = null;
+        }
+        if (self.contents) |owned_slice| {
+            alloc.free(owned_slice);
+            self.contents = null;
+        }
+    }
+};
+
+pub const CacheHash = struct {
+    alloc: *Allocator,
+    blake3: Blake3,
+    manifest_dir: []const u8,
+    manifest_file_path: ?[]const u8,
+    manifest_file: ?File,
+    manifest_dirty: bool,
+    force_check_manifest: bool,
+    files: ArrayList(CacheHashFile),
+    b64_digest: ArrayList(u8),
+
+    pub fn init(alloc: *Allocator, manifest_dir_path: []const u8) !@This() {
+        return CacheHash{
+            .alloc = alloc,
+            .blake3 = Blake3.init(),
+            .manifest_dir = manifest_dir_path,
+            .manifest_file_path = null,
+            .manifest_file = null,
+            .manifest_dirty = false,
+            .force_check_manifest = false,
+            .files = ArrayList(CacheHashFile).init(alloc),
+            .b64_digest = ArrayList(u8).init(alloc),
+        };
+    }
+
+    pub fn cache_buf(self: *@This(), val: []const u8) !void {
+        debug.assert(self.manifest_file_path == null);
+
+        var temp_buffer = try self.alloc.alloc(u8, val.len + 1);
+        defer self.alloc.free(temp_buffer);
+
+        mem.copy(u8, temp_buffer, val);
+        temp_buffer[val.len] = 0;
+
+        self.blake3.update(temp_buffer);
+    }
+
+    pub fn cache_file(self: *@This(), file_path: []const u8) !void {
+        debug.assert(self.manifest_file_path == null);
+
+        var cache_hash_file = try self.files.addOne();
+        cache_hash_file.path = try fs.path.resolve(self.alloc, &[_][]const u8{file_path});
+
+        try self.cache_buf(cache_hash_file.path.?);
+    }
+
+    pub fn hit(self: *@This(), out_digest: *ArrayList(u8)) !bool {
+        debug.assert(self.manifest_file_path == null);
+
+        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
+        self.blake3.final(&bin_digest);
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        try self.b64_digest.resize(OUT_DIGEST_LEN);
+        encoder.encode(self.b64_digest.toSlice(), &bin_digest);
+
+        if (self.files.toSlice().len == 0 and !self.force_check_manifest) {
+            try out_digest.resize(OUT_DIGEST_LEN);
+            mem.copy(u8, out_digest.toSlice(), self.b64_digest.toSlice());
+            return true;
+        }
+
+        self.blake3 = Blake3.init();
+        self.blake3.update(&bin_digest);
+
+        {
+            const manifest_file_path_slice = try fs.path.join(self.alloc, &[_][]const u8{ self.manifest_dir, self.b64_digest.toSlice() });
+            var path_buf = ArrayList(u8).fromOwnedSlice(self.alloc, manifest_file_path_slice);
+            defer path_buf.deinit();
+            try path_buf.appendSlice(".txt");
+
+            self.manifest_file_path = path_buf.toOwnedSlice();
+        }
+
+        const cwd = fs.cwd();
+
+        try cwd.makePath(self.manifest_dir);
+
+        // TODO: Open file with a file lock
+        self.manifest_file = try cwd.createFile(self.manifest_file_path.?, .{ .read = true, .truncate = false });
+
+        // TODO: Figure out a good max value?
+        const file_contents = try self.manifest_file.?.inStream().stream.readAllAlloc(self.alloc, 16 * 1024);
+        defer self.alloc.free(file_contents);
+
+        const input_file_count = self.files.len;
+        var any_file_changed = false;
+        var line_iter = mem.tokenize(file_contents, "\n");
+        var idx: usize = 0;
+        while (line_iter.next()) |line| {
+            defer idx += 1;
+
+            var cache_hash_file: *CacheHashFile = undefined;
+            if (idx < input_file_count) {
+                cache_hash_file = self.files.ptrAt(idx);
+            } else {
+                cache_hash_file = try self.files.addOne();
+                cache_hash_file.path = null;
+            }
+
+            var iter = mem.tokenize(line, " ");
+            const file_handle_str = iter.next() orelse return error.InvalidFormat;
+            const mtime_nsec_str = iter.next() orelse return error.InvalidFormat;
+            const digest_str = iter.next() orelse return error.InvalidFormat;
+            const file_path = iter.rest();
+
+            cache_hash_file.file_handle = fmt.parseInt(os.fd_t, file_handle_str, 10) catch return error.InvalidFormat;
+            cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+            decoder.decode(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;
+
+            if (file_path.len == 0) {
+                return error.InvalidFormat;
+            }
+            if (cache_hash_file.path != null and !mem.eql(u8, file_path, cache_hash_file.path.?)) {
+                return error.InvalidFormat;
+            }
+            cache_hash_file.path = try mem.dupe(self.alloc, u8, file_path);
+
+            const this_file = cwd.openFile(cache_hash_file.path.?, .{ .read = true }) catch {
+                self.manifest_file.?.close();
+                self.manifest_file = null;
+                return error.CacheUnavailable;
+            };
+            defer this_file.close();
+            cache_hash_file.stat = try this_file.stat();
+            // TODO: check mtime
+            if (false) {} else {
+                self.manifest_dirty = true;
+
+                // TODO: check for problematic timestamp
+
+                var actual_digest: [32]u8 = undefined;
+                try hash_file(self.alloc, &actual_digest, &this_file);
+
+                if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
+                    mem.copy(u8, &cache_hash_file.bin_digest, &actual_digest);
+                    // keep going until we have the input file digests
+                    any_file_changed = true;
+                }
+            }
+
+            if (!any_file_changed) {
+                self.blake3.update(&cache_hash_file.bin_digest);
+            }
+        }
+
+        if (any_file_changed) {
+            // cache miss
+            // keep the manifest file open (TODO: with rw lock)
+            // reset the hash
+            self.blake3 = Blake3.init();
+            self.blake3.update(&bin_digest);
+            try self.files.resize(input_file_count);
+            for (self.files.toSlice()) |file| {
+                self.blake3.update(&file.bin_digest);
+            }
+            return false;
+        }
+
+        if (idx < input_file_count or idx == 0) {
+            self.manifest_dirty = true;
+            while (idx < input_file_count) : (idx += 1) {
+                var cache_hash_file = self.files.ptrAt(idx);
+                self.populate_file_hash(cache_hash_file) catch |err| {
+                    self.manifest_file.?.close();
+                    self.manifest_file = null;
+                    return error.CacheUnavailable;
+                };
+            }
+            return false;
+        }
+
+        try self.final(out_digest);
+        return true;
+    }
+
+    pub fn populate_file_hash(self: *@This(), cache_hash_file: *CacheHashFile) !void {
+        debug.assert(cache_hash_file.path != null);
+
+        const this_file = try fs.cwd().openFile(cache_hash_file.path.?, .{});
+        defer this_file.close();
+
+        cache_hash_file.stat = try this_file.stat();
+
+        // TODO: check for problematic timestamp
+
+        try hash_file(self.alloc, &cache_hash_file.bin_digest, &this_file);
+        self.blake3.update(&cache_hash_file.bin_digest);
+    }
+
+    pub fn final(self: *@This(), out_digest: *ArrayList(u8)) !void {
+        debug.assert(self.manifest_file_path != null);
+
+        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
+        self.blake3.final(&bin_digest);
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        try out_digest.resize(OUT_DIGEST_LEN);
+        encoder.encode(out_digest.toSlice(), &bin_digest);
+    }
+
+    pub fn write_manifest(self: *@This()) !void {
+        debug.assert(self.manifest_file_path != null);
+
+        const OUT_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
+        var encoded_digest = try Buffer.initSize(self.alloc, OUT_DIGEST_LEN);
+        defer encoded_digest.deinit();
+        var contents = try Buffer.init(self.alloc, "");
+        defer contents.deinit();
+
+        for (self.files.toSlice()) |file| {
+            encoder.encode(encoded_digest.toSlice(), &file.bin_digest);
+            try contents.print("{} {} {} {}\n", .{ file.file_handle, file.stat.mtime, encoded_digest.toSlice(), file.path });
+        }
+
+        try self.manifest_file.?.seekTo(0);
+        try self.manifest_file.?.writeAll(contents.toSlice());
+    }
+
+    pub fn release(self: *@This()) void {
+        debug.assert(self.manifest_file_path != null);
+
+        if (self.manifest_dirty) {
+            self.write_manifest() catch |err| {
+                debug.warn("Unable to write cache file '{}': {}\n", .{ self.manifest_file_path, err });
+            };
+        }
+
+        self.manifest_file.?.close();
+        if (self.manifest_file_path) |owned_slice| {
+            self.alloc.free(owned_slice);
+        }
+        for (self.files.toSlice()) |*file| {
+            file.deinit(self.alloc);
+        }
+        self.files.deinit();
+        self.b64_digest.deinit();
+    }
+};
+
+fn hash_file(alloc: *Allocator, bin_digest: []u8, handle: *const File) !void {
+    var blake3 = Blake3.init();
+    var in_stream = handle.inStream().stream;
+
+    const contents = try handle.inStream().stream.readAllAlloc(alloc, 64 * 1024);
+    defer alloc.free(contents);
+
+    blake3.update(contents);
+
+    blake3.final(bin_digest);
+}
+
+test "see if imported" {
+    const cwd = fs.cwd();
+
+    const temp_manifest_dir = "temp_manifest_dir";
+
+    try cwd.writeFile("test.txt", "Hello, world!\n");
+
+    var digest1 = try ArrayList(u8).initCapacity(testing.allocator, 32);
+    defer digest1.deinit();
+    var digest2 = try ArrayList(u8).initCapacity(testing.allocator, 32);
+    defer digest2.deinit();
+
+    {
+        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
+        defer ch.release();
+
+        try ch.cache_buf("1234");
+        try ch.cache_file("test.txt");
+
+        // There should be nothing in the cache
+        debug.assert((try ch.hit(&digest1)) == false);
+
+        try ch.final(&digest1);
+    }
+    {
+        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
+        defer ch.release();
+
+        try ch.cache_buf("1234");
+        try ch.cache_file("test.txt");
+
+        // Cache hit! We just "built" the same file
+        debug.assert((try ch.hit(&digest2)) == true);
+    }
+
+    debug.assert(mem.eql(u8, digest1.toSlice(), digest2.toSlice()));
+
+    try cwd.deleteTree(temp_manifest_dir);
+}
lib/std/std.zig
@@ -31,6 +31,7 @@ pub const base64 = @import("base64.zig");
 pub const build = @import("build.zig");
 pub const builtin = @import("builtin.zig");
 pub const c = @import("c.zig");
+pub const cache_hash = @import("cache_hash.zig");
 pub const coff = @import("coff.zig");
 pub const crypto = @import("crypto.zig");
 pub const cstr = @import("cstr.zig");