Commit a354000090

Ersikan <julien.philippon@epitech.eu>
2021-03-14 07:03:22
zig fmt: fix non-UTF-8 encoding #2820
Fixes #2820. After the source code is read, its first two bytes are inspected; if they are the UTF-16 little-endian byte order mark (BOM), the source code is converted to UTF-8.
1 parent f76bd56
Changed files (2)
src/main.zig
@@ -2708,9 +2708,22 @@ pub fn cmdFmt(gpa: *Allocator, args: []const []const u8) !void {
             fatal("cannot use --stdin with positional arguments", .{});
         }
 
-        const stdin = io.getStdIn().reader();
-
-        const source_code = try stdin.readAllAlloc(gpa, max_src_size);
+        const stdin = io.getStdIn();
+
+        const source_code = blk: {
+            const source_code = try stdin.readToEndAllocOptions(gpa, max_src_size, null, @alignOf(u16), null);
+            errdefer gpa.free(source_code);
+
+            // If the file starts with a UTF-16 BOM, translate it to UTF-8
+            if (mem.startsWith(u8, source_code, "\xff\xfe")) {
+                const source_code_utf16_le = mem.bytesAsSlice(u16, source_code);
+                const source_code_utf8 = try std.unicode.utf16leToUtf8Alloc(gpa, source_code_utf16_le);
+                gpa.free(source_code);
+                break :blk source_code_utf8;
+            } else {
+                break :blk source_code;
+            }
+        };
         defer gpa.free(source_code);
 
         var tree = std.zig.parse(gpa, source_code) catch |err| {
@@ -2785,6 +2798,7 @@ const FmtError = error{
     EndOfStream,
     Unseekable,
     NotOpenForWriting,
+    UnknownTextFormat,
 } || fs.File.OpenError;
 
 fn fmtPath(fmt: *Fmt, file_path: []const u8, check_mode: bool, dir: fs.Dir, sub_path: []const u8) FmtError!void {
@@ -2850,20 +2864,38 @@ fn fmtPathFile(
     if (stat.kind == .Directory)
         return error.IsDir;
 
-    const source_code = source_file.readToEndAllocOptions(
-        fmt.gpa,
-        max_src_size,
-        std.math.cast(usize, stat.size) catch return error.FileTooBig,
-        @alignOf(u8),
-        null,
-    ) catch |err| switch (err) {
-        error.ConnectionResetByPeer => unreachable,
-        error.ConnectionTimedOut => unreachable,
-        error.NotOpenForReading => unreachable,
-        else => |e| return e,
+    const source_code = blk: {
+        const source_code = source_file.readToEndAllocOptions(
+            fmt.gpa,
+            max_src_size,
+            std.math.cast(usize, stat.size) catch return error.FileTooBig,
+            @alignOf(u16),
+            null,
+        ) catch |err| switch (err) {
+            error.ConnectionResetByPeer => unreachable,
+            error.ConnectionTimedOut => unreachable,
+            error.NotOpenForReading => unreachable,
+            else => |e| return e,
+        };
+        source_file.close();
+        file_closed = true;
+        errdefer fmt.gpa.free(source_code);
+
+        // If the file starts with a UTF-16 BOM, translate it to UTF-8
+        if (mem.eql(u8, source_code[0..2], "\xff\xfe")) {
+            const source_code_utf16_le = mem.bytesAsSlice(u16, source_code);
+            const source_code_utf8 = std.unicode.utf16leToUtf8Alloc(fmt.gpa, source_code_utf16_le) catch |err| return switch (err) {
+                error.DanglingSurrogateHalf => FmtError.UnknownTextFormat,
+                error.ExpectedSecondSurrogateHalf => FmtError.UnknownTextFormat,
+                error.UnexpectedSecondSurrogateHalf => FmtError.UnknownTextFormat,
+                else => |e| e,
+            };
+            fmt.gpa.free(source_code);
+            break :blk source_code_utf8;
+        } else {
+            break :blk source_code;
+        }
     };
-    source_file.close();
-    file_closed = true;
     defer fmt.gpa.free(source_code);
 
     // Add to set after no longer possible to get error.IsDir.
test/cli.zig
@@ -174,4 +174,13 @@ fn testZigFmt(zig_exe: []const u8, dir_path: []const u8) !void {
     const run_result3 = try exec(dir_path, true, &[_][]const u8{ zig_exe, "fmt", dir_path });
     // both files have been formatted, nothing should change now
     testing.expect(run_result3.stdout.len == 0);
+
+    // Check UTF-16 decoding
+    const fmt4_zig_path = try fs.path.join(a, &[_][]const u8{ dir_path, "fmt4.zig" });
+    var unformatted_code_utf16 = "\xff\xfe \x00 \x00 \x00 \x00/\x00/\x00 \x00n\x00o\x00 \x00r\x00e\x00a\x00s\x00o\x00n\x00";
+    try fs.cwd().writeFile(fmt4_zig_path, unformatted_code_utf16);
+
+    const run_result4 = try exec(dir_path, true, &[_][]const u8{ zig_exe, "fmt", dir_path });
+    testing.expect(std.mem.startsWith(u8, run_result4.stdout, fmt4_zig_path));
+    testing.expect(run_result4.stdout.len == fmt4_zig_path.len + 1 and run_result4.stdout[run_result4.stdout.len - 1] == '\n');
 }