Commit 2933a8241a
lib/std/json/test.zig
@@ -27,6 +27,20 @@ fn err(comptime s: []const u8) void {
} else |_| {}
}
+/// Asserts that `s` is rejected because of malformed UTF-8: `std.json.validate`
+/// must return false and `std.json.Parser.parse` must fail with
+/// `error.InvalidUtf8Byte` rather than any other error.
+fn utf8Error(comptime s: []const u8) void {
+    std.testing.expect(!std.json.validate(s));
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    if (p.parse(s)) |_| {
+        // Fail loudly in every build mode: `unreachable` is only checked in
+        // safe modes and is undefined behavior in release-fast builds.
+        @panic("expected error.InvalidUtf8Byte, but parsing succeeded");
+    } else |e| {
+        std.testing.expect(e == error.InvalidUtf8Byte);
+    }
+}
+
fn any(comptime s: []const u8) void {
_ = std.json.validate(s);
@@ -1936,3 +1950,55 @@ test "i_structure_UTF-8_BOM_empty_object" {
\\{}
);
}
+
+test "truncated UTF-8 sequence" {
+    // Each string is closed by `"` before its multi-byte sequence is complete.
+    utf8Error("\"\xc2\"");
+    utf8Error("\"\xdf\"");
+    utf8Error("\"\xed\xa0\"");
+    utf8Error("\"\xf0\x80\"");
+    utf8Error("\"\xf0\x80\x80\"");
+
+    // The F0 80 prefixes above are already rejected at the second byte (F0 must
+    // be followed by 0x90...0xBF), so they never reach the later states. The
+    // inputs below have valid prefixes and are cut off after 1, 2 and 3 bytes.
+    utf8Error("\"\xe1\"");
+    utf8Error("\"\xe1\x80\"");
+    utf8Error("\"\xf1\"");
+    utf8Error("\"\xf1\x80\"");
+    utf8Error("\"\xf1\x80\x80\"");
+}
+
+test "invalid continuation byte" {
+    // 2-byte sequences: second byte outside 0x80...0xBF.
+    utf8Error("\"\xc2\x00\"");
+    utf8Error("\"\xc2\x7f\"");
+    utf8Error("\"\xc2\xc0\"");
+    utf8Error("\"\xc3\xc1\"");
+    utf8Error("\"\xc4\xf5\"");
+    utf8Error("\"\xc5\xff\"");
+    // 3-byte sequences: bad third byte, then bad second byte.
+    utf8Error("\"\xe4\x80\x00\"");
+    utf8Error("\"\xe5\x80\x10\"");
+    utf8Error("\"\xe6\x80\xc0\"");
+    utf8Error("\"\xe7\x80\xf5\"");
+    utf8Error("\"\xe8\x00\x80\"");
+    // 4-byte sequences: bad second, third and fourth bytes.
+    utf8Error("\"\xf2\x00\x80\x80\"");
+    utf8Error("\"\xf0\x80\x00\x80\"");
+    // F0 80 is rejected at the second byte, so the case above never reaches
+    // the third byte; this one has a valid prefix and a 0x00 third byte.
+    utf8Error("\"\xf1\x80\x00\x80\"");
+    utf8Error("\"\xf1\x80\xc0\x80\"");
+    utf8Error("\"\xf2\x80\x80\x00\"");
+    utf8Error("\"\xf3\x80\x80\xc0\"");
+    utf8Error("\"\xf4\x80\x80\xf5\"");
+}
+
+test "disallowed overlong form" {
+    // C0 and C1 can only start overlong 2-byte encodings.
+    utf8Error("\"\xc0\x80\"");
+    utf8Error("\"\xc0\x90\"");
+    utf8Error("\"\xc1\x80\"");
+    utf8Error("\"\xc1\x90\"");
+    // Lower and upper edge of the overlong 3-byte range (E0 80..9F).
+    utf8Error("\"\xe0\x80\x80\"");
+    utf8Error("\"\xe0\x9f\xbf\"");
+    // Lower and upper edge of the overlong 4-byte range (F0 80..8F).
+    utf8Error("\"\xf0\x80\x80\x80\"");
+    utf8Error("\"\xf0\x8f\xbf\xbf\"");
+}
+
+test "out of UTF-16 range" {
+    // First entry: F4 followed by 0x90, just past the 0x8F limit allowed after F4.
+    // Remaining entries: lead bytes F5 through FF, which the parser never accepts.
+    const inputs = [_][]const u8{
+        "\"\xf4\x90\x80\x80\"",
+        "\"\xf5\x80\x80\x80\"",
+        "\"\xf6\x80\x80\x80\"",
+        "\"\xf7\x80\x80\x80\"",
+        "\"\xf8\x80\x80\x80\"",
+        "\"\xf9\x80\x80\x80\"",
+        "\"\xfa\x80\x80\x80\"",
+        "\"\xfb\x80\x80\x80\"",
+        "\"\xfc\x80\x80\x80\"",
+        "\"\xfd\x80\x80\x80\"",
+        "\"\xfe\x80\x80\x80\"",
+        "\"\xff\x80\x80\x80\"",
+    };
+    // `utf8Error` takes a comptime string, so the loop must be unrolled.
+    inline for (inputs) |input| {
+        utf8Error(input);
+    }
+}
lib/std/json.zig
@@ -87,6 +87,8 @@ pub const StreamingParser = struct {
string_last_was_high_surrogate: bool,
// Used inside of StringEscapeHexUnicode* states
string_unicode_codepoint: u21,
+ // The first byte needs to be stored to validate 3- and 4-byte sequences.
+ sequence_first_byte: u8 = undefined,
// When in .Number states, is the number a (still) valid integer?
number_is_integer: bool,
@@ -132,9 +134,12 @@ pub const StreamingParser = struct {
ValueBeginNoClosing,
String,
- StringUtf8Byte3,
- StringUtf8Byte2,
- StringUtf8Byte1,
+ StringUtf8Byte2Of2,
+ StringUtf8Byte2Of3,
+ StringUtf8Byte3Of3,
+ StringUtf8Byte2Of4,
+ StringUtf8Byte3Of4,
+ StringUtf8Byte4Of4,
StringEscapeCharacter,
StringEscapeHexUnicode4,
StringEscapeHexUnicode3,
@@ -581,35 +586,68 @@ pub const StreamingParser = struct {
// non-control ascii
p.string_last_was_high_surrogate = false;
},
- 0xC0...0xDF => {
- p.state = .StringUtf8Byte1;
+ 0xC2...0xDF => {
+ p.state = .StringUtf8Byte2Of2;
},
0xE0...0xEF => {
- p.state = .StringUtf8Byte2;
+ p.state = .StringUtf8Byte2Of3;
+ p.sequence_first_byte = c;
},
- 0xF0...0xFF => {
- p.state = .StringUtf8Byte3;
+ 0xF0...0xF4 => {
+ p.state = .StringUtf8Byte2Of4;
+ p.sequence_first_byte = c;
},
else => {
return error.InvalidUtf8Byte;
},
},
- .StringUtf8Byte3 => switch (c >> 6) {
- 0b10 => p.state = .StringUtf8Byte2,
+            // Second (final) byte of a 2-byte sequence: any continuation byte.
+            .StringUtf8Byte2Of2 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    // A literal multi-byte character ends any pending high-surrogate
+                    // escape, just as a literal ASCII character does.
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },
-
- .StringUtf8Byte2 => switch (c >> 6) {
- 0b10 => p.state = .StringUtf8Byte1,
+            // Second byte of a 3-byte sequence. The accepted range depends on the
+            // lead byte stored in `sequence_first_byte`.
+            .StringUtf8Byte2Of3 => {
+                switch (p.sequence_first_byte) {
+                    // E0 80..9F would be an overlong encoding of a shorter sequence.
+                    0xE0 => switch (c) {
+                        0xA0...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    // ED A0..BF would encode a UTF-16 surrogate (U+D800..U+DFFF),
+                    // which is not a valid code point in UTF-8.
+                    0xED => switch (c) {
+                        0x80...0x9F => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xE1...0xEC, 0xEE...0xEF => switch (c) {
+                        0x80...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    else => return error.InvalidUtf8Byte,
+                }
+                p.state = .StringUtf8Byte3Of3;
+            },
+            // Third (final) byte of a 3-byte sequence: any continuation byte.
+            .StringUtf8Byte3Of3 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    // A literal multi-byte character ends any pending high-surrogate
+                    // escape, just as a literal ASCII character does.
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },
-
- .StringUtf8Byte1 => switch (c >> 6) {
- 0b10 => {
- p.state = .String;
- p.string_last_was_high_surrogate = false;
- },
+            // Second byte of a 4-byte sequence. The accepted range depends on the
+            // lead byte stored in `sequence_first_byte`: F0 needs 0x90...0xBF,
+            // F1...F3 take any continuation byte, and F4 is capped at 0x8F.
+            .StringUtf8Byte2Of4 => {
+                const second_byte_ok = switch (p.sequence_first_byte) {
+                    0xF0 => c >= 0x90 and c <= 0xBF,
+                    0xF1...0xF3 => c >= 0x80 and c <= 0xBF,
+                    0xF4 => c >= 0x80 and c <= 0x8F,
+                    else => false,
+                };
+                if (!second_byte_ok) return error.InvalidUtf8Byte;
+                p.state = .StringUtf8Byte3Of4;
+            },
+            // Third byte of a 4-byte sequence: any continuation byte (0x80...0xBF).
+            .StringUtf8Byte3Of4 => {
+                if (c < 0x80 or c > 0xBF) return error.InvalidUtf8Byte;
+                p.state = .StringUtf8Byte4Of4;
+            },
+            // Fourth (final) byte of a 4-byte sequence: any continuation byte.
+            .StringUtf8Byte4Of4 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    // A literal multi-byte character ends any pending high-surrogate
+                    // escape, just as a literal ASCII character does.
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },