master
1//! The lowest level parsing API in this package;
2//! supports streaming input with a low memory footprint.
3//! The memory requirement is `O(d)` where d is the nesting depth of `[]` or `{}` containers in the input.
4//! Specifically `d/8` bytes are required for this purpose,
5//! with some extra buffer according to the implementation of `std.ArrayList`.
6//!
7//! This scanner can emit partial tokens; see `std.json.Token`.
8//! The input to this class is a sequence of input buffers that you must supply one at a time.
9//! Call `feedInput()` with the first buffer, then call `next()` repeatedly until `error.BufferUnderrun` is returned.
10//! Then call `feedInput()` again and so forth.
11//! Call `endInput()` when the last input buffer has been given to `feedInput()`, either immediately after calling `feedInput()`,
12//! or when `error.BufferUnderrun` requests more data and there is no more.
13//! Be sure to call `next()` after calling `endInput()` until `Token.end_of_document` has been returned.
14//!
15//! Notes on standards compliance: https://datatracker.ietf.org/doc/html/rfc8259
16//! * RFC 8259 requires JSON documents be valid UTF-8,
17//! but makes an allowance for systems that are "part of a closed ecosystem".
18//! I have no idea what that's supposed to mean in the context of a standard specification.
19//! This implementation requires inputs to be valid UTF-8.
20//! * RFC 8259 contradicts itself regarding whether lowercase is allowed in \u hex digits,
21//! but this is probably a bug in the spec, and it's clear that lowercase is meant to be allowed.
22//! (RFC 5234 defines HEXDIG to only allow uppercase.)
23//! * When RFC 8259 refers to a "character", I assume they really mean a "Unicode scalar value".
24//! See http://www.unicode.org/glossary/#unicode_scalar_value .
25//! * RFC 8259 doesn't explicitly disallow unpaired surrogate halves in \u escape sequences,
26//! but vaguely implies that \u escapes are for encoding Unicode "characters" (i.e. Unicode scalar values?),
27//! which would mean that unpaired surrogate halves are forbidden.
28//! By contrast ECMA-404 (a competing(/compatible?) JSON standard, which JavaScript's JSON.parse() conforms to)
29//! explicitly allows unpaired surrogate halves.
30//! This implementation forbids unpaired surrogate halves in \u sequences.
31//! If a high surrogate half appears in a \u sequence,
32//! then a low surrogate half must immediately follow in \u notation.
33//! * RFC 8259 allows implementations to "accept non-JSON forms or extensions".
34//! This implementation does not accept any of that.
35//! * RFC 8259 allows implementations to put limits on "the size of texts",
36//! "the maximum depth of nesting", "the range and precision of numbers",
37//! and "the length and character contents of strings".
38//! This low-level implementation does not limit these,
39//! except where noted above, and except that nesting depth requires memory allocation.
40//! Note that this low-level API does not interpret numbers numerically,
41//! but simply emits their source form for some higher level code to make sense of.
42//! * This low-level implementation allows duplicate object keys,
43//! and key/value pairs are emitted in the order they appear in the input.
44
45const Scanner = @This();
46const std = @import("std");
47
48const Allocator = std.mem.Allocator;
49const assert = std.debug.assert;
50const BitStack = std.BitStack;
51
/// Current state of the `next()` state machine; a fresh scanner expects a value.
52state: State = .value,
/// Set when the string being scanned is an object key, so that `next()`
/// requires a `:` after it instead of `,`, `}` or `]`.
53string_is_object_key: bool = false,
/// Tracks the open `{}`/`[]` containers: an object or array marker is pushed
/// when one opens and popped when it closes.
54stack: BitStack,
/// Cursor position at which the number or string slice currently being scanned starts.
55value_start: usize = undefined,
/// Scratch space for decoding `\u` escapes: `[0]` accumulates the first code unit,
/// `[1]` the low surrogate half when the first unit is a high surrogate.
56utf16_code_units: [2]u16 = undefined,
57
/// The input buffer currently being scanned; replaced by `feedInput()`.
58input: []const u8 = "",
/// Index of the next unread byte in `input`.
59cursor: usize = 0,
/// True once no more input will be supplied (`endInput()` or `initCompleteInput()`).
60is_end_of_input: bool = false,
/// Optional diagnostics sink registered through `enableDiagnostics()`.
61diagnostics: ?*Diagnostics = null,
62
/// Creates a scanner with no input attached yet; supply buffers with `feedInput()`.
/// `allocator` is used solely for the stack that records `[]` and `{}` nesting.
pub fn initStreaming(allocator: Allocator) @This() {
    const nesting = BitStack.init(allocator);
    return .{ .stack = nesting };
}
/// Creates a scanner over a single slice that holds the entire input.
/// The result is effectively what you would get from:
/// ```
/// initStreaming(allocator);
/// feedInput(complete_input);
/// endInput();
/// ```
pub fn initCompleteInput(allocator: Allocator, complete_input: []const u8) @This() {
    var scanner: @This() = .{ .stack = BitStack.init(allocator) };
    scanner.input = complete_input;
    // The whole document is already present, so no further input will arrive.
    scanner.is_end_of_input = true;
    return scanner;
}
/// Frees the nesting stack and then sets the whole scanner to `undefined`;
/// the scanner must not be used afterwards.
pub fn deinit(self: *@This()) void {
    // Runs after the stack has been released, on the way out of this function.
    defer self.* = undefined;
    self.stack.deinit();
}
87
/// Registers `diagnostics` with this scanner.
/// The diagnostics object is handed a pointer to this scanner's `cursor` field,
/// so the scanner should stay at a stable address while diagnostics are in use.
pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
    self.diagnostics = diagnostics;
    diagnostics.cursor_pointer = &self.cursor;
}
92
/// Supplies the next input buffer; call this each time `next()` returns `error.BufferUnderrun`.
/// The previous buffer must have been consumed completely (this is asserted).
/// Once there is nothing left to supply, call `endInput()` instead.
pub fn feedInput(self: *@This(), input: []const u8) void {
    const consumed = self.cursor;
    assert(consumed == self.input.len); // Not done with the last input slice.

    if (self.diagnostics) |diag| {
        diag.total_bytes_before_current_input += self.input.len;
        // Wrapping subtraction: the result usually lands "below zero", expressing
        // how far before the start of the new buffer the current line began.
        diag.line_start_cursor -%= consumed;
    }

    self.cursor = 0;
    self.value_start = 0;
    self.input = input;
}
/// Declares that `feedInput()` will not be called again.
/// It may be called right after the final `feedInput()`, or at any later point,
/// for example when `next()` reports `error.BufferUnderrun` and no data remains.
/// Keep calling `next*()` after `endInput()` until you receive `.end_of_document`.
pub fn endInput(self: *@This()) void {
    self.is_end_of_input = true;
}
114
/// Error set of `next()` and `skipUntilStackHeight()`; includes `error.BufferUnderrun`
/// because those functions work in streaming mode.
115pub const NextError = Error || Allocator.Error || error{BufferUnderrun};
/// Error set of `nextAlloc()` and `nextAllocMax()`; has `error.ValueTooLong`
/// but no `error.BufferUnderrun`, since those functions require the input to have ended.
116pub const AllocError = Error || Allocator.Error || error{ValueTooLong};
/// NOTE(review): presumably the error set of `peekNextTokenType()`, which is
/// defined outside this part of the file — confirm there.
117pub const PeekError = Error || error{BufferUnderrun};
/// Error set of `skipValue()`.
118pub const SkipError = Error || Allocator.Error;
/// Error set of `allocNextIntoArrayList()` and `allocNextIntoArrayListMax()`:
/// `AllocError` plus `error.BufferUnderrun`.
119pub const AllocIntoArrayListError = AllocError || error{BufferUnderrun};
120
/// Shorthand for `nextAllocMax(allocator, when, default_max_value_len);`
/// Only usable once `endInput()` (or `initCompleteInput()`) has been called.
/// `std.json.Token` documents how the `nextAlloc*()` functions behave.
pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
    const token = try self.nextAllocMax(allocator, when, default_max_value_len);
    return token;
}
127
/// Only usable once `endInput()` (or `initCompleteInput()`) has been called.
/// `std.json.Token` documents how the `nextAlloc*()` functions behave.
///
/// Numbers and strings come back either as `.number`/`.string` slices taken from the
/// input, or as `.allocated_number`/`.allocated_string` slices allocated with `allocator`.
/// `max_value_len` is forwarded to `allocNextIntoArrayListMax()` as the length limit.
/// Every other token type is returned straight from `next()`.
pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const kind = self.peekNextTokenType() catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |e| return e,
    };

    if (kind != .number and kind != .string) {
        // Simple tokens never alloc.
        return self.next() catch |err| switch (err) {
            error.BufferUnderrun => unreachable,
            else => |e| return e,
        };
    }

    var buffer = std.array_list.Managed(u8).init(allocator);
    errdefer buffer.deinit();

    const borrowed = self.allocNextIntoArrayListMax(&buffer, when, max_value_len) catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |e| return e,
    };
    const is_number = kind == .number;

    if (borrowed) |slice| {
        // The value was handed back as a slice; `buffer` was not needed.
        if (is_number) return Token{ .number = slice };
        return Token{ .string = slice };
    }

    // The value was accumulated in `buffer`; hand its memory over to the caller.
    if (is_number) return Token{ .allocated_number = try buffer.toOwnedSlice() };
    return Token{ .allocated_string = try buffer.toOwnedSlice() };
}
173
/// Shorthand for `allocNextIntoArrayListMax(value_list, when, default_max_value_len);`
pub fn allocNextIntoArrayList(self: *@This(), value_list: *std.array_list.Managed(u8), when: AllocWhen) AllocIntoArrayListError!?[]const u8 {
    const result = try self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
    return result;
}
/// Reads one complete number or string. The next token type must be `.number` or
/// `.string`; check with `peekNextTokenType()` first.
///
/// With `.alloc_if_needed`, when the value needs no allocation, the content slice from
/// the input buffer is returned and `value_list` is left untouched.
/// Otherwise (allocation is needed, or `.alloc_always` was requested) the partial tokens
/// are concatenated into `value_list`, and `null` is returned once the final `.number`
/// or `.string` token has been appended.
///
/// On `error.BufferUnderrun`, whatever has been accumulated stays in `value_list`.
/// This method never resets `value_list`, so the call can be resumed after feeding
/// more input by passing the same list again.
///
/// The return value does not say whether the content belongs to a `.number` or a
/// `.string`; the caller is expected to know which kind of token it is processing.
pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *std.array_list.Managed(u8), when: AllocWhen, max_value_len: usize) AllocIntoArrayListError!?[]const u8 {
    while (true) {
        switch (try self.next()) {
            // Fragments of a longer value: keep accumulating.
            .partial_number, .partial_string => |fragment| try appendSlice(value_list, fragment, max_value_len),

            // Decoded escape sequences arrive as fixed-size arrays of 1 to 4 bytes;
            // the payload types differ, hence the inline prong.
            inline .partial_string_escaped_1,
            .partial_string_escaped_2,
            .partial_string_escaped_3,
            .partial_string_escaped_4,
            => |bytes| try appendSlice(value_list, bytes[0..], max_value_len),

            // Final piece of the value.
            .number, .string => |tail| {
                const can_borrow = when == .alloc_if_needed and value_list.items.len == 0;
                // No alloc necessary.
                if (can_borrow) return tail;

                try appendSlice(value_list, tail, max_value_len);
                // The token is complete.
                return null;
            },

            // Only .number and .string token types are allowed here. Check peekNextTokenType() before calling this.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => unreachable,

            .allocated_number, .allocated_string => unreachable,
        }
    }
}
243
/// Skips exactly one value. Only usable once `endInput()` (or `initCompleteInput()`)
/// has been called.
/// * Next token `.object_begin` or `.array_begin`: `next()` is called repeatedly until the
///   matching `.object_end` or `.array_end` has been consumed.
/// * Next token `.number` or `.string`: `next()` is called repeatedly until the final
///   (non `.partial_*`) `.number` or `.string` token has been consumed.
/// * Next token `.true`, `.false`, or `.null`: `next()` is called once.
/// The next token type must not be `.object_end`, `.array_end`, or `.end_of_document`;
/// see `peekNextTokenType()`.
pub fn skipValue(self: *@This()) SkipError!void {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const kind = self.peekNextTokenType() catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |e| return e,
    };

    switch (kind) {
        .object_begin, .array_begin => {
            // Consuming the container brings the stack height back to its current value.
            const base_height = self.stackHeight();
            self.skipUntilStackHeight(base_height) catch |err| switch (err) {
                error.BufferUnderrun => unreachable,
                else => |e| return e,
            };
        },
        .number, .string => {
            while (true) {
                const token = self.next() catch |err| switch (err) {
                    error.BufferUnderrun => unreachable,
                    else => |e| return e,
                };
                switch (token) {
                    .partial_number,
                    .partial_string,
                    .partial_string_escaped_1,
                    .partial_string_escaped_2,
                    .partial_string_escaped_3,
                    .partial_string_escaped_4,
                    => {},

                    .number, .string => return,

                    else => unreachable,
                }
            }
        },
        .true, .false, .null => {
            _ = self.next() catch |err| switch (err) {
                error.BufferUnderrun => unreachable,
                else => |e| return e,
            };
        },

        // Attempt to skip a non-value token.
        .object_end, .array_end, .end_of_document => unreachable,
    }
}
294
/// Consumes tokens until an `.object_end` or `.array_end` token leaves `stackHeight()`
/// equal to `terminal_stack_height`.
/// Unlike `skipValue()`, this function is available in streaming mode.
/// Reaching `.end_of_document` before that happens is treated as unreachable.
pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
    while (true) {
        const token = try self.next();
        switch (token) {
            .object_end, .array_end => if (self.stackHeight() == terminal_stack_height) return,
            .end_of_document => unreachable,
            else => {},
        }
    }
}
308
/// Returns how many `{}` or `[]` containers are open at the current position.
pub fn stackHeight(self: *const @This()) usize {
    const open_containers = self.stack.bit_len;
    return open_containers;
}
313
/// Reserves memory up front for `height` nesting levels.
/// While `stackHeight()` stays at or below that number, nesting causes no allocations.
pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
    return self.stack.ensureTotalCapacity(height);
}
319
320/// See `std.json.Token` for documentation of this function.
321pub fn next(self: *@This()) NextError!Token {
322 state_loop: while (true) {
323 switch (self.state) {
324 .value => {
325 switch (try self.skipWhitespaceExpectByte()) {
326 // Object, Array
327 '{' => {
328 try self.stack.push(OBJECT_MODE);
329 self.cursor += 1;
330 self.state = .object_start;
331 return .object_begin;
332 },
333 '[' => {
334 try self.stack.push(ARRAY_MODE);
335 self.cursor += 1;
336 self.state = .array_start;
337 return .array_begin;
338 },
339
340 // String
341 '"' => {
342 self.cursor += 1;
343 self.value_start = self.cursor;
344 self.state = .string;
345 continue :state_loop;
346 },
347
348 // Number
349 '1'...'9' => {
350 self.value_start = self.cursor;
351 self.cursor += 1;
352 self.state = .number_int;
353 continue :state_loop;
354 },
355 '0' => {
356 self.value_start = self.cursor;
357 self.cursor += 1;
358 self.state = .number_leading_zero;
359 continue :state_loop;
360 },
361 '-' => {
362 self.value_start = self.cursor;
363 self.cursor += 1;
364 self.state = .number_minus;
365 continue :state_loop;
366 },
367
368 // literal values
369 't' => {
370 self.cursor += 1;
371 self.state = .literal_t;
372 continue :state_loop;
373 },
374 'f' => {
375 self.cursor += 1;
376 self.state = .literal_f;
377 continue :state_loop;
378 },
379 'n' => {
380 self.cursor += 1;
381 self.state = .literal_n;
382 continue :state_loop;
383 },
384
385 else => return error.SyntaxError,
386 }
387 },
388
389 .post_value => {
390 if (try self.skipWhitespaceCheckEnd()) return .end_of_document;
391
392 const c = self.input[self.cursor];
393 if (self.string_is_object_key) {
394 self.string_is_object_key = false;
395 switch (c) {
396 ':' => {
397 self.cursor += 1;
398 self.state = .value;
399 continue :state_loop;
400 },
401 else => return error.SyntaxError,
402 }
403 }
404
405 switch (c) {
406 '}' => {
407 if (self.stack.pop() != OBJECT_MODE) return error.SyntaxError;
408 self.cursor += 1;
409 // stay in .post_value state.
410 return .object_end;
411 },
412 ']' => {
413 if (self.stack.pop() != ARRAY_MODE) return error.SyntaxError;
414 self.cursor += 1;
415 // stay in .post_value state.
416 return .array_end;
417 },
418 ',' => {
419 switch (self.stack.peek()) {
420 OBJECT_MODE => {
421 self.state = .object_post_comma;
422 },
423 ARRAY_MODE => {
424 self.state = .value;
425 },
426 }
427 self.cursor += 1;
428 continue :state_loop;
429 },
430 else => return error.SyntaxError,
431 }
432 },
433
434 .object_start => {
435 switch (try self.skipWhitespaceExpectByte()) {
436 '"' => {
437 self.cursor += 1;
438 self.value_start = self.cursor;
439 self.state = .string;
440 self.string_is_object_key = true;
441 continue :state_loop;
442 },
443 '}' => {
444 self.cursor += 1;
445 _ = self.stack.pop();
446 self.state = .post_value;
447 return .object_end;
448 },
449 else => return error.SyntaxError,
450 }
451 },
452 .object_post_comma => {
453 switch (try self.skipWhitespaceExpectByte()) {
454 '"' => {
455 self.cursor += 1;
456 self.value_start = self.cursor;
457 self.state = .string;
458 self.string_is_object_key = true;
459 continue :state_loop;
460 },
461 else => return error.SyntaxError,
462 }
463 },
464
465 .array_start => {
466 switch (try self.skipWhitespaceExpectByte()) {
467 ']' => {
468 self.cursor += 1;
469 _ = self.stack.pop();
470 self.state = .post_value;
471 return .array_end;
472 },
473 else => {
474 self.state = .value;
475 continue :state_loop;
476 },
477 }
478 },
479
480 .number_minus => {
481 if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
482 switch (self.input[self.cursor]) {
483 '0' => {
484 self.cursor += 1;
485 self.state = .number_leading_zero;
486 continue :state_loop;
487 },
488 '1'...'9' => {
489 self.cursor += 1;
490 self.state = .number_int;
491 continue :state_loop;
492 },
493 else => return error.SyntaxError,
494 }
495 },
496 .number_leading_zero => {
497 if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
498 switch (self.input[self.cursor]) {
499 '.' => {
500 self.cursor += 1;
501 self.state = .number_post_dot;
502 continue :state_loop;
503 },
504 'e', 'E' => {
505 self.cursor += 1;
506 self.state = .number_post_e;
507 continue :state_loop;
508 },
509 else => {
510 self.state = .post_value;
511 return Token{ .number = self.takeValueSlice() };
512 },
513 }
514 },
515 .number_int => {
516 while (self.cursor < self.input.len) : (self.cursor += 1) {
517 switch (self.input[self.cursor]) {
518 '0'...'9' => continue,
519 '.' => {
520 self.cursor += 1;
521 self.state = .number_post_dot;
522 continue :state_loop;
523 },
524 'e', 'E' => {
525 self.cursor += 1;
526 self.state = .number_post_e;
527 continue :state_loop;
528 },
529 else => {
530 self.state = .post_value;
531 return Token{ .number = self.takeValueSlice() };
532 },
533 }
534 }
535 return self.endOfBufferInNumber(true);
536 },
537 .number_post_dot => {
538 if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
539 switch (self.input[self.cursor]) {
540 '0'...'9' => {
541 self.cursor += 1;
542 self.state = .number_frac;
543 continue :state_loop;
544 },
545 else => return error.SyntaxError,
546 }
547 },
548 .number_frac => {
549 while (self.cursor < self.input.len) : (self.cursor += 1) {
550 switch (self.input[self.cursor]) {
551 '0'...'9' => continue,
552 'e', 'E' => {
553 self.cursor += 1;
554 self.state = .number_post_e;
555 continue :state_loop;
556 },
557 else => {
558 self.state = .post_value;
559 return Token{ .number = self.takeValueSlice() };
560 },
561 }
562 }
563 return self.endOfBufferInNumber(true);
564 },
565 .number_post_e => {
566 if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
567 switch (self.input[self.cursor]) {
568 '0'...'9' => {
569 self.cursor += 1;
570 self.state = .number_exp;
571 continue :state_loop;
572 },
573 '+', '-' => {
574 self.cursor += 1;
575 self.state = .number_post_e_sign;
576 continue :state_loop;
577 },
578 else => return error.SyntaxError,
579 }
580 },
581 .number_post_e_sign => {
582 if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
583 switch (self.input[self.cursor]) {
584 '0'...'9' => {
585 self.cursor += 1;
586 self.state = .number_exp;
587 continue :state_loop;
588 },
589 else => return error.SyntaxError,
590 }
591 },
592 .number_exp => {
593 while (self.cursor < self.input.len) : (self.cursor += 1) {
594 switch (self.input[self.cursor]) {
595 '0'...'9' => continue,
596 else => {
597 self.state = .post_value;
598 return Token{ .number = self.takeValueSlice() };
599 },
600 }
601 }
602 return self.endOfBufferInNumber(true);
603 },
604
605 .string => {
606 while (self.cursor < self.input.len) : (self.cursor += 1) {
607 switch (self.input[self.cursor]) {
608 0...0x1f => return error.SyntaxError, // Bare ASCII control code in string.
609
610 // ASCII plain text.
611 0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,
612
613 // Special characters.
614 '"' => {
615 const result = Token{ .string = self.takeValueSlice() };
616 self.cursor += 1;
617 self.state = .post_value;
618 return result;
619 },
620 '\\' => {
621 const slice = self.takeValueSlice();
622 self.cursor += 1;
623 self.state = .string_backslash;
624 if (slice.len > 0) return Token{ .partial_string = slice };
625 continue :state_loop;
626 },
627
628 // UTF-8 validation.
629 // See http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
630 0xC2...0xDF => {
631 self.cursor += 1;
632 self.state = .string_utf8_last_byte;
633 continue :state_loop;
634 },
635 0xE0 => {
636 self.cursor += 1;
637 self.state = .string_utf8_second_to_last_byte_guard_against_overlong;
638 continue :state_loop;
639 },
640 0xE1...0xEC, 0xEE...0xEF => {
641 self.cursor += 1;
642 self.state = .string_utf8_second_to_last_byte;
643 continue :state_loop;
644 },
645 0xED => {
646 self.cursor += 1;
647 self.state = .string_utf8_second_to_last_byte_guard_against_surrogate_half;
648 continue :state_loop;
649 },
650 0xF0 => {
651 self.cursor += 1;
652 self.state = .string_utf8_third_to_last_byte_guard_against_overlong;
653 continue :state_loop;
654 },
655 0xF1...0xF3 => {
656 self.cursor += 1;
657 self.state = .string_utf8_third_to_last_byte;
658 continue :state_loop;
659 },
660 0xF4 => {
661 self.cursor += 1;
662 self.state = .string_utf8_third_to_last_byte_guard_against_too_large;
663 continue :state_loop;
664 },
665 0x80...0xC1, 0xF5...0xFF => return error.SyntaxError, // Invalid UTF-8.
666 }
667 }
668 if (self.is_end_of_input) return error.UnexpectedEndOfInput;
669 const slice = self.takeValueSlice();
670 if (slice.len > 0) return Token{ .partial_string = slice };
671 return error.BufferUnderrun;
672 },
673 .string_backslash => {
674 if (self.cursor >= self.input.len) return self.endOfBufferInString();
675 switch (self.input[self.cursor]) {
676 '"', '\\', '/' => {
677 // Since these characters now represent themselves literally,
678 // we can simply begin the next plaintext slice here.
679 self.value_start = self.cursor;
680 self.cursor += 1;
681 self.state = .string;
682 continue :state_loop;
683 },
684 'b' => {
685 self.cursor += 1;
686 self.value_start = self.cursor;
687 self.state = .string;
688 return Token{ .partial_string_escaped_1 = [_]u8{0x08} };
689 },
690 'f' => {
691 self.cursor += 1;
692 self.value_start = self.cursor;
693 self.state = .string;
694 return Token{ .partial_string_escaped_1 = [_]u8{0x0c} };
695 },
696 'n' => {
697 self.cursor += 1;
698 self.value_start = self.cursor;
699 self.state = .string;
700 return Token{ .partial_string_escaped_1 = [_]u8{'\n'} };
701 },
702 'r' => {
703 self.cursor += 1;
704 self.value_start = self.cursor;
705 self.state = .string;
706 return Token{ .partial_string_escaped_1 = [_]u8{'\r'} };
707 },
708 't' => {
709 self.cursor += 1;
710 self.value_start = self.cursor;
711 self.state = .string;
712 return Token{ .partial_string_escaped_1 = [_]u8{'\t'} };
713 },
714 'u' => {
715 self.cursor += 1;
716 self.state = .string_backslash_u;
717 continue :state_loop;
718 },
719 else => return error.SyntaxError,
720 }
721 },
722 .string_backslash_u => {
723 if (self.cursor >= self.input.len) return self.endOfBufferInString();
724 const c = self.input[self.cursor];
725 switch (c) {
726 '0'...'9' => {
727 self.utf16_code_units[0] = @as(u16, c - '0') << 12;
728 },
729 'A'...'F' => {
730 self.utf16_code_units[0] = @as(u16, c - 'A' + 10) << 12;
731 },
732 'a'...'f' => {
733 self.utf16_code_units[0] = @as(u16, c - 'a' + 10) << 12;
734 },
735 else => return error.SyntaxError,
736 }
737 self.cursor += 1;
738 self.state = .string_backslash_u_1;
739 continue :state_loop;
740 },
741 .string_backslash_u_1 => {
742 if (self.cursor >= self.input.len) return self.endOfBufferInString();
743 const c = self.input[self.cursor];
744 switch (c) {
745 '0'...'9' => {
746 self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
747 },
748 'A'...'F' => {
749 self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 8;
750 },
751 'a'...'f' => {
752 self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 8;
753 },
754 else => return error.SyntaxError,
755 }
756 self.cursor += 1;
757 self.state = .string_backslash_u_2;
758 continue :state_loop;
759 },
760 .string_backslash_u_2 => {
761 if (self.cursor >= self.input.len) return self.endOfBufferInString();
762 const c = self.input[self.cursor];
763 switch (c) {
764 '0'...'9' => {
765 self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
766 },
767 'A'...'F' => {
768 self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 4;
769 },
770 'a'...'f' => {
771 self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 4;
772 },
773 else => return error.SyntaxError,
774 }
775 self.cursor += 1;
776 self.state = .string_backslash_u_3;
777 continue :state_loop;
778 },
779 .string_backslash_u_3 => {
780 if (self.cursor >= self.input.len) return self.endOfBufferInString();
781 const c = self.input[self.cursor];
782 switch (c) {
783 '0'...'9' => {
784 self.utf16_code_units[0] |= c - '0';
785 },
786 'A'...'F' => {
787 self.utf16_code_units[0] |= c - 'A' + 10;
788 },
789 'a'...'f' => {
790 self.utf16_code_units[0] |= c - 'a' + 10;
791 },
792 else => return error.SyntaxError,
793 }
794 self.cursor += 1;
795 if (std.unicode.utf16IsHighSurrogate(self.utf16_code_units[0])) {
796 self.state = .string_surrogate_half;
797 continue :state_loop;
798 } else if (std.unicode.utf16IsLowSurrogate(self.utf16_code_units[0])) {
799 return error.SyntaxError; // Unexpected low surrogate half.
800 } else {
801 self.value_start = self.cursor;
802 self.state = .string;
803 return partialStringCodepoint(self.utf16_code_units[0]);
804 }
805 },
806 .string_surrogate_half => {
807 if (self.cursor >= self.input.len) return self.endOfBufferInString();
808 switch (self.input[self.cursor]) {
809 '\\' => {
810 self.cursor += 1;
811 self.state = .string_surrogate_half_backslash;
812 continue :state_loop;
813 },
814 else => return error.SyntaxError, // Expected low surrogate half.
815 }
816 },
817 .string_surrogate_half_backslash => {
818 if (self.cursor >= self.input.len) return self.endOfBufferInString();
819 switch (self.input[self.cursor]) {
820 'u' => {
821 self.cursor += 1;
822 self.state = .string_surrogate_half_backslash_u;
823 continue :state_loop;
824 },
825 else => return error.SyntaxError, // Expected low surrogate half.
826 }
827 },
828 .string_surrogate_half_backslash_u => {
829 if (self.cursor >= self.input.len) return self.endOfBufferInString();
830 switch (self.input[self.cursor]) {
831 'D', 'd' => {
832 self.cursor += 1;
833 self.utf16_code_units[1] = 0xD << 12;
834 self.state = .string_surrogate_half_backslash_u_1;
835 continue :state_loop;
836 },
837 else => return error.SyntaxError, // Expected low surrogate half.
838 }
839 },
840 .string_surrogate_half_backslash_u_1 => {
841 if (self.cursor >= self.input.len) return self.endOfBufferInString();
842 const c = self.input[self.cursor];
843 switch (c) {
844 'C'...'F' => {
845 self.cursor += 1;
846 self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 8;
847 self.state = .string_surrogate_half_backslash_u_2;
848 continue :state_loop;
849 },
850 'c'...'f' => {
851 self.cursor += 1;
852 self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 8;
853 self.state = .string_surrogate_half_backslash_u_2;
854 continue :state_loop;
855 },
856 else => return error.SyntaxError, // Expected low surrogate half.
857 }
858 },
859 .string_surrogate_half_backslash_u_2 => {
860 if (self.cursor >= self.input.len) return self.endOfBufferInString();
861 const c = self.input[self.cursor];
862 switch (c) {
863 '0'...'9' => {
864 self.cursor += 1;
865 self.utf16_code_units[1] |= @as(u16, c - '0') << 4;
866 self.state = .string_surrogate_half_backslash_u_3;
867 continue :state_loop;
868 },
869 'A'...'F' => {
870 self.cursor += 1;
871 self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 4;
872 self.state = .string_surrogate_half_backslash_u_3;
873 continue :state_loop;
874 },
875 'a'...'f' => {
876 self.cursor += 1;
877 self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 4;
878 self.state = .string_surrogate_half_backslash_u_3;
879 continue :state_loop;
880 },
881 else => return error.SyntaxError,
882 }
883 },
884 .string_surrogate_half_backslash_u_3 => {
885 if (self.cursor >= self.input.len) return self.endOfBufferInString();
886 const c = self.input[self.cursor];
887 switch (c) {
888 '0'...'9' => {
889 self.utf16_code_units[1] |= c - '0';
890 },
891 'A'...'F' => {
892 self.utf16_code_units[1] |= c - 'A' + 10;
893 },
894 'a'...'f' => {
895 self.utf16_code_units[1] |= c - 'a' + 10;
896 },
897 else => return error.SyntaxError,
898 }
899 self.cursor += 1;
900 self.value_start = self.cursor;
901 self.state = .string;
902 const code_point = std.unicode.utf16DecodeSurrogatePair(&self.utf16_code_units) catch unreachable;
903 return partialStringCodepoint(code_point);
904 },
905
906 .string_utf8_last_byte => {
907 if (self.cursor >= self.input.len) return self.endOfBufferInString();
908 switch (self.input[self.cursor]) {
909 0x80...0xBF => {
910 self.cursor += 1;
911 self.state = .string;
912 continue :state_loop;
913 },
914 else => return error.SyntaxError, // Invalid UTF-8.
915 }
916 },
917 .string_utf8_second_to_last_byte => {
918 if (self.cursor >= self.input.len) return self.endOfBufferInString();
919 switch (self.input[self.cursor]) {
920 0x80...0xBF => {
921 self.cursor += 1;
922 self.state = .string_utf8_last_byte;
923 continue :state_loop;
924 },
925 else => return error.SyntaxError, // Invalid UTF-8.
926 }
927 },
928 .string_utf8_second_to_last_byte_guard_against_overlong => {
929 if (self.cursor >= self.input.len) return self.endOfBufferInString();
930 switch (self.input[self.cursor]) {
931 0xA0...0xBF => {
932 self.cursor += 1;
933 self.state = .string_utf8_last_byte;
934 continue :state_loop;
935 },
936 else => return error.SyntaxError, // Invalid UTF-8.
937 }
938 },
939 .string_utf8_second_to_last_byte_guard_against_surrogate_half => {
940 if (self.cursor >= self.input.len) return self.endOfBufferInString();
941 switch (self.input[self.cursor]) {
942 0x80...0x9F => {
943 self.cursor += 1;
944 self.state = .string_utf8_last_byte;
945 continue :state_loop;
946 },
947 else => return error.SyntaxError, // Invalid UTF-8.
948 }
949 },
950 .string_utf8_third_to_last_byte => {
951 if (self.cursor >= self.input.len) return self.endOfBufferInString();
952 switch (self.input[self.cursor]) {
953 0x80...0xBF => {
954 self.cursor += 1;
955 self.state = .string_utf8_second_to_last_byte;
956 continue :state_loop;
957 },
958 else => return error.SyntaxError, // Invalid UTF-8.
959 }
960 },
961 .string_utf8_third_to_last_byte_guard_against_overlong => {
962 if (self.cursor >= self.input.len) return self.endOfBufferInString();
963 switch (self.input[self.cursor]) {
964 0x90...0xBF => {
965 self.cursor += 1;
966 self.state = .string_utf8_second_to_last_byte;
967 continue :state_loop;
968 },
969 else => return error.SyntaxError, // Invalid UTF-8.
970 }
971 },
972 .string_utf8_third_to_last_byte_guard_against_too_large => {
973 if (self.cursor >= self.input.len) return self.endOfBufferInString();
974 switch (self.input[self.cursor]) {
975 0x80...0x8F => {
976 self.cursor += 1;
977 self.state = .string_utf8_second_to_last_byte;
978 continue :state_loop;
979 },
980 else => return error.SyntaxError, // Invalid UTF-8.
981 }
982 },
983
984 .literal_t => {
985 switch (try self.expectByte()) {
986 'r' => {
987 self.cursor += 1;
988 self.state = .literal_tr;
989 continue :state_loop;
990 },
991 else => return error.SyntaxError,
992 }
993 },
994 .literal_tr => {
995 switch (try self.expectByte()) {
996 'u' => {
997 self.cursor += 1;
998 self.state = .literal_tru;
999 continue :state_loop;
1000 },
1001 else => return error.SyntaxError,
1002 }
1003 },
1004 .literal_tru => {
1005 switch (try self.expectByte()) {
1006 'e' => {
1007 self.cursor += 1;
1008 self.state = .post_value;
1009 return .true;
1010 },
1011 else => return error.SyntaxError,
1012 }
1013 },
1014 .literal_f => {
1015 switch (try self.expectByte()) {
1016 'a' => {
1017 self.cursor += 1;
1018 self.state = .literal_fa;
1019 continue :state_loop;
1020 },
1021 else => return error.SyntaxError,
1022 }
1023 },
1024 .literal_fa => {
1025 switch (try self.expectByte()) {
1026 'l' => {
1027 self.cursor += 1;
1028 self.state = .literal_fal;
1029 continue :state_loop;
1030 },
1031 else => return error.SyntaxError,
1032 }
1033 },
1034 .literal_fal => {
1035 switch (try self.expectByte()) {
1036 's' => {
1037 self.cursor += 1;
1038 self.state = .literal_fals;
1039 continue :state_loop;
1040 },
1041 else => return error.SyntaxError,
1042 }
1043 },
1044 .literal_fals => {
1045 switch (try self.expectByte()) {
1046 'e' => {
1047 self.cursor += 1;
1048 self.state = .post_value;
1049 return .false;
1050 },
1051 else => return error.SyntaxError,
1052 }
1053 },
1054 .literal_n => {
1055 switch (try self.expectByte()) {
1056 'u' => {
1057 self.cursor += 1;
1058 self.state = .literal_nu;
1059 continue :state_loop;
1060 },
1061 else => return error.SyntaxError,
1062 }
1063 },
1064 .literal_nu => {
1065 switch (try self.expectByte()) {
1066 'l' => {
1067 self.cursor += 1;
1068 self.state = .literal_nul;
1069 continue :state_loop;
1070 },
1071 else => return error.SyntaxError,
1072 }
1073 },
1074 .literal_nul => {
1075 switch (try self.expectByte()) {
1076 'l' => {
1077 self.cursor += 1;
1078 self.state = .post_value;
1079 return .null;
1080 },
1081 else => return error.SyntaxError,
1082 }
1083 },
1084 }
1085 unreachable;
1086 }
1087}
1088
/// Reports which kind of token the next `next*()` call will produce, without consuming it.
/// The scan moves forward to the first byte of the upcoming token (or to the end of the input).
/// Only inter-token whitespace, commas, and colons are consumed along the way,
/// which makes this function idempotent.
/// When the scanner is already in the middle of a number, string, or literal,
/// the type of that in-progress value is reported.
pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
    while (true) {
        switch (self.state) {
            .value => return switch (try self.skipWhitespaceExpectByte()) {
                '{' => .object_begin,
                '[' => .array_begin,
                '"' => .string,
                '-', '0'...'9' => .number,
                't' => .true,
                'f' => .false,
                'n' => .null,
                else => error.SyntaxError,
            },

            .post_value => {
                if (try self.skipWhitespaceCheckEnd()) return .end_of_document;

                const byte = self.input[self.cursor];
                if (self.string_is_object_key) {
                    // The value just completed was an object key; only ':' may follow it.
                    self.string_is_object_key = false;
                    if (byte != ':') return error.SyntaxError;
                    self.cursor += 1;
                    self.state = .value;
                    continue;
                }

                switch (byte) {
                    '}' => return .object_end,
                    ']' => return .array_end,
                    ',' => {
                        // What may follow the comma depends on the innermost container.
                        self.state = switch (self.stack.peek()) {
                            OBJECT_MODE => .object_post_comma,
                            ARRAY_MODE => .value,
                        };
                        self.cursor += 1;
                        continue;
                    },
                    else => return error.SyntaxError,
                }
            },

            .object_start => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                '}' => .object_end,
                else => error.SyntaxError,
            },
            .object_post_comma => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                else => error.SyntaxError,
            },

            .array_start => {
                if ((try self.skipWhitespaceExpectByte()) == ']') return .array_end;
                // Anything else must be the first element; let the `.value` state classify it.
                self.state = .value;
                continue;
            },

            // Already inside a number.
            .number_minus,
            .number_leading_zero,
            .number_int,
            .number_post_dot,
            .number_frac,
            .number_post_e,
            .number_post_e_sign,
            .number_exp,
            => return .number,

            // Already inside a string (plain content, an escape sequence, or a multi-byte UTF-8 sequence).
            .string,
            .string_backslash,
            .string_backslash_u,
            .string_backslash_u_1,
            .string_backslash_u_2,
            .string_backslash_u_3,
            .string_surrogate_half,
            .string_surrogate_half_backslash,
            .string_surrogate_half_backslash_u,
            .string_surrogate_half_backslash_u_1,
            .string_surrogate_half_backslash_u_2,
            .string_surrogate_half_backslash_u_3,
            .string_utf8_last_byte,
            .string_utf8_second_to_last_byte,
            .string_utf8_second_to_last_byte_guard_against_overlong,
            .string_utf8_second_to_last_byte_guard_against_surrogate_half,
            .string_utf8_third_to_last_byte,
            .string_utf8_third_to_last_byte_guard_against_overlong,
            .string_utf8_third_to_last_byte_guard_against_too_large,
            => return .string,

            // Already inside one of the keyword literals.
            .literal_t, .literal_tr, .literal_tru => return .true,
            .literal_f, .literal_fa, .literal_fal, .literal_fals => return .false,
            .literal_n, .literal_nu, .literal_nul => return .null,
        }
        unreachable;
    }
}
1217
// The scanner's position within the JSON grammar.
// Besides the between-token states, there is one state per byte position inside
// numbers, string escapes, multi-byte UTF-8 sequences, and the keyword literals.
const State = enum {
    // Expecting the first byte of a value.
    value,
    // A value has just been completed; expecting `:`, `,`, a closing bracket, or the end of the document.
    post_value,

    // Expecting a string key or `}`.
    object_start,
    // Expecting a string key; `}` is not accepted in this state.
    object_post_comma,

    // Expecting `]` or the first byte of a value.
    array_start,

    // Part-way through a number.
    number_minus,
    number_leading_zero,
    number_int,
    number_post_dot,
    number_frac,
    number_post_e,
    number_post_e_sign,
    number_exp,

    // Part-way through a string: plain content, a backslash escape,
    // or the `\u` escape expected after a surrogate half.
    string,
    string_backslash,
    string_backslash_u,
    string_backslash_u_1,
    string_backslash_u_2,
    string_backslash_u_3,
    string_surrogate_half,
    string_surrogate_half_backslash,
    string_surrogate_half_backslash_u,
    string_surrogate_half_backslash_u_1,
    string_surrogate_half_backslash_u_2,
    string_surrogate_half_backslash_u_3,

    // Part-way through a multi-byte UTF-8 sequence inside a string.
    // The `guard_against_*` variants accept a narrower range for the next continuation byte
    // than the plain `0x80...0xBF`.
    // From http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
    string_utf8_last_byte, // State A
    string_utf8_second_to_last_byte, // State B
    string_utf8_second_to_last_byte_guard_against_overlong, // State C
    string_utf8_second_to_last_byte_guard_against_surrogate_half, // State D
    string_utf8_third_to_last_byte, // State E
    string_utf8_third_to_last_byte_guard_against_overlong, // State F
    string_utf8_third_to_last_byte_guard_against_too_large, // State G

    // Part-way through `true`, `false`, or `null`.
    // The suffix spells the letters consumed so far (e.g. `literal_tr` expects `u` next).
    literal_t,
    literal_tr,
    literal_tru,
    literal_f,
    literal_fa,
    literal_fal,
    literal_fals,
    literal_n,
    literal_nu,
    literal_nul,
};
1269
/// Returns the byte under the cursor without consuming it.
/// When the cursor is past the end of the current buffer, fails with
/// `error.UnexpectedEndOfInput` if the input has ended, or `error.BufferUnderrun` otherwise.
fn expectByte(self: *const @This()) !u8 {
    if (self.cursor >= self.input.len) {
        // No byte.
        return if (self.is_end_of_input) error.UnexpectedEndOfInput else error.BufferUnderrun;
    }
    return self.input[self.cursor];
}
1278
/// Advances the cursor past spaces, tabs, carriage returns, and newlines,
/// stopping at the first other byte or at the end of the current buffer.
/// When diagnostics are enabled, each newline updates the line bookkeeping.
fn skipWhitespace(self: *@This()) void {
    while (self.cursor < self.input.len) : (self.cursor += 1) {
        const byte = self.input[self.cursor];
        if (byte == '\n') {
            if (self.diagnostics) |diag| {
                diag.line_number += 1;
                // Recording the position of the newline itself means a plain subtraction
                // from the cursor yields a 1-based column number.
                diag.line_start_cursor = self.cursor;
            }
        } else if (byte != ' ' and byte != '\t' and byte != '\r') {
            // Not whitespace; leave the cursor on this byte.
            return;
        }
    }
}
1297
/// Skips whitespace, then returns the byte under the cursor without consuming it.
/// Fails the same way `expectByte()` does when no byte is available.
fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    self.skipWhitespace();
    const byte = try self.expectByte();
    return byte;
}
1302
/// Skips whitespace, then reports whether the document is complete.
/// Returns `true` when the input has ended and no container is open.
/// Returns `false` when another byte is available inside an open container.
/// Errors:
/// * `error.BufferUnderrun`: the current buffer is exhausted but more input may follow.
/// * `error.UnexpectedEndOfInput`: the input ended while a container was still open.
/// * `error.SyntaxError`: a non-whitespace byte follows a completed top-level value.
fn skipWhitespaceCheckEnd(self: *@This()) !bool {
    self.skipWhitespace();

    if (self.cursor < self.input.len) {
        // There is a byte to look at; it is only legal inside a container.
        if (self.stackHeight() == 0) return error.SyntaxError;
        return false;
    }

    // End of buffer.
    if (!self.is_end_of_input) return error.BufferUnderrun;

    // End of everything.
    if (self.stackHeight() != 0) return error.UnexpectedEndOfInput;

    // We did it!
    return true;
}
1320
/// Returns the input bytes from `value_start` up to the cursor,
/// then moves `value_start` up to the cursor.
fn takeValueSlice(self: *@This()) []const u8 {
    // The returned slice is computed before this deferred update runs.
    defer self.value_start = self.cursor;
    return self.input[self.value_start..self.cursor];
}
/// Like `takeValueSlice()`, but leaves the last `trailing_negative_offset` bytes
/// before the cursor out of the returned slice.
/// Returns `""` without touching `value_start` when nothing would remain.
fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
    // The escape sequence may have started before the current input buffer.
    // Comparing against a sum (rather than subtracting) avoids unsigned underflow
    // and guarantees the slice bounds below are valid.
    const excluded_end = self.value_start + trailing_negative_offset;
    if (self.cursor <= excluded_end) return "";

    // When trailing_negative_offset is non-zero, the new value_start doesn't matter,
    // because it is always set again while emitting the .partial_string_escaped_*.
    defer self.value_start = self.cursor;
    return self.input[self.value_start .. self.cursor - trailing_negative_offset];
}
1337
/// Handles running out of buffer while scanning a number.
/// `allow_end` says whether the number may legally end at this point.
/// * More input may follow: returns `.partial_number` with the bytes seen so far,
///   or `error.BufferUnderrun` when there are none.
/// * Input has ended: returns `.number` and moves to `.post_value` if `allow_end`,
///   otherwise fails with `error.UnexpectedEndOfInput`.
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
    const digits = self.takeValueSlice();

    if (!self.is_end_of_input) {
        if (digits.len == 0) return error.BufferUnderrun;
        return .{ .partial_number = digits };
    }

    if (!allow_end) return error.UnexpectedEndOfInput;
    self.state = .post_value;
    return .{ .number = digits };
}
1348
/// Handles running out of buffer while scanning a string.
/// Fails with `error.UnexpectedEndOfInput` when the input has ended.
/// Otherwise returns the string content seen so far as `.partial_string`,
/// leaving out the bytes of any escape sequence that is still in progress,
/// or `error.BufferUnderrun` when there is no content to emit.
/// Must only be called while in one of the string states.
fn endOfBufferInString(self: *@This()) !Token {
    if (self.is_end_of_input) return error.UnexpectedEndOfInput;

    // Number of bytes before the cursor that belong to an unfinished escape sequence.
    const escape_len: usize = switch (self.state) {
        // Not inside an escape sequence: include everything up to the cursor.
        .string,
        .string_utf8_last_byte,
        .string_utf8_second_to_last_byte,
        .string_utf8_second_to_last_byte_guard_against_overlong,
        .string_utf8_second_to_last_byte_guard_against_surrogate_half,
        .string_utf8_third_to_last_byte,
        .string_utf8_third_to_last_byte_guard_against_overlong,
        .string_utf8_third_to_last_byte_guard_against_too_large,
        => 0,

        // Don't include the escape sequence in the partial string.
        .string_backslash => 1,
        .string_backslash_u => 2,
        .string_backslash_u_1 => 3,
        .string_backslash_u_2 => 4,
        .string_backslash_u_3 => 5,
        .string_surrogate_half => 6,
        .string_surrogate_half_backslash => 7,
        .string_surrogate_half_backslash_u => 8,
        .string_surrogate_half_backslash_u_1 => 9,
        .string_surrogate_half_backslash_u_2 => 10,
        .string_surrogate_half_backslash_u_3 => 11,

        else => unreachable,
    };

    const content = self.takeValueSliceMinusTrailingOffset(escape_len);
    if (content.len == 0) return error.BufferUnderrun;
    return .{ .partial_string = content };
}
1381
/// Encodes `code_point` as UTF-8 and wraps the bytes in the `.partial_string_escaped_N`
/// token whose `N` matches the encoded length.
/// `code_point` must be encodable; an encoding failure is asserted to be impossible.
fn partialStringCodepoint(code_point: u21) Token {
    var encoded: [4]u8 = undefined;
    const encoded_len = std.unicode.utf8Encode(code_point, &encoded) catch unreachable;
    return switch (encoded_len) {
        1 => .{ .partial_string_escaped_1 = encoded[0..1].* },
        2 => .{ .partial_string_escaped_2 = encoded[0..2].* },
        3 => .{ .partial_string_escaped_3 = encoded[0..3].* },
        4 => .{ .partial_string_escaped_4 = encoded[0..4].* },
        else => unreachable,
    };
}
1392
/// Scans all of `s` and reports whether it is well-formed JSON.
/// Returns `false` on `SyntaxError` or `UnexpectedEndOfInput`, and `true` once
/// `.end_of_document` has been reached.
/// Allocator errors are returned as-is; they are unlikely,
/// but can be caused by extreme nesting depth in the input.
pub fn validate(allocator: Allocator, s: []const u8) Allocator.Error!bool {
    var scanner = Scanner.initCompleteInput(allocator, s);
    defer scanner.deinit();

    while (true) {
        if (scanner.next()) |token| {
            if (token == .end_of_document) return true;
        } else |err| switch (err) {
            error.SyntaxError, error.UnexpectedEndOfInput => return false,
            error.OutOfMemory => return error.OutOfMemory,
            // Asserted impossible: the scanner was initialized via `initCompleteInput`.
            error.BufferUnderrun => unreachable,
        }
    }
}
1412
/// The parsing errors are divided into two categories:
/// * `SyntaxError` is for clearly malformed JSON documents,
///   such as giving an input document that isn't JSON at all.
/// * `UnexpectedEndOfInput` is for signaling that everything's been
///   valid so far, but the input appears to be truncated for some reason.
/// Note that a completely empty (or whitespace-only) input will give `UnexpectedEndOfInput`.
/// `validate()` reports either of these errors by returning `false`.
pub const Error = error{ SyntaxError, UnexpectedEndOfInput };

/// Used by `json.reader`.
/// The value is `0x1000` = 4096 bytes (4 KiB).
pub const default_buffer_size = 0x1000;
1423
1424/// The tokens emitted by `std.json.Scanner` and `std.json.Reader` `.next*()` functions follow this grammar:
1425/// ```
1426/// <document> = <value> .end_of_document
1427/// <value> =
1428/// | <object>
1429/// | <array>
1430/// | <number>
1431/// | <string>
1432/// | .true
1433/// | .false
1434/// | .null
1435/// <object> = .object_begin ( <string> <value> )* .object_end
1436/// <array> = .array_begin ( <value> )* .array_end
1437/// <number> = <It depends. See below.>
1438/// <string> = <It depends. See below.>
1439/// ```
1440///
1441/// What you get for `<number>` and `<string>` values depends on which `next*()` method you call:
1442///
1443/// ```
1444/// next():
1445/// <number> = ( .partial_number )* .number
1446/// <string> = ( <partial_string> )* .string
1447/// <partial_string> =
1448/// | .partial_string
1449/// | .partial_string_escaped_1
1450/// | .partial_string_escaped_2
1451/// | .partial_string_escaped_3
1452/// | .partial_string_escaped_4
1453///
1454/// nextAlloc*(..., .alloc_always):
1455/// <number> = .allocated_number
1456/// <string> = .allocated_string
1457///
1458/// nextAlloc*(..., .alloc_if_needed):
1459/// <number> =
1460/// | .number
1461/// | .allocated_number
1462/// <string> =
1463/// | .string
1464/// | .allocated_string
1465/// ```
1466///
1467/// For all tokens with a `[]const u8`, `[]u8`, or `[n]u8` payload, the payload represents the content of the value.
1468/// For number values, this is the representation of the number exactly as it appears in the input.
1469/// For strings, this is the content of the string after resolving escape sequences.
1470///
1471/// For `.allocated_number` and `.allocated_string`, the `[]u8` payloads are allocations made with the given allocator.
1472/// You are responsible for managing that memory. `json.Reader.deinit()` does *not* free those allocations.
1473///
1474/// The `.partial_*` tokens indicate that a value spans multiple input buffers or that a string contains escape sequences.
1475/// To get a complete value in memory, you need to concatenate the values yourself.
1476/// Calling `nextAlloc*()` does this for you, and returns an `.allocated_*` token with the result.
1477///
1478/// For tokens with a `[]const u8` payload, the payload is a slice into the current input buffer.
1479/// The memory may become undefined during the next call to `json.Scanner.feedInput()`
1480/// or any `json.Reader` method whose return error set includes `json.Error`.
/// To keep the value persistently, it is recommended to make a copy or to use `.alloc_always`,
1482/// which makes a copy for you.
1483///
1484/// Note that `.number` and `.string` tokens that follow `.partial_*` tokens may have `0` length to indicate that
1485/// the previously partial value is completed with no additional bytes.
1486/// (This can happen when the break between input buffers happens to land on the exact end of a value. E.g. `"[1234"`, `"]"`.)
1487/// `.partial_*` tokens never have `0` length.
1488///
1489/// The recommended strategy for using the different `next*()` methods is something like this:
1490///
1491/// When you're expecting an object key, use `.alloc_if_needed`.
1492/// You often don't need a copy of the key string to persist; you might just check which field it is.
1493/// In the case that the key happens to require an allocation, free it immediately after checking it.
1494///
1495/// When you're expecting a meaningful string value (such as on the right of a `:`),
1496/// use `.alloc_always` in order to keep the value valid throughout parsing the rest of the document.
1497///
1498/// When you're expecting a number value, use `.alloc_if_needed`.
1499/// You're probably going to be parsing the string representation of the number into a numeric representation,
1500/// so you need the complete string representation only temporarily.
1501///
1502/// When you're skipping an unrecognized value, use `skipValue()`.
pub const Token = union(enum) {
    // Structural tokens; these carry no payload.
    object_begin,
    object_end,
    array_begin,
    array_end,

    // Keyword literals; these carry no payload.
    true,
    false,
    null,

    /// The final (or only) piece of a number; a slice into the current input buffer.
    number: []const u8,
    /// A non-final piece of a number; a slice into the current input buffer.
    partial_number: []const u8,
    /// A complete number in memory allocated with the caller's allocator; the caller owns it.
    allocated_number: []u8,

    /// The final (or only) piece of a string; a slice into the current input buffer.
    string: []const u8,
    /// A non-final piece of string content; a slice into the current input buffer.
    partial_string: []const u8,
    // A piece of string content stored inline; the suffix is the byte count.
    // The scanner emits these for code points it has encoded as UTF-8.
    partial_string_escaped_1: [1]u8,
    partial_string_escaped_2: [2]u8,
    partial_string_escaped_3: [3]u8,
    partial_string_escaped_4: [4]u8,
    /// A complete string in memory allocated with the caller's allocator; the caller owns it.
    allocated_string: []u8,

    // Emitted once the top-level value is complete and the input has ended.
    end_of_document,
};
1527
/// This is only used in `peekNextTokenType()` and gives a categorization based on
/// the first byte of the next token that will be emitted from a `next*()` call.
/// There are no partial or allocated variants here:
/// a number or string value is reported simply as `.number` or `.string`,
/// including when the scanner is already part-way through that value.
pub const TokenType = enum {
    object_begin,
    object_end,
    array_begin,
    array_end,
    true,
    false,
    null,
    number,
    string,
    end_of_document,
};
1541
/// Tracks position information for error reporting.
/// To enable diagnostics, declare `var diagnostics = Diagnostics{};` then call `source.enableDiagnostics(&diagnostics);`
/// where `source` is either a `std.json.Reader` or a `std.json.Scanner` that has just been initialized.
/// At any time, notably just after an error, call `getLine()`, `getColumn()`, and/or `getByteOffset()`
/// to get meaningful information from this.
pub const Diagnostics = struct {
    line_number: u64 = 1,
    /// Starts just "before" the input buffer (the all-ones value, i.e. -1 in wrapping arithmetic)
    /// to get a 1-based column for line 1.
    line_start_cursor: usize = std.math.maxInt(usize),
    total_bytes_before_current_input: u64 = 0,
    /// Must point at a live cursor before `getColumn()` or `getByteOffset()` is called.
    cursor_pointer: *const usize = undefined,

    /// Starts at 1.
    pub fn getLine(self: *const Diagnostics) u64 {
        return self.line_number;
    }

    /// Starts at 1.
    pub fn getColumn(self: *const Diagnostics) u64 {
        const cursor = self.cursor_pointer.*;
        // Wrapping subtraction so that the initial "-1" line start yields column `cursor + 1`.
        return cursor -% self.line_start_cursor;
    }

    /// Starts at 0. Measures the byte offset since the start of the input.
    pub fn getByteOffset(self: *const Diagnostics) u64 {
        const cursor: u64 = self.cursor_pointer.*;
        return self.total_bytes_before_current_input + cursor;
    }
};
1565
/// See the documentation for `std.json.Token`.
/// * `.alloc_if_needed`: `nextAlloc*()` yields `.number`/`.string` or `.allocated_number`/`.allocated_string`.
/// * `.alloc_always`: `nextAlloc*()` yields `.allocated_number`/`.allocated_string`.
pub const AllocWhen = enum { alloc_if_needed, alloc_always };

/// For security, the maximum size allocated to store a single string or number value is limited to 4MiB by default.
/// This limit can be specified by calling `nextAllocMax()` instead of `nextAlloc()`.
/// The value is in bytes: `4 * 1024 * 1024` = 4194304.
pub const default_max_value_len = 4 * 1024 * 1024;
1572
/// Pairs a `Scanner` with a `*std.Io.Reader` that supplies its input.
/// All `next*()` methods here handle `error.BufferUnderrun` from `std.json.Scanner`, and then read from the reader.
pub const Reader = struct {
    scanner: Scanner,
    reader: *std.Io.Reader,

    /// The allocator is only used to track `[]` and `{}` nesting levels.
    pub fn init(allocator: Allocator, io_reader: *std.Io.Reader) Reader {
        return Reader{
            .scanner = Scanner.initStreaming(allocator),
            .reader = io_reader,
        };
    }

    /// Releases the scanner's resources and invalidates `self`.
    pub fn deinit(self: *Reader) void {
        self.scanner.deinit();
        self.* = undefined;
    }

    /// Calls `std.json.Scanner.enableDiagnostics`.
    pub fn enableDiagnostics(self: *Reader, diagnostics: *Diagnostics) void {
        self.scanner.enableDiagnostics(diagnostics);
    }

    pub const NextError = std.Io.Reader.Error || Error || Allocator.Error;
    pub const SkipError = Reader.NextError;
    pub const AllocError = Reader.NextError || error{ValueTooLong};
    pub const PeekError = std.Io.Reader.Error || Error;

    /// Equivalent to `nextAllocMax(allocator, when, default_max_value_len);`
    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    pub fn nextAlloc(self: *Reader, allocator: Allocator, when: AllocWhen) Reader.AllocError!Token {
        return self.nextAllocMax(allocator, when, default_max_value_len);
    }

    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    pub fn nextAllocMax(self: *Reader, allocator: Allocator, when: AllocWhen, max_value_len: usize) Reader.AllocError!Token {
        const token_type = try self.peekNextTokenType();
        const is_number = switch (token_type) {
            .number => true,
            .string => false,

            // Simple tokens never alloc.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => return try self.next(),
        };

        var value_list = std.array_list.Managed(u8).init(allocator);
        errdefer value_list.deinit();

        const borrowed = try self.allocNextIntoArrayListMax(&value_list, when, max_value_len);
        if (borrowed) |slice| {
            // The value did not need to be copied into `value_list`.
            return if (is_number) Token{ .number = slice } else Token{ .string = slice };
        }

        // The value was accumulated in `value_list`; hand ownership to the caller.
        const owned = try value_list.toOwnedSlice();
        return if (is_number) Token{ .allocated_number = owned } else Token{ .allocated_string = owned };
    }

    /// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len);`
    pub fn allocNextIntoArrayList(self: *Reader, value_list: *std.array_list.Managed(u8), when: AllocWhen) Reader.AllocError!?[]const u8 {
        return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
    }

    /// Calls `std.json.Scanner.allocNextIntoArrayListMax` and handles `error.BufferUnderrun`.
    pub fn allocNextIntoArrayListMax(self: *Reader, value_list: *std.array_list.Managed(u8), when: AllocWhen, max_value_len: usize) Reader.AllocError!?[]const u8 {
        while (true) {
            if (self.scanner.allocNextIntoArrayListMax(value_list, when, max_value_len)) |maybe_slice| {
                return maybe_slice;
            } else |err| switch (err) {
                // Fetch more input, then retry.
                error.BufferUnderrun => try self.refillBuffer(),
                else => |other_err| return other_err,
            }
        }
    }

    /// Like `std.json.Scanner.skipValue`, but handles `error.BufferUnderrun`.
    pub fn skipValue(self: *Reader) Reader.SkipError!void {
        switch (try self.peekNextTokenType()) {
            .object_begin, .array_begin => try self.skipUntilStackHeight(self.stackHeight()),

            .number, .string => {
                // Discard partial tokens until the final piece of the value arrives.
                while (true) {
                    switch (try self.next()) {
                        .number, .string => return,

                        .partial_number,
                        .partial_string,
                        .partial_string_escaped_1,
                        .partial_string_escaped_2,
                        .partial_string_escaped_3,
                        .partial_string_escaped_4,
                        => {},

                        else => unreachable,
                    }
                }
            },

            .true, .false, .null => _ = try self.next(),

            .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.
        }
    }

    /// Like `std.json.Scanner.skipUntilStackHeight()` but handles `error.BufferUnderrun`.
    pub fn skipUntilStackHeight(self: *Reader, terminal_stack_height: usize) Reader.NextError!void {
        while (true) {
            if (self.scanner.skipUntilStackHeight(terminal_stack_height)) |_| {
                return;
            } else |err| switch (err) {
                // Fetch more input, then retry.
                error.BufferUnderrun => try self.refillBuffer(),
                else => |other_err| return other_err,
            }
        }
    }

    /// Calls `std.json.Scanner.stackHeight`.
    pub fn stackHeight(self: *const Reader) usize {
        return self.scanner.stackHeight();
    }

    /// Calls `std.json.Scanner.ensureTotalStackCapacity`.
    pub fn ensureTotalStackCapacity(self: *Reader, height: usize) Allocator.Error!void {
        try self.scanner.ensureTotalStackCapacity(height);
    }

    /// See `std.json.Token` for documentation of this function.
    pub fn next(self: *Reader) Reader.NextError!Token {
        while (true) {
            if (self.scanner.next()) |token| {
                return token;
            } else |err| switch (err) {
                // Fetch more input, then retry.
                error.BufferUnderrun => try self.refillBuffer(),
                else => |other_err| return other_err,
            }
        }
    }

    /// See `std.json.Scanner.peekNextTokenType()`.
    pub fn peekNextTokenType(self: *Reader) Reader.PeekError!TokenType {
        while (true) {
            if (self.scanner.peekNextTokenType()) |token_type| {
                return token_type;
            } else |err| switch (err) {
                // Fetch more input, then retry.
                error.BufferUnderrun => try self.refillBuffer(),
                else => |other_err| return other_err,
            }
        }
    }

    /// Hands the scanner its next input buffer, taken from the reader's buffered data.
    /// When the reader reports `error.EndOfStream`, the scanner is told the input has ended instead.
    fn refillBuffer(self: *Reader) std.Io.Reader.Error!void {
        if (self.reader.peekGreedy(1)) |input| {
            self.reader.toss(input.len);
            self.scanner.feedInput(input);
        } else |err| switch (err) {
            error.ReadFailed => return error.ReadFailed,
            error.EndOfStream => self.scanner.endInput(),
        }
    }
};
1744
// Values matched against `self.stack.peek()` to tell which kind of container is innermost:
// inside an object, a comma is followed by a key; inside an array, by a value.
const OBJECT_MODE = 0;
const ARRAY_MODE = 1;
1747
/// Appends `buf` to `list`, unless the resulting length would exceed `max_value_len`
/// (or overflow `usize`), in which case `error.ValueTooLong` is returned and `list` is unchanged.
/// Allocation failures from the list are returned as-is.
fn appendSlice(list: *std.array_list.Managed(u8), buf: []const u8, max_value_len: usize) !void {
    const total_len, const overflowed = @addWithOverflow(list.items.len, buf.len);
    if (overflowed != 0) return error.ValueTooLong;
    if (total_len > max_value_len) return error.ValueTooLong;
    try list.appendSlice(buf);
}
1753
/// Hints whether integer parsing or float parsing should be used on `value`,
/// the slice you get from a `Token.number` or `Token.allocated_number`.
/// Returns true if the number doesn't contain any fraction or exponent components, and is not `-0`.
/// Note, the numeric value encoded by the value may still be an integer, such as `1.0`;
/// that still returns false because of the `.`.
/// This function will not give meaningful results on non-numeric input.
pub fn isNumberFormattedLikeAnInteger(value: []const u8) bool {
    if (std.mem.eql(u8, value, "-0")) return false;
    for (value) |byte| {
        switch (byte) {
            // A fraction or exponent marker.
            '.', 'e', 'E' => return false,
            else => {},
        }
    }
    return true;
}
1763
// Reference the companion test file so that its tests are included when this file is tested.
test {
    _ = @import("./scanner_test.zig");
}