zig/lib/std/json/Scanner.zig at master

   1//! The lowest level parsing API in this package;
   2//! supports streaming input with a low memory footprint.
   3//! The memory requirement is `O(d)` where d is the nesting depth of `[]` or `{}` containers in the input.
   4//! Specifically `d/8` bytes are required for this purpose,
   5//! with some extra buffer according to the implementation of `std.ArrayList`.
   6//!
   7//! This scanner can emit partial tokens; see `std.json.Token`.
   8//! The input to this class is a sequence of input buffers that you must supply one at a time.
   9//! Call `feedInput()` with the first buffer, then call `next()` repeatedly until `error.BufferUnderrun` is returned.
  10//! Then call `feedInput()` again and so forth.
  11//! Call `endInput()` when the last input buffer has been given to `feedInput()`, either immediately after calling `feedInput()`,
  12//! or when `error.BufferUnderrun` requests more data and there is no more.
  13//! Be sure to call `next()` after calling `endInput()` until `Token.end_of_document` has been returned.
  14//!
  15//! Notes on standards compliance: https://datatracker.ietf.org/doc/html/rfc8259
  16//! * RFC 8259 requires JSON documents be valid UTF-8,
  17//!   but makes an allowance for systems that are "part of a closed ecosystem".
  18//!   I have no idea what that's supposed to mean in the context of a standard specification.
  19//!   This implementation requires inputs to be valid UTF-8.
  20//! * RFC 8259 contradicts itself regarding whether lowercase is allowed in \u hex digits,
  21//!   but this is probably a bug in the spec, and it's clear that lowercase is meant to be allowed.
  22//!   (RFC 5234 defines HEXDIG to only allow uppercase.)
  23//! * When RFC 8259 refers to a "character", I assume they really mean a "Unicode scalar value".
  24//!   See http://www.unicode.org/glossary/#unicode_scalar_value .
  25//! * RFC 8259 doesn't explicitly disallow unpaired surrogate halves in \u escape sequences,
  26//!   but vaguely implies that \u escapes are for encoding Unicode "characters" (i.e. Unicode scalar values?),
  27//!   which would mean that unpaired surrogate halves are forbidden.
  28//!   By contrast ECMA-404 (a competing(/compatible?) JSON standard, which JavaScript's JSON.parse() conforms to)
  29//!   explicitly allows unpaired surrogate halves.
  30//!   This implementation forbids unpaired surrogate halves in \u sequences.
  31//!   If a high surrogate half appears in a \u sequence,
  32//!   then a low surrogate half must immediately follow in \u notation.
  33//! * RFC 8259 allows implementations to "accept non-JSON forms or extensions".
  34//!   This implementation does not accept any of that.
  35//! * RFC 8259 allows implementations to put limits on "the size of texts",
  36//!   "the maximum depth of nesting", "the range and precision of numbers",
  37//!   and "the length and character contents of strings".
  38//!   This low-level implementation does not limit these,
  39//!   except where noted above, and except that nesting depth requires memory allocation.
  40//!   Note that this low-level API does not interpret numbers numerically,
  41//!   but simply emits their source form for some higher level code to make sense of.
  42//! * This low-level implementation allows duplicate object keys,
  43//!   and key/value pairs are emitted in the order they appear in the input.
  44
  45const Scanner = @This();
  46const std = @import("std");
  47
  48const Allocator = std.mem.Allocator;
  49const assert = std.debug.assert;
  50const BitStack = std.BitStack;
  51
/// Current state of the tokenizer state machine; `next()` dispatches on this.
state: State = .value,
/// Set when the string being scanned started in object-key position.
/// The `.post_value` state checks and clears it to require a `:` after the key.
string_is_object_key: bool = false,
/// One entry per open container, recording object mode vs array mode.
/// Pushed on `{` / `[` and popped on `}` / `]`.
stack: BitStack,
/// Index into `input` at which the value slice currently being scanned begins.
value_start: usize = undefined,
/// NOTE(review): presumably scratch space for decoding `\u` escapes (a surrogate pair
/// needs two UTF-16 code units); its use is not visible in this part of the file.
utf16_code_units: [2]u16 = undefined,

/// The input buffer currently being scanned.
input: []const u8 = "",
/// Index into `input` of the next byte to examine.
cursor: usize = 0,
/// True once `endInput()` was called or the scanner was created by `initCompleteInput()`.
is_end_of_input: bool = false,
/// Optional diagnostics object registered through `enableDiagnostics()`.
diagnostics: ?*Diagnostics = null,
  62
  63/// The allocator is only used to track `[]` and `{}` nesting levels.
  64pub fn initStreaming(allocator: Allocator) @This() {
  65    return .{
  66        .stack = BitStack.init(allocator),
  67    };
  68}
  69/// Use this if your input is a single slice.
  70/// This is effectively equivalent to:
  71/// ```
  72/// initStreaming(allocator);
  73/// feedInput(complete_input);
  74/// endInput();
  75/// ```
  76pub fn initCompleteInput(allocator: Allocator, complete_input: []const u8) @This() {
  77    return .{
  78        .stack = BitStack.init(allocator),
  79        .input = complete_input,
  80        .is_end_of_input = true,
  81    };
  82}
  83pub fn deinit(self: *@This()) void {
  84    self.stack.deinit();
  85    self.* = undefined;
  86}
  87
  88pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
  89    diagnostics.cursor_pointer = &self.cursor;
  90    self.diagnostics = diagnostics;
  91}
  92
  93/// Call this whenever you get `error.BufferUnderrun` from `next()`.
  94/// When there is no more input to provide, call `endInput()`.
  95pub fn feedInput(self: *@This(), input: []const u8) void {
  96    assert(self.cursor == self.input.len); // Not done with the last input slice.
  97    if (self.diagnostics) |diag| {
  98        diag.total_bytes_before_current_input += self.input.len;
  99        // This usually goes "negative" to measure how far before the beginning
 100        // of the new buffer the current line started.
 101        diag.line_start_cursor -%= self.cursor;
 102    }
 103    self.input = input;
 104    self.cursor = 0;
 105    self.value_start = 0;
 106}
 107/// Call this when you will no longer call `feedInput()` anymore.
 108/// This can be called either immediately after the last `feedInput()`,
 109/// or at any time afterward, such as when getting `error.BufferUnderrun` from `next()`.
 110/// Don't forget to call `next*()` after `endInput()` until you get `.end_of_document`.
 111pub fn endInput(self: *@This()) void {
 112    self.is_end_of_input = true;
 113}
 114
/// Error set of `next()` and `skipUntilStackHeight()`;
/// includes `error.BufferUnderrun`, which requests more input.
pub const NextError = Error || Allocator.Error || error{BufferUnderrun};
/// Error set of `nextAlloc()` and `nextAllocMax()`.
pub const AllocError = Error || Allocator.Error || error{ValueTooLong};
/// NOTE(review): presumably the error set of `peekNextTokenType()`;
/// that function is not visible in this part of the file.
pub const PeekError = Error || error{BufferUnderrun};
/// Error set of `skipValue()`.
pub const SkipError = Error || Allocator.Error;
/// Error set of `allocNextIntoArrayList()` and `allocNextIntoArrayListMax()`.
pub const AllocIntoArrayListError = AllocError || error{BufferUnderrun};
 120
 121/// Equivalent to `nextAllocMax(allocator, when, default_max_value_len);`
 122/// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
 123/// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
 124pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
 125    return self.nextAllocMax(allocator, when, default_max_value_len);
 126}
 127
 128/// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
 129/// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
 130pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
 131    assert(self.is_end_of_input); // This function is not available in streaming mode.
 132    const token_type = self.peekNextTokenType() catch |e| switch (e) {
 133        error.BufferUnderrun => unreachable,
 134        else => |err| return err,
 135    };
 136    switch (token_type) {
 137        .number, .string => {
 138            var value_list = std.array_list.Managed(u8).init(allocator);
 139            errdefer {
 140                value_list.deinit();
 141            }
 142            if (self.allocNextIntoArrayListMax(&value_list, when, max_value_len) catch |e| switch (e) {
 143                error.BufferUnderrun => unreachable,
 144                else => |err| return err,
 145            }) |slice| {
 146                return if (token_type == .number)
 147                    Token{ .number = slice }
 148                else
 149                    Token{ .string = slice };
 150            } else {
 151                return if (token_type == .number)
 152                    Token{ .allocated_number = try value_list.toOwnedSlice() }
 153                else
 154                    Token{ .allocated_string = try value_list.toOwnedSlice() };
 155            }
 156        },
 157
 158        // Simple tokens never alloc.
 159        .object_begin,
 160        .object_end,
 161        .array_begin,
 162        .array_end,
 163        .true,
 164        .false,
 165        .null,
 166        .end_of_document,
 167        => return self.next() catch |e| switch (e) {
 168            error.BufferUnderrun => unreachable,
 169            else => |err| return err,
 170        },
 171    }
 172}
 173
 174/// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len);`
 175pub fn allocNextIntoArrayList(self: *@This(), value_list: *std.array_list.Managed(u8), when: AllocWhen) AllocIntoArrayListError!?[]const u8 {
 176    return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
 177}
 178/// The next token type must be either `.number` or `.string`. See `peekNextTokenType()`.
 179/// When allocation is not necessary with `.alloc_if_needed`,
 180/// this method returns the content slice from the input buffer, and `value_list` is not touched.
 181/// When allocation is necessary or with `.alloc_always`, this method concatenates partial tokens into the given `value_list`,
 182/// and returns `null` once the final `.number` or `.string` token has been written into it.
 183/// In case of an `error.BufferUnderrun`, partial values will be left in the given value_list.
 184/// The given `value_list` is never reset by this method, so an `error.BufferUnderrun` situation
 185/// can be resumed by passing the same array list in again.
 186/// This method does not indicate whether the token content being returned is for a `.number` or `.string` token type;
 187/// the caller of this method is expected to know which type of token is being processed.
 188pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *std.array_list.Managed(u8), when: AllocWhen, max_value_len: usize) AllocIntoArrayListError!?[]const u8 {
 189    while (true) {
 190        const token = try self.next();
 191        switch (token) {
 192            // Accumulate partial values.
 193            .partial_number, .partial_string => |slice| {
 194                try appendSlice(value_list, slice, max_value_len);
 195            },
 196            .partial_string_escaped_1 => |buf| {
 197                try appendSlice(value_list, buf[0..], max_value_len);
 198            },
 199            .partial_string_escaped_2 => |buf| {
 200                try appendSlice(value_list, buf[0..], max_value_len);
 201            },
 202            .partial_string_escaped_3 => |buf| {
 203                try appendSlice(value_list, buf[0..], max_value_len);
 204            },
 205            .partial_string_escaped_4 => |buf| {
 206                try appendSlice(value_list, buf[0..], max_value_len);
 207            },
 208
 209            // Return complete values.
 210            .number => |slice| {
 211                if (when == .alloc_if_needed and value_list.items.len == 0) {
 212                    // No alloc necessary.
 213                    return slice;
 214                }
 215                try appendSlice(value_list, slice, max_value_len);
 216                // The token is complete.
 217                return null;
 218            },
 219            .string => |slice| {
 220                if (when == .alloc_if_needed and value_list.items.len == 0) {
 221                    // No alloc necessary.
 222                    return slice;
 223                }
 224                try appendSlice(value_list, slice, max_value_len);
 225                // The token is complete.
 226                return null;
 227            },
 228
 229            .object_begin,
 230            .object_end,
 231            .array_begin,
 232            .array_end,
 233            .true,
 234            .false,
 235            .null,
 236            .end_of_document,
 237            => unreachable, // Only .number and .string token types are allowed here. Check peekNextTokenType() before calling this.
 238
 239            .allocated_number, .allocated_string => unreachable,
 240        }
 241    }
 242}
 243
 244/// This function is only available after `endInput()` (or `initCompleteInput()`) has been called.
 245/// If the next token type is `.object_begin` or `.array_begin`,
 246/// this function calls `next()` repeatedly until the corresponding `.object_end` or `.array_end` is found.
 247/// If the next token type is `.number` or `.string`,
 248/// this function calls `next()` repeatedly until the (non `.partial_*`) `.number` or `.string` token is found.
 249/// If the next token type is `.true`, `.false`, or `.null`, this function calls `next()` once.
 250/// The next token type must not be `.object_end`, `.array_end`, or `.end_of_document`;
 251/// see `peekNextTokenType()`.
 252pub fn skipValue(self: *@This()) SkipError!void {
 253    assert(self.is_end_of_input); // This function is not available in streaming mode.
 254    switch (self.peekNextTokenType() catch |e| switch (e) {
 255        error.BufferUnderrun => unreachable,
 256        else => |err| return err,
 257    }) {
 258        .object_begin, .array_begin => {
 259            self.skipUntilStackHeight(self.stackHeight()) catch |e| switch (e) {
 260                error.BufferUnderrun => unreachable,
 261                else => |err| return err,
 262            };
 263        },
 264        .number, .string => {
 265            while (true) {
 266                switch (self.next() catch |e| switch (e) {
 267                    error.BufferUnderrun => unreachable,
 268                    else => |err| return err,
 269                }) {
 270                    .partial_number,
 271                    .partial_string,
 272                    .partial_string_escaped_1,
 273                    .partial_string_escaped_2,
 274                    .partial_string_escaped_3,
 275                    .partial_string_escaped_4,
 276                    => continue,
 277
 278                    .number, .string => break,
 279
 280                    else => unreachable,
 281                }
 282            }
 283        },
 284        .true, .false, .null => {
 285            _ = self.next() catch |e| switch (e) {
 286                error.BufferUnderrun => unreachable,
 287                else => |err| return err,
 288            };
 289        },
 290
 291        .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.
 292    }
 293}
 294
 295/// Skip tokens until an `.object_end` or `.array_end` token results in a `stackHeight()` equal the given stack height.
 296/// Unlike `skipValue()`, this function is available in streaming mode.
 297pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
 298    while (true) {
 299        switch (try self.next()) {
 300            .object_end, .array_end => {
 301                if (self.stackHeight() == terminal_stack_height) break;
 302            },
 303            .end_of_document => unreachable,
 304            else => continue,
 305        }
 306    }
 307}
 308
 309/// The depth of `{}` or `[]` nesting levels at the current position.
 310pub fn stackHeight(self: *const @This()) usize {
 311    return self.stack.bit_len;
 312}
 313
 314/// Pre allocate memory to hold the given number of nesting levels.
 315/// `stackHeight()` up to the given number will not cause allocations.
 316pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
 317    try self.stack.ensureTotalCapacity(height);
 318}
 319
 320/// See `std.json.Token` for documentation of this function.
 321pub fn next(self: *@This()) NextError!Token {
 322    state_loop: while (true) {
 323        switch (self.state) {
 324            .value => {
 325                switch (try self.skipWhitespaceExpectByte()) {
 326                    // Object, Array
 327                    '{' => {
 328                        try self.stack.push(OBJECT_MODE);
 329                        self.cursor += 1;
 330                        self.state = .object_start;
 331                        return .object_begin;
 332                    },
 333                    '[' => {
 334                        try self.stack.push(ARRAY_MODE);
 335                        self.cursor += 1;
 336                        self.state = .array_start;
 337                        return .array_begin;
 338                    },
 339
 340                    // String
 341                    '"' => {
 342                        self.cursor += 1;
 343                        self.value_start = self.cursor;
 344                        self.state = .string;
 345                        continue :state_loop;
 346                    },
 347
 348                    // Number
 349                    '1'...'9' => {
 350                        self.value_start = self.cursor;
 351                        self.cursor += 1;
 352                        self.state = .number_int;
 353                        continue :state_loop;
 354                    },
 355                    '0' => {
 356                        self.value_start = self.cursor;
 357                        self.cursor += 1;
 358                        self.state = .number_leading_zero;
 359                        continue :state_loop;
 360                    },
 361                    '-' => {
 362                        self.value_start = self.cursor;
 363                        self.cursor += 1;
 364                        self.state = .number_minus;
 365                        continue :state_loop;
 366                    },
 367
 368                    // literal values
 369                    't' => {
 370                        self.cursor += 1;
 371                        self.state = .literal_t;
 372                        continue :state_loop;
 373                    },
 374                    'f' => {
 375                        self.cursor += 1;
 376                        self.state = .literal_f;
 377                        continue :state_loop;
 378                    },
 379                    'n' => {
 380                        self.cursor += 1;
 381                        self.state = .literal_n;
 382                        continue :state_loop;
 383                    },
 384
 385                    else => return error.SyntaxError,
 386                }
 387            },
 388
 389            .post_value => {
 390                if (try self.skipWhitespaceCheckEnd()) return .end_of_document;
 391
 392                const c = self.input[self.cursor];
 393                if (self.string_is_object_key) {
 394                    self.string_is_object_key = false;
 395                    switch (c) {
 396                        ':' => {
 397                            self.cursor += 1;
 398                            self.state = .value;
 399                            continue :state_loop;
 400                        },
 401                        else => return error.SyntaxError,
 402                    }
 403                }
 404
 405                switch (c) {
 406                    '}' => {
 407                        if (self.stack.pop() != OBJECT_MODE) return error.SyntaxError;
 408                        self.cursor += 1;
 409                        // stay in .post_value state.
 410                        return .object_end;
 411                    },
 412                    ']' => {
 413                        if (self.stack.pop() != ARRAY_MODE) return error.SyntaxError;
 414                        self.cursor += 1;
 415                        // stay in .post_value state.
 416                        return .array_end;
 417                    },
 418                    ',' => {
 419                        switch (self.stack.peek()) {
 420                            OBJECT_MODE => {
 421                                self.state = .object_post_comma;
 422                            },
 423                            ARRAY_MODE => {
 424                                self.state = .value;
 425                            },
 426                        }
 427                        self.cursor += 1;
 428                        continue :state_loop;
 429                    },
 430                    else => return error.SyntaxError,
 431                }
 432            },
 433
 434            .object_start => {
 435                switch (try self.skipWhitespaceExpectByte()) {
 436                    '"' => {
 437                        self.cursor += 1;
 438                        self.value_start = self.cursor;
 439                        self.state = .string;
 440                        self.string_is_object_key = true;
 441                        continue :state_loop;
 442                    },
 443                    '}' => {
 444                        self.cursor += 1;
 445                        _ = self.stack.pop();
 446                        self.state = .post_value;
 447                        return .object_end;
 448                    },
 449                    else => return error.SyntaxError,
 450                }
 451            },
 452            .object_post_comma => {
 453                switch (try self.skipWhitespaceExpectByte()) {
 454                    '"' => {
 455                        self.cursor += 1;
 456                        self.value_start = self.cursor;
 457                        self.state = .string;
 458                        self.string_is_object_key = true;
 459                        continue :state_loop;
 460                    },
 461                    else => return error.SyntaxError,
 462                }
 463            },
 464
 465            .array_start => {
 466                switch (try self.skipWhitespaceExpectByte()) {
 467                    ']' => {
 468                        self.cursor += 1;
 469                        _ = self.stack.pop();
 470                        self.state = .post_value;
 471                        return .array_end;
 472                    },
 473                    else => {
 474                        self.state = .value;
 475                        continue :state_loop;
 476                    },
 477                }
 478            },
 479
 480            .number_minus => {
 481                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
 482                switch (self.input[self.cursor]) {
 483                    '0' => {
 484                        self.cursor += 1;
 485                        self.state = .number_leading_zero;
 486                        continue :state_loop;
 487                    },
 488                    '1'...'9' => {
 489                        self.cursor += 1;
 490                        self.state = .number_int;
 491                        continue :state_loop;
 492                    },
 493                    else => return error.SyntaxError,
 494                }
 495            },
 496            .number_leading_zero => {
 497                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
 498                switch (self.input[self.cursor]) {
 499                    '.' => {
 500                        self.cursor += 1;
 501                        self.state = .number_post_dot;
 502                        continue :state_loop;
 503                    },
 504                    'e', 'E' => {
 505                        self.cursor += 1;
 506                        self.state = .number_post_e;
 507                        continue :state_loop;
 508                    },
 509                    else => {
 510                        self.state = .post_value;
 511                        return Token{ .number = self.takeValueSlice() };
 512                    },
 513                }
 514            },
 515            .number_int => {
 516                while (self.cursor < self.input.len) : (self.cursor += 1) {
 517                    switch (self.input[self.cursor]) {
 518                        '0'...'9' => continue,
 519                        '.' => {
 520                            self.cursor += 1;
 521                            self.state = .number_post_dot;
 522                            continue :state_loop;
 523                        },
 524                        'e', 'E' => {
 525                            self.cursor += 1;
 526                            self.state = .number_post_e;
 527                            continue :state_loop;
 528                        },
 529                        else => {
 530                            self.state = .post_value;
 531                            return Token{ .number = self.takeValueSlice() };
 532                        },
 533                    }
 534                }
 535                return self.endOfBufferInNumber(true);
 536            },
 537            .number_post_dot => {
 538                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
 539                switch (self.input[self.cursor]) {
 540                    '0'...'9' => {
 541                        self.cursor += 1;
 542                        self.state = .number_frac;
 543                        continue :state_loop;
 544                    },
 545                    else => return error.SyntaxError,
 546                }
 547            },
 548            .number_frac => {
 549                while (self.cursor < self.input.len) : (self.cursor += 1) {
 550                    switch (self.input[self.cursor]) {
 551                        '0'...'9' => continue,
 552                        'e', 'E' => {
 553                            self.cursor += 1;
 554                            self.state = .number_post_e;
 555                            continue :state_loop;
 556                        },
 557                        else => {
 558                            self.state = .post_value;
 559                            return Token{ .number = self.takeValueSlice() };
 560                        },
 561                    }
 562                }
 563                return self.endOfBufferInNumber(true);
 564            },
 565            .number_post_e => {
 566                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
 567                switch (self.input[self.cursor]) {
 568                    '0'...'9' => {
 569                        self.cursor += 1;
 570                        self.state = .number_exp;
 571                        continue :state_loop;
 572                    },
 573                    '+', '-' => {
 574                        self.cursor += 1;
 575                        self.state = .number_post_e_sign;
 576                        continue :state_loop;
 577                    },
 578                    else => return error.SyntaxError,
 579                }
 580            },
 581            .number_post_e_sign => {
 582                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
 583                switch (self.input[self.cursor]) {
 584                    '0'...'9' => {
 585                        self.cursor += 1;
 586                        self.state = .number_exp;
 587                        continue :state_loop;
 588                    },
 589                    else => return error.SyntaxError,
 590                }
 591            },
 592            .number_exp => {
 593                while (self.cursor < self.input.len) : (self.cursor += 1) {
 594                    switch (self.input[self.cursor]) {
 595                        '0'...'9' => continue,
 596                        else => {
 597                            self.state = .post_value;
 598                            return Token{ .number = self.takeValueSlice() };
 599                        },
 600                    }
 601                }
 602                return self.endOfBufferInNumber(true);
 603            },
 604
 605            .string => {
 606                while (self.cursor < self.input.len) : (self.cursor += 1) {
 607                    switch (self.input[self.cursor]) {
 608                        0...0x1f => return error.SyntaxError, // Bare ASCII control code in string.
 609
 610                        // ASCII plain text.
 611                        0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,
 612
 613                        // Special characters.
 614                        '"' => {
 615                            const result = Token{ .string = self.takeValueSlice() };
 616                            self.cursor += 1;
 617                            self.state = .post_value;
 618                            return result;
 619                        },
 620                        '\\' => {
 621                            const slice = self.takeValueSlice();
 622                            self.cursor += 1;
 623                            self.state = .string_backslash;
 624                            if (slice.len > 0) return Token{ .partial_string = slice };
 625                            continue :state_loop;
 626                        },
 627
 628                        // UTF-8 validation.
 629                        // See http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
 630                        0xC2...0xDF => {
 631                            self.cursor += 1;
 632                            self.state = .string_utf8_last_byte;
 633                            continue :state_loop;
 634                        },
 635                        0xE0 => {
 636                            self.cursor += 1;
 637                            self.state = .string_utf8_second_to_last_byte_guard_against_overlong;
 638                            continue :state_loop;
 639                        },
 640                        0xE1...0xEC, 0xEE...0xEF => {
 641                            self.cursor += 1;
 642                            self.state = .string_utf8_second_to_last_byte;
 643                            continue :state_loop;
 644                        },
 645                        0xED => {
 646                            self.cursor += 1;
 647                            self.state = .string_utf8_second_to_last_byte_guard_against_surrogate_half;
 648                            continue :state_loop;
 649                        },
 650                        0xF0 => {
 651                            self.cursor += 1;
 652                            self.state = .string_utf8_third_to_last_byte_guard_against_overlong;
 653                            continue :state_loop;
 654                        },
 655                        0xF1...0xF3 => {
 656                            self.cursor += 1;
 657                            self.state = .string_utf8_third_to_last_byte;
 658                            continue :state_loop;
 659                        },
 660                        0xF4 => {
 661                            self.cursor += 1;
 662                            self.state = .string_utf8_third_to_last_byte_guard_against_too_large;
 663                            continue :state_loop;
 664                        },
 665                        0x80...0xC1, 0xF5...0xFF => return error.SyntaxError, // Invalid UTF-8.
 666                    }
 667                }
 668                if (self.is_end_of_input) return error.UnexpectedEndOfInput;
 669                const slice = self.takeValueSlice();
 670                if (slice.len > 0) return Token{ .partial_string = slice };
 671                return error.BufferUnderrun;
 672            },
 673            .string_backslash => {
 674                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 675                switch (self.input[self.cursor]) {
 676                    '"', '\\', '/' => {
 677                        // Since these characters now represent themselves literally,
 678                        // we can simply begin the next plaintext slice here.
 679                        self.value_start = self.cursor;
 680                        self.cursor += 1;
 681                        self.state = .string;
 682                        continue :state_loop;
 683                    },
 684                    'b' => {
 685                        self.cursor += 1;
 686                        self.value_start = self.cursor;
 687                        self.state = .string;
 688                        return Token{ .partial_string_escaped_1 = [_]u8{0x08} };
 689                    },
 690                    'f' => {
 691                        self.cursor += 1;
 692                        self.value_start = self.cursor;
 693                        self.state = .string;
 694                        return Token{ .partial_string_escaped_1 = [_]u8{0x0c} };
 695                    },
 696                    'n' => {
 697                        self.cursor += 1;
 698                        self.value_start = self.cursor;
 699                        self.state = .string;
 700                        return Token{ .partial_string_escaped_1 = [_]u8{'\n'} };
 701                    },
 702                    'r' => {
 703                        self.cursor += 1;
 704                        self.value_start = self.cursor;
 705                        self.state = .string;
 706                        return Token{ .partial_string_escaped_1 = [_]u8{'\r'} };
 707                    },
 708                    't' => {
 709                        self.cursor += 1;
 710                        self.value_start = self.cursor;
 711                        self.state = .string;
 712                        return Token{ .partial_string_escaped_1 = [_]u8{'\t'} };
 713                    },
 714                    'u' => {
 715                        self.cursor += 1;
 716                        self.state = .string_backslash_u;
 717                        continue :state_loop;
 718                    },
 719                    else => return error.SyntaxError,
 720                }
 721            },
 722            .string_backslash_u => {
 723                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 724                const c = self.input[self.cursor];
 725                switch (c) {
 726                    '0'...'9' => {
 727                        self.utf16_code_units[0] = @as(u16, c - '0') << 12;
 728                    },
 729                    'A'...'F' => {
 730                        self.utf16_code_units[0] = @as(u16, c - 'A' + 10) << 12;
 731                    },
 732                    'a'...'f' => {
 733                        self.utf16_code_units[0] = @as(u16, c - 'a' + 10) << 12;
 734                    },
 735                    else => return error.SyntaxError,
 736                }
 737                self.cursor += 1;
 738                self.state = .string_backslash_u_1;
 739                continue :state_loop;
 740            },
 741            .string_backslash_u_1 => {
 742                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 743                const c = self.input[self.cursor];
 744                switch (c) {
 745                    '0'...'9' => {
 746                        self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
 747                    },
 748                    'A'...'F' => {
 749                        self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 8;
 750                    },
 751                    'a'...'f' => {
 752                        self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 8;
 753                    },
 754                    else => return error.SyntaxError,
 755                }
 756                self.cursor += 1;
 757                self.state = .string_backslash_u_2;
 758                continue :state_loop;
 759            },
 760            .string_backslash_u_2 => {
 761                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 762                const c = self.input[self.cursor];
 763                switch (c) {
 764                    '0'...'9' => {
 765                        self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
 766                    },
 767                    'A'...'F' => {
 768                        self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 4;
 769                    },
 770                    'a'...'f' => {
 771                        self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 4;
 772                    },
 773                    else => return error.SyntaxError,
 774                }
 775                self.cursor += 1;
 776                self.state = .string_backslash_u_3;
 777                continue :state_loop;
 778            },
 779            .string_backslash_u_3 => {
 780                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 781                const c = self.input[self.cursor];
 782                switch (c) {
 783                    '0'...'9' => {
 784                        self.utf16_code_units[0] |= c - '0';
 785                    },
 786                    'A'...'F' => {
 787                        self.utf16_code_units[0] |= c - 'A' + 10;
 788                    },
 789                    'a'...'f' => {
 790                        self.utf16_code_units[0] |= c - 'a' + 10;
 791                    },
 792                    else => return error.SyntaxError,
 793                }
 794                self.cursor += 1;
 795                if (std.unicode.utf16IsHighSurrogate(self.utf16_code_units[0])) {
 796                    self.state = .string_surrogate_half;
 797                    continue :state_loop;
 798                } else if (std.unicode.utf16IsLowSurrogate(self.utf16_code_units[0])) {
 799                    return error.SyntaxError; // Unexpected low surrogate half.
 800                } else {
 801                    self.value_start = self.cursor;
 802                    self.state = .string;
 803                    return partialStringCodepoint(self.utf16_code_units[0]);
 804                }
 805            },
 806            .string_surrogate_half => {
 807                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 808                switch (self.input[self.cursor]) {
 809                    '\\' => {
 810                        self.cursor += 1;
 811                        self.state = .string_surrogate_half_backslash;
 812                        continue :state_loop;
 813                    },
 814                    else => return error.SyntaxError, // Expected low surrogate half.
 815                }
 816            },
 817            .string_surrogate_half_backslash => {
 818                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 819                switch (self.input[self.cursor]) {
 820                    'u' => {
 821                        self.cursor += 1;
 822                        self.state = .string_surrogate_half_backslash_u;
 823                        continue :state_loop;
 824                    },
 825                    else => return error.SyntaxError, // Expected low surrogate half.
 826                }
 827            },
 828            .string_surrogate_half_backslash_u => {
 829                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 830                switch (self.input[self.cursor]) {
 831                    'D', 'd' => {
 832                        self.cursor += 1;
 833                        self.utf16_code_units[1] = 0xD << 12;
 834                        self.state = .string_surrogate_half_backslash_u_1;
 835                        continue :state_loop;
 836                    },
 837                    else => return error.SyntaxError, // Expected low surrogate half.
 838                }
 839            },
 840            .string_surrogate_half_backslash_u_1 => {
 841                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 842                const c = self.input[self.cursor];
 843                switch (c) {
 844                    'C'...'F' => {
 845                        self.cursor += 1;
 846                        self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 8;
 847                        self.state = .string_surrogate_half_backslash_u_2;
 848                        continue :state_loop;
 849                    },
 850                    'c'...'f' => {
 851                        self.cursor += 1;
 852                        self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 8;
 853                        self.state = .string_surrogate_half_backslash_u_2;
 854                        continue :state_loop;
 855                    },
 856                    else => return error.SyntaxError, // Expected low surrogate half.
 857                }
 858            },
 859            .string_surrogate_half_backslash_u_2 => {
 860                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 861                const c = self.input[self.cursor];
 862                switch (c) {
 863                    '0'...'9' => {
 864                        self.cursor += 1;
 865                        self.utf16_code_units[1] |= @as(u16, c - '0') << 4;
 866                        self.state = .string_surrogate_half_backslash_u_3;
 867                        continue :state_loop;
 868                    },
 869                    'A'...'F' => {
 870                        self.cursor += 1;
 871                        self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 4;
 872                        self.state = .string_surrogate_half_backslash_u_3;
 873                        continue :state_loop;
 874                    },
 875                    'a'...'f' => {
 876                        self.cursor += 1;
 877                        self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 4;
 878                        self.state = .string_surrogate_half_backslash_u_3;
 879                        continue :state_loop;
 880                    },
 881                    else => return error.SyntaxError,
 882                }
 883            },
 884            .string_surrogate_half_backslash_u_3 => {
 885                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 886                const c = self.input[self.cursor];
 887                switch (c) {
 888                    '0'...'9' => {
 889                        self.utf16_code_units[1] |= c - '0';
 890                    },
 891                    'A'...'F' => {
 892                        self.utf16_code_units[1] |= c - 'A' + 10;
 893                    },
 894                    'a'...'f' => {
 895                        self.utf16_code_units[1] |= c - 'a' + 10;
 896                    },
 897                    else => return error.SyntaxError,
 898                }
 899                self.cursor += 1;
 900                self.value_start = self.cursor;
 901                self.state = .string;
 902                const code_point = std.unicode.utf16DecodeSurrogatePair(&self.utf16_code_units) catch unreachable;
 903                return partialStringCodepoint(code_point);
 904            },
 905
 906            .string_utf8_last_byte => {
 907                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 908                switch (self.input[self.cursor]) {
 909                    0x80...0xBF => {
 910                        self.cursor += 1;
 911                        self.state = .string;
 912                        continue :state_loop;
 913                    },
 914                    else => return error.SyntaxError, // Invalid UTF-8.
 915                }
 916            },
 917            .string_utf8_second_to_last_byte => {
 918                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 919                switch (self.input[self.cursor]) {
 920                    0x80...0xBF => {
 921                        self.cursor += 1;
 922                        self.state = .string_utf8_last_byte;
 923                        continue :state_loop;
 924                    },
 925                    else => return error.SyntaxError, // Invalid UTF-8.
 926                }
 927            },
 928            .string_utf8_second_to_last_byte_guard_against_overlong => {
 929                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 930                switch (self.input[self.cursor]) {
 931                    0xA0...0xBF => {
 932                        self.cursor += 1;
 933                        self.state = .string_utf8_last_byte;
 934                        continue :state_loop;
 935                    },
 936                    else => return error.SyntaxError, // Invalid UTF-8.
 937                }
 938            },
 939            .string_utf8_second_to_last_byte_guard_against_surrogate_half => {
 940                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 941                switch (self.input[self.cursor]) {
 942                    0x80...0x9F => {
 943                        self.cursor += 1;
 944                        self.state = .string_utf8_last_byte;
 945                        continue :state_loop;
 946                    },
 947                    else => return error.SyntaxError, // Invalid UTF-8.
 948                }
 949            },
 950            .string_utf8_third_to_last_byte => {
 951                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 952                switch (self.input[self.cursor]) {
 953                    0x80...0xBF => {
 954                        self.cursor += 1;
 955                        self.state = .string_utf8_second_to_last_byte;
 956                        continue :state_loop;
 957                    },
 958                    else => return error.SyntaxError, // Invalid UTF-8.
 959                }
 960            },
 961            .string_utf8_third_to_last_byte_guard_against_overlong => {
 962                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 963                switch (self.input[self.cursor]) {
 964                    0x90...0xBF => {
 965                        self.cursor += 1;
 966                        self.state = .string_utf8_second_to_last_byte;
 967                        continue :state_loop;
 968                    },
 969                    else => return error.SyntaxError, // Invalid UTF-8.
 970                }
 971            },
 972            .string_utf8_third_to_last_byte_guard_against_too_large => {
 973                if (self.cursor >= self.input.len) return self.endOfBufferInString();
 974                switch (self.input[self.cursor]) {
 975                    0x80...0x8F => {
 976                        self.cursor += 1;
 977                        self.state = .string_utf8_second_to_last_byte;
 978                        continue :state_loop;
 979                    },
 980                    else => return error.SyntaxError, // Invalid UTF-8.
 981                }
 982            },
 983
 984            .literal_t => {
 985                switch (try self.expectByte()) {
 986                    'r' => {
 987                        self.cursor += 1;
 988                        self.state = .literal_tr;
 989                        continue :state_loop;
 990                    },
 991                    else => return error.SyntaxError,
 992                }
 993            },
 994            .literal_tr => {
 995                switch (try self.expectByte()) {
 996                    'u' => {
 997                        self.cursor += 1;
 998                        self.state = .literal_tru;
 999                        continue :state_loop;
1000                    },
1001                    else => return error.SyntaxError,
1002                }
1003            },
1004            .literal_tru => {
1005                switch (try self.expectByte()) {
1006                    'e' => {
1007                        self.cursor += 1;
1008                        self.state = .post_value;
1009                        return .true;
1010                    },
1011                    else => return error.SyntaxError,
1012                }
1013            },
1014            .literal_f => {
1015                switch (try self.expectByte()) {
1016                    'a' => {
1017                        self.cursor += 1;
1018                        self.state = .literal_fa;
1019                        continue :state_loop;
1020                    },
1021                    else => return error.SyntaxError,
1022                }
1023            },
1024            .literal_fa => {
1025                switch (try self.expectByte()) {
1026                    'l' => {
1027                        self.cursor += 1;
1028                        self.state = .literal_fal;
1029                        continue :state_loop;
1030                    },
1031                    else => return error.SyntaxError,
1032                }
1033            },
1034            .literal_fal => {
1035                switch (try self.expectByte()) {
1036                    's' => {
1037                        self.cursor += 1;
1038                        self.state = .literal_fals;
1039                        continue :state_loop;
1040                    },
1041                    else => return error.SyntaxError,
1042                }
1043            },
1044            .literal_fals => {
1045                switch (try self.expectByte()) {
1046                    'e' => {
1047                        self.cursor += 1;
1048                        self.state = .post_value;
1049                        return .false;
1050                    },
1051                    else => return error.SyntaxError,
1052                }
1053            },
1054            .literal_n => {
1055                switch (try self.expectByte()) {
1056                    'u' => {
1057                        self.cursor += 1;
1058                        self.state = .literal_nu;
1059                        continue :state_loop;
1060                    },
1061                    else => return error.SyntaxError,
1062                }
1063            },
1064            .literal_nu => {
1065                switch (try self.expectByte()) {
1066                    'l' => {
1067                        self.cursor += 1;
1068                        self.state = .literal_nul;
1069                        continue :state_loop;
1070                    },
1071                    else => return error.SyntaxError,
1072                }
1073            },
1074            .literal_nul => {
1075                switch (try self.expectByte()) {
1076                    'l' => {
1077                        self.cursor += 1;
1078                        self.state = .post_value;
1079                        return .null;
1080                    },
1081                    else => return error.SyntaxError,
1082                }
1083            },
1084        }
1085        unreachable;
1086    }
1087}
1088
/// Reports which kind of token the next `next*()` call will produce, without consuming that token.
/// Scans forward until the first byte of the upcoming token (or the end of the input) settles the answer.
/// This function is idempotent: the only input it moves past is inter-token whitespace, commas, and colons.
pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
    state_loop: while (true) {
        switch (self.state) {
            // The first byte of a value is enough to tell what kind of value it is.
            .value => return switch (try self.skipWhitespaceExpectByte()) {
                '{' => .object_begin,
                '[' => .array_begin,
                '"' => .string,
                '-', '0'...'9' => .number,
                't' => .true,
                'f' => .false,
                'n' => .null,
                else => return error.SyntaxError,
            },

            .post_value => {
                if (try self.skipWhitespaceCheckEnd()) return .end_of_document;

                const byte = self.input[self.cursor];
                if (self.string_is_object_key) {
                    // The flag is cleared before the byte is inspected, so it stays cleared
                    // even when the byte turns out not to be a colon.
                    self.string_is_object_key = false;
                    if (byte != ':') return error.SyntaxError;
                    self.cursor += 1;
                    self.state = .value;
                    continue :state_loop;
                }

                switch (byte) {
                    '}' => return .object_end,
                    ']' => return .array_end,
                    ',' => {
                        // What may follow the comma depends on the innermost container.
                        self.state = switch (self.stack.peek()) {
                            OBJECT_MODE => .object_post_comma,
                            ARRAY_MODE => .value,
                        };
                        self.cursor += 1;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },

            .object_start => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                '}' => .object_end,
                else => return error.SyntaxError,
            },
            .object_post_comma => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                else => return error.SyntaxError,
            },

            .array_start => {
                const byte = try self.skipWhitespaceExpectByte();
                if (byte == ']') return .array_end;
                // Anything else must be the first element; let the `.value` arm classify it.
                self.state = .value;
                continue :state_loop;
            },

            // Already partway through a number.
            .number_minus,
            .number_leading_zero,
            .number_int,
            .number_post_dot,
            .number_frac,
            .number_post_e,
            .number_post_e_sign,
            .number_exp,
            => return .number,

            // Already partway through a string, whether in plain text,
            // in an escape sequence, or in a multi-byte UTF-8 sequence.
            .string,
            .string_backslash,
            .string_backslash_u,
            .string_backslash_u_1,
            .string_backslash_u_2,
            .string_backslash_u_3,
            .string_surrogate_half,
            .string_surrogate_half_backslash,
            .string_surrogate_half_backslash_u,
            .string_surrogate_half_backslash_u_1,
            .string_surrogate_half_backslash_u_2,
            .string_surrogate_half_backslash_u_3,
            .string_utf8_last_byte,
            .string_utf8_second_to_last_byte,
            .string_utf8_second_to_last_byte_guard_against_overlong,
            .string_utf8_second_to_last_byte_guard_against_surrogate_half,
            .string_utf8_third_to_last_byte,
            .string_utf8_third_to_last_byte_guard_against_overlong,
            .string_utf8_third_to_last_byte_guard_against_too_large,
            => return .string,

            // Already partway through one of the three literals.
            .literal_t, .literal_tr, .literal_tru => return .true,
            .literal_f, .literal_fa, .literal_fal, .literal_fals => return .false,
            .literal_n, .literal_nu, .literal_nul => return .null,
        }
        unreachable;
    }
}
1217
/// The states of the scanner's state machine.
const State = enum {
    // Expecting the start of a value / expecting whatever may follow a finished value.
    value,
    post_value,

    // Just inside `{` (a key or `}` may follow) / just after a comma in an object (only a key may follow).
    object_start,
    object_post_comma,

    // Just inside `[` (a value or `]` may follow).
    array_start,

    // Partway through a number.
    number_minus,
    number_leading_zero,
    number_int,
    number_post_dot,
    number_frac,
    number_post_e,
    number_post_e_sign,
    number_exp,

    // Partway through a string: plain content, a backslash escape,
    // or the `\uXXXX` that must follow a high surrogate half.
    string,
    string_backslash,
    string_backslash_u,
    string_backslash_u_1,
    string_backslash_u_2,
    string_backslash_u_3,
    string_surrogate_half,
    string_surrogate_half_backslash,
    string_surrogate_half_backslash_u,
    string_surrogate_half_backslash_u_1,
    string_surrogate_half_backslash_u_2,
    string_surrogate_half_backslash_u_3,

    // Partway through a multi-byte UTF-8 sequence inside a string.
    // The state letters are those used in
    // http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
    string_utf8_last_byte, // State A
    string_utf8_second_to_last_byte, // State B
    string_utf8_second_to_last_byte_guard_against_overlong, // State C
    string_utf8_second_to_last_byte_guard_against_surrogate_half, // State D
    string_utf8_third_to_last_byte, // State E
    string_utf8_third_to_last_byte_guard_against_overlong, // State F
    string_utf8_third_to_last_byte_guard_against_too_large, // State G

    // Partway through `true`, `false`, or `null`;
    // each name spells the prefix matched so far.
    literal_t,
    literal_tr,
    literal_tru,
    literal_f,
    literal_fa,
    literal_fal,
    literal_fals,
    literal_n,
    literal_nu,
    literal_nul,
};
1269
/// Returns the byte under the cursor without consuming it.
/// When the cursor is at or past the end of the current buffer, fails with
/// `error.UnexpectedEndOfInput` if the input has been marked as ended,
/// and with `error.BufferUnderrun` otherwise.
fn expectByte(self: *const @This()) !u8 {
    if (self.cursor >= self.input.len) {
        // Nothing left in this buffer.
        if (!self.is_end_of_input) return error.BufferUnderrun;
        return error.UnexpectedEndOfInput;
    }
    return self.input[self.cursor];
}
1278
/// Advances the cursor past any run of spaces, tabs, carriage returns, and newlines,
/// stopping at the first other byte or at the end of the current buffer.
/// When diagnostics are attached, every newline increments the line number
/// and records the newline's cursor position.
fn skipWhitespace(self: *@This()) void {
    while (self.cursor < self.input.len) {
        switch (self.input[self.cursor]) {
            ' ', '\t', '\r' => {},
            '\n' => if (self.diagnostics) |diag| {
                diag.line_number += 1;
                // Record the position of the newline itself rather than the byte after it,
                // so that a plain subtraction from a later cursor gives a 1-based column.
                diag.line_start_cursor = self.cursor;
            },
            else => return,
        }
        self.cursor += 1;
    }
}
1297
/// Skips inter-token whitespace, then returns the byte under the cursor without consuming it.
/// Fails exactly as `expectByte` does when no byte is available.
fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    skipWhitespace(self);
    return expectByte(self);
}
1302
/// Skips inter-token whitespace and reports whether the document has ended.
/// Returns `true` when the input is fully consumed and the stack height is zero.
/// Returns `false` when another byte is available and the stack height is non-zero.
/// Errors:
///  * `error.SyntaxError`: another byte is available but the stack height is zero.
///  * `error.UnexpectedEndOfInput`: the input has ended while the stack height is non-zero.
///  * `error.BufferUnderrun`: the current buffer is exhausted but the input has not been ended.
fn skipWhitespaceCheckEnd(self: *@This()) !bool {
    self.skipWhitespace();

    if (self.cursor < self.input.len) {
        // There is another byte to look at.
        if (self.stackHeight() == 0) return error.SyntaxError;
        return false;
    }

    // The current buffer is exhausted.
    if (!self.is_end_of_input) return error.BufferUnderrun;

    // The whole input is exhausted.
    if (self.stackHeight() != 0) return error.UnexpectedEndOfInput;
    return true;
}
1320
/// Returns the input bytes from `value_start` up to (not including) the cursor,
/// and moves `value_start` to the cursor so the same bytes are not handed out again.
fn takeValueSlice(self: *@This()) []const u8 {
    const begin = self.value_start;
    const end = self.cursor;
    self.value_start = end;
    return self.input[begin..end];
}
/// Like `takeValueSlice`, but leaves off the last `trailing_negative_offset` bytes before the cursor.
/// Returns an empty slice, and leaves `value_start` untouched, when nothing would remain
/// (for example when the escape sequence began before the current input buffer).
fn takeValueSliceMinusTrailingOffset(self: *@This(), trailing_negative_offset: usize) []const u8 {
    const begin = self.value_start;
    // Phrased as an addition on the left-hand side so that the unsigned
    // subtraction below can never underflow.
    if (begin + trailing_negative_offset >= self.cursor) return "";
    const end = self.cursor - trailing_negative_offset;
    // With a non-zero offset this assignment is not relied upon: `value_start` is
    // set again when the matching `.partial_string_escaped_*` token is emitted.
    self.value_start = self.cursor;
    return self.input[begin..end];
}
1337
/// Handles running out of buffer while scanning a number.
/// `allow_end` says whether the number may legally stop at this point.
///  * Input ended: returns a `.number` token with the pending bytes and moves to `.post_value`,
///    or fails with `error.UnexpectedEndOfInput` when `allow_end` is false.
///  * Input not ended: returns a `.partial_number` token with the pending bytes,
///    or fails with `error.BufferUnderrun` when there are none.
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
    // Taken unconditionally: this also advances `value_start` to the cursor.
    const pending = self.takeValueSlice();

    if (!self.is_end_of_input) {
        if (pending.len > 0) return Token{ .partial_number = pending };
        return error.BufferUnderrun;
    }

    if (!allow_end) return error.UnexpectedEndOfInput;
    self.state = .post_value;
    return Token{ .number = pending };
}
1348
/// Handles running out of buffer while scanning a string.
/// Fails with `error.UnexpectedEndOfInput` when the input has ended.
/// Otherwise returns the pending plain-text bytes as a `.partial_string` token,
/// or fails with `error.BufferUnderrun` when there are none.
/// Must only be called from one of the string states.
fn endOfBufferInString(self: *@This()) !Token {
    if (self.is_end_of_input) return error.UnexpectedEndOfInput;

    // How many bytes just before the cursor belong to an unfinished escape sequence.
    // Those bytes are kept out of the partial string.
    const escape_bytes: usize = switch (self.state) {
        .string_backslash => 1,
        .string_backslash_u => 2,
        .string_backslash_u_1 => 3,
        .string_backslash_u_2 => 4,
        .string_backslash_u_3 => 5,
        .string_surrogate_half => 6,
        .string_surrogate_half_backslash => 7,
        .string_surrogate_half_backslash_u => 8,
        .string_surrogate_half_backslash_u_1 => 9,
        .string_surrogate_half_backslash_u_2 => 10,
        .string_surrogate_half_backslash_u_3 => 11,

        // Outside an escape sequence everything up to the cursor is included.
        .string,
        .string_utf8_last_byte,
        .string_utf8_second_to_last_byte,
        .string_utf8_second_to_last_byte_guard_against_overlong,
        .string_utf8_second_to_last_byte_guard_against_surrogate_half,
        .string_utf8_third_to_last_byte,
        .string_utf8_third_to_last_byte_guard_against_overlong,
        .string_utf8_third_to_last_byte_guard_against_too_large,
        => 0,

        else => unreachable,
    };

    const pending = self.takeValueSliceMinusTrailingOffset(escape_bytes);
    if (pending.len > 0) return Token{ .partial_string = pending };
    return error.BufferUnderrun;
}
1381
/// Encodes `code_point` as UTF-8 and wraps the bytes in the `.partial_string_escaped_N`
/// token whose `N` equals the encoded length (1 to 4).
/// `code_point` must be encodable by `std.unicode.utf8Encode`; a failure there is treated as unreachable.
fn partialStringCodepoint(code_point: u21) Token {
    var utf8: [4]u8 = undefined;
    const len = std.unicode.utf8Encode(code_point, &utf8) catch unreachable;
    return switch (len) {
        1 => Token{ .partial_string_escaped_1 = utf8[0..1].* },
        2 => Token{ .partial_string_escaped_2 = utf8[0..2].* },
        3 => Token{ .partial_string_escaped_3 = utf8[0..3].* },
        4 => Token{ .partial_string_escaped_4 = utf8[0..4].* },
        else => unreachable,
    };
}
1392
/// Checks whether `s` is a well-formed JSON document by scanning it from start to finish.
/// Returns `true` once the scanner reaches the end of the document, and `false`
/// if scanning fails with `SyntaxError` or `UnexpectedEndOfInput`.
/// Allocator errors are returned as-is; they are unlikely,
/// but can be caused by extreme nesting depth in the input.
pub fn validate(allocator: Allocator, s: []const u8) Allocator.Error!bool {
    var scanner = Scanner.initCompleteInput(allocator, s);
    defer scanner.deinit();

    while (true) {
        const token = scanner.next() catch |err| switch (err) {
            error.OutOfMemory => return error.OutOfMemory,
            error.SyntaxError, error.UnexpectedEndOfInput => return false,
            error.BufferUnderrun => unreachable,
        };
        switch (token) {
            .end_of_document => return true,
            else => {},
        }
    }
}
1412
/// Parsing errors come in two flavors:
///  * `SyntaxError`: the document is clearly malformed,
///    for example the input isn't JSON at all.
///  * `UnexpectedEndOfInput`: everything has been valid so far,
///    but the input appears to be truncated for some reason.
/// A completely empty (or whitespace-only) input gives `UnexpectedEndOfInput`.
pub const Error = error{
    SyntaxError,
    UnexpectedEndOfInput,
};

/// Used by `json.reader`. Equal to `0x1000` (4096).
pub const default_buffer_size = 4096;
1423
1424/// The tokens emitted by `std.json.Scanner` and `std.json.Reader` `.next*()` functions follow this grammar:
1425/// ```
1426///  <document> = <value> .end_of_document
1427///  <value> =
1428///    | <object>
1429///    | <array>
1430///    | <number>
1431///    | <string>
1432///    | .true
1433///    | .false
1434///    | .null
1435///  <object> = .object_begin ( <string> <value> )* .object_end
1436///  <array> = .array_begin ( <value> )* .array_end
1437///  <number> = <It depends. See below.>
1438///  <string> = <It depends. See below.>
1439/// ```
1440///
1441/// What you get for `<number>` and `<string>` values depends on which `next*()` method you call:
1442///
1443/// ```
1444/// next():
1445///  <number> = ( .partial_number )* .number
1446///  <string> = ( <partial_string> )* .string
1447///  <partial_string> =
1448///    | .partial_string
1449///    | .partial_string_escaped_1
1450///    | .partial_string_escaped_2
1451///    | .partial_string_escaped_3
1452///    | .partial_string_escaped_4
1453///
1454/// nextAlloc*(..., .alloc_always):
1455///  <number> = .allocated_number
1456///  <string> = .allocated_string
1457///
1458/// nextAlloc*(..., .alloc_if_needed):
1459///  <number> =
1460///    | .number
1461///    | .allocated_number
1462///  <string> =
1463///    | .string
1464///    | .allocated_string
1465/// ```
1466///
1467/// For all tokens with a `[]const u8`, `[]u8`, or `[n]u8` payload, the payload represents the content of the value.
1468/// For number values, this is the representation of the number exactly as it appears in the input.
1469/// For strings, this is the content of the string after resolving escape sequences.
1470///
1471/// For `.allocated_number` and `.allocated_string`, the `[]u8` payloads are allocations made with the given allocator.
1472/// You are responsible for managing that memory. `json.Reader.deinit()` does *not* free those allocations.
1473///
1474/// The `.partial_*` tokens indicate that a value spans multiple input buffers or that a string contains escape sequences.
1475/// To get a complete value in memory, you need to concatenate the values yourself.
1476/// Calling `nextAlloc*()` does this for you, and returns an `.allocated_*` token with the result.
1477///
1478/// For tokens with a `[]const u8` payload, the payload is a slice into the current input buffer.
1479/// The memory may become undefined during the next call to `json.Scanner.feedInput()`
1480/// or any `json.Reader` method whose return error set includes `json.Error`.
/// To keep the value persistently, it is recommended to make a copy or to use `.alloc_always`,
1482/// which makes a copy for you.
1483///
1484/// Note that `.number` and `.string` tokens that follow `.partial_*` tokens may have `0` length to indicate that
1485/// the previously partial value is completed with no additional bytes.
1486/// (This can happen when the break between input buffers happens to land on the exact end of a value. E.g. `"[1234"`, `"]"`.)
1487/// `.partial_*` tokens never have `0` length.
1488///
1489/// The recommended strategy for using the different `next*()` methods is something like this:
1490///
1491/// When you're expecting an object key, use `.alloc_if_needed`.
1492/// You often don't need a copy of the key string to persist; you might just check which field it is.
1493/// In the case that the key happens to require an allocation, free it immediately after checking it.
1494///
1495/// When you're expecting a meaningful string value (such as on the right of a `:`),
1496/// use `.alloc_always` in order to keep the value valid throughout parsing the rest of the document.
1497///
1498/// When you're expecting a number value, use `.alloc_if_needed`.
1499/// You're probably going to be parsing the string representation of the number into a numeric representation,
1500/// so you need the complete string representation only temporarily.
1501///
1502/// When you're skipping an unrecognized value, use `skipValue()`.
pub const Token = union(enum) {
    // Container delimiters; these carry no payload.
    object_begin,
    object_end,
    array_begin,
    array_end,

    // Literal values; these carry no payload.
    true,
    false,
    null,

    // Number tokens. The payload is the number's text exactly as it appears in the input.
    // `allocated_number` is an allocation that the caller is responsible for.
    number: []const u8,
    partial_number: []const u8,
    allocated_number: []u8,

    // String tokens. The payload is string content after resolving escape sequences.
    // The `partial_string_escaped_N` variants hold `N` bytes inline,
    // e.g. as filled by `partialStringCodepoint()` with the UTF-8 encoding of one code point.
    // `allocated_string` is an allocation that the caller is responsible for.
    string: []const u8,
    partial_string: []const u8,
    partial_string_escaped_1: [1]u8,
    partial_string_escaped_2: [2]u8,
    partial_string_escaped_3: [3]u8,
    partial_string_escaped_4: [4]u8,
    allocated_string: []u8,

    // Terminates the token stream: `<document> = <value> .end_of_document`.
    end_of_document,
};
1527
/// This is only used in `peekNextTokenType()` and gives a categorization based on the first byte of the next token that will be emitted from a `next*()` call.
pub const TokenType = enum {
    object_begin,
    object_end,
    array_begin,
    array_end,
    true,
    false,
    null,
    // Unlike `Token`, there are no `partial_*` or `allocated_*` variants here.
    // `Reader.nextAllocMax()` treats these two as the only types that may need an allocation.
    number,
    string,
    end_of_document,
};
1541
/// Position tracking for a `std.json.Reader` or `std.json.Scanner`.
/// To enable it, declare `var diagnostics = Diagnostics{};` and call `source.enableDiagnostics(&diagnostics);`
/// on a `source` that has just been initialized.
/// At any time, notably just after an error, `getLine()`, `getColumn()`, and `getByteOffset()`
/// give meaningful information.
pub const Diagnostics = struct {
    line_number: u64 = 1,
    /// Starts at the all-ones `usize` (the bit pattern of `-1`), i.e. just "before" the input buffer,
    /// so that the wrapping subtraction in `getColumn()` gives a 1-based column for line 1.
    line_start_cursor: usize = std.math.maxInt(usize),
    total_bytes_before_current_input: u64 = 0,
    cursor_pointer: *const usize = undefined,

    /// Starts at 1.
    pub fn getLine(self: *const Diagnostics) u64 {
        return self.line_number;
    }

    /// Starts at 1.
    pub fn getColumn(self: *const Diagnostics) u64 {
        const cursor = self.cursor_pointer.*;
        // Wrapping subtraction: `line_start_cursor` may still hold its all-ones initial value.
        return cursor -% self.line_start_cursor;
    }

    /// Starts at 0. Measures the byte offset since the start of the input.
    pub fn getByteOffset(self: *const Diagnostics) u64 {
        const cursor = self.cursor_pointer.*;
        return self.total_bytes_before_current_input + cursor;
    }
};
1565
/// Selects the allocation behavior of the `nextAlloc*()` functions.
/// See the documentation for `std.json.Token`.
pub const AllocWhen = enum {
    alloc_if_needed,
    alloc_always,
};

/// For security, the maximum size allocated to store a single string or number value is limited to 4MiB by default.
/// This limit can be specified by calling `nextAllocMax()` instead of `nextAlloc()`.
pub const default_max_value_len = 4 << 20;
1572
/// Pairs a `std.json.Scanner` with a `std.Io.Reader` that supplies its input.
/// All `next*()` methods here handle `error.BufferUnderrun` from `std.json.Scanner`, and then read from the reader.
pub const Reader = struct {
    scanner: Scanner,
    reader: *std.Io.Reader,

    /// The allocator is only used to track `[]` and `{}` nesting levels.
    pub fn init(allocator: Allocator, io_reader: *std.Io.Reader) Reader {
        const scanner = Scanner.initStreaming(allocator);
        return .{ .scanner = scanner, .reader = io_reader };
    }

    /// Deinitializes the scanner and invalidates `self`.
    pub fn deinit(self: *Reader) void {
        self.scanner.deinit();
        self.* = undefined;
    }

    /// Calls `std.json.Scanner.enableDiagnostics`.
    pub fn enableDiagnostics(self: *Reader, diagnostics: *Diagnostics) void {
        self.scanner.enableDiagnostics(diagnostics);
    }

    pub const NextError = std.Io.Reader.Error || Error || Allocator.Error;
    pub const SkipError = Reader.NextError;
    pub const AllocError = Reader.NextError || error{ValueTooLong};
    pub const PeekError = std.Io.Reader.Error || Error;

    /// Equivalent to `nextAllocMax(allocator, when, default_max_value_len);`
    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    pub fn nextAlloc(self: *Reader, allocator: Allocator, when: AllocWhen) Reader.AllocError!Token {
        return self.nextAllocMax(allocator, when, default_max_value_len);
    }

    /// See also `std.json.Token` for documentation of `nextAlloc*()` function behavior.
    pub fn nextAllocMax(self: *Reader, allocator: Allocator, when: AllocWhen, max_value_len: usize) Reader.AllocError!Token {
        const token_type = try self.peekNextTokenType();
        const is_number = switch (token_type) {
            .number => true,
            .string => false,

            // Simple tokens never alloc.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => return try self.next(),
        };

        var value_list = std.array_list.Managed(u8).init(allocator);
        errdefer value_list.deinit();

        // A non-null result is a slice that did not need to be copied into `value_list`.
        const maybe_slice = try self.allocNextIntoArrayListMax(&value_list, when, max_value_len);
        if (maybe_slice) |slice| {
            if (is_number) return Token{ .number = slice };
            return Token{ .string = slice };
        }

        // Otherwise the value was accumulated in `value_list`; hand ownership to the caller.
        const owned = try value_list.toOwnedSlice();
        if (is_number) return Token{ .allocated_number = owned };
        return Token{ .allocated_string = owned };
    }

    /// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len);`
    pub fn allocNextIntoArrayList(self: *Reader, value_list: *std.array_list.Managed(u8), when: AllocWhen) Reader.AllocError!?[]const u8 {
        return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
    }

    /// Calls `std.json.Scanner.allocNextIntoArrayListMax` and handles `error.BufferUnderrun`.
    pub fn allocNextIntoArrayListMax(self: *Reader, value_list: *std.array_list.Managed(u8), when: AllocWhen, max_value_len: usize) Reader.AllocError!?[]const u8 {
        while (true) {
            const result = self.scanner.allocNextIntoArrayListMax(value_list, when, max_value_len) catch |err| switch (err) {
                error.BufferUnderrun => {
                    try self.refillBuffer();
                    continue;
                },
                else => |e| return e,
            };
            return result;
        }
    }

    /// Like `std.json.Scanner.skipValue`, but handles `error.BufferUnderrun`.
    pub fn skipValue(self: *Reader) Reader.SkipError!void {
        const token_type = try self.peekNextTokenType();
        switch (token_type) {
            .object_begin, .array_begin => {
                const height = self.stackHeight();
                try self.skipUntilStackHeight(height);
            },
            .number, .string => {
                // Consume partial tokens until the token that completes the value.
                while (true) {
                    switch (try self.next()) {
                        .number, .string => return,

                        .partial_number,
                        .partial_string,
                        .partial_string_escaped_1,
                        .partial_string_escaped_2,
                        .partial_string_escaped_3,
                        .partial_string_escaped_4,
                        => {},

                        else => unreachable,
                    }
                }
            },
            .true, .false, .null => {
                _ = try self.next();
            },

            .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.
        }
    }

    /// Like `std.json.Scanner.skipUntilStackHeight()` but handles `error.BufferUnderrun`.
    pub fn skipUntilStackHeight(self: *Reader, terminal_stack_height: usize) Reader.NextError!void {
        while (true) {
            self.scanner.skipUntilStackHeight(terminal_stack_height) catch |err| switch (err) {
                error.BufferUnderrun => {
                    try self.refillBuffer();
                    continue;
                },
                else => |e| return e,
            };
            return;
        }
    }

    /// Calls `std.json.Scanner.stackHeight`.
    pub fn stackHeight(self: *const Reader) usize {
        return self.scanner.stackHeight();
    }

    /// Calls `std.json.Scanner.ensureTotalStackCapacity`.
    pub fn ensureTotalStackCapacity(self: *Reader, height: usize) Allocator.Error!void {
        try self.scanner.ensureTotalStackCapacity(height);
    }

    /// See `std.json.Token` for documentation of this function.
    pub fn next(self: *Reader) Reader.NextError!Token {
        while (true) {
            const token = self.scanner.next() catch |err| switch (err) {
                error.BufferUnderrun => {
                    try self.refillBuffer();
                    continue;
                },
                else => |e| return e,
            };
            return token;
        }
    }

    /// See `std.json.Scanner.peekNextTokenType()`.
    pub fn peekNextTokenType(self: *Reader) Reader.PeekError!TokenType {
        while (true) {
            const token_type = self.scanner.peekNextTokenType() catch |err| switch (err) {
                error.BufferUnderrun => {
                    try self.refillBuffer();
                    continue;
                },
                else => |e| return e,
            };
            return token_type;
        }
    }

    /// Feeds the scanner whatever `peekGreedy(1)` returns from the reader, tossing those bytes from the reader.
    /// When the reader reports `error.EndOfStream`, calls `endInput()` on the scanner instead.
    fn refillBuffer(self: *Reader) std.Io.Reader.Error!void {
        if (self.reader.peekGreedy(1)) |input| {
            self.reader.toss(input.len);
            self.scanner.feedInput(input);
        } else |err| switch (err) {
            error.ReadFailed => return error.ReadFailed,
            error.EndOfStream => return self.scanner.endInput(),
        }
    }
};
1744
// NOTE(review): presumably the two values recorded per nesting level to tell whether the
// enclosing container is a `{}` object or a `[]` array — confirm against their uses earlier in this file.
const OBJECT_MODE = 0;
const ARRAY_MODE = 1;
1747
/// Appends `buf` to `list`, unless the resulting length would exceed `max_value_len`
/// (or overflow `usize`), in which case `error.ValueTooLong` is returned and `list` is left untouched.
/// Errors from `list.appendSlice` are passed through.
fn appendSlice(list: *std.array_list.Managed(u8), buf: []const u8, max_value_len: usize) !void {
    const total_len, const overflowed = @addWithOverflow(list.items.len, buf.len);
    if (overflowed != 0 or total_len > max_value_len) {
        return error.ValueTooLong;
    }
    try list.appendSlice(buf);
}
1753
/// Hints whether integer parsing or float parsing should be used on `value`,
/// the slice you get from a `Token.number` or `Token.allocated_number`.
/// Returns true if the number doesn't contain any fraction or exponent components, and is not `-0`.
/// Note, the numeric value encoded by the value may still be an integer, such as `1.0`.
/// This function will not give meaningful results on non-numeric input.
pub fn isNumberFormattedLikeAnInteger(value: []const u8) bool {
    const is_negative_zero = std.mem.eql(u8, value, "-0");
    if (is_negative_zero) return false;

    for (value) |byte| {
        switch (byte) {
            // A fraction or exponent marker.
            '.', 'e', 'E' => return false,
            else => {},
        }
    }
    return true;
}
1763
test {
    // Reference the sibling test file so that its tests are included when this file is tested.
    _ = @import("./scanner_test.zig");
}