1//! Expects to be run after the C preprocessor and after `removeComments`.
2//! This means that the lexer assumes that:
3//! - Splices ('\' at the end of a line) have been handled/collapsed.
4//! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
5//! - All comments have been removed.
6
7const std = @import("std");
8const ErrorDetails = @import("errors.zig").ErrorDetails;
9const columnWidth = @import("literals.zig").columnWidth;
10const code_pages = @import("code_pages.zig");
11const SupportedCodePage = code_pages.SupportedCodePage;
12const SourceMappings = @import("source_mapping.zig").SourceMappings;
13const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;
14
15const dumpTokensDuringTests = false;
16
17pub const default_max_string_literal_codepoints = 4097;
18
19pub const Token = struct {
20 id: Id,
21 start: usize,
22 end: usize,
23 line_number: usize,
24
25 pub const Id = enum {
26 literal,
27 number,
28 quoted_ascii_string,
29 quoted_wide_string,
30 operator,
31 begin,
32 end,
33 comma,
34 open_paren,
35 close_paren,
        /// This Id is only used for errors; the Lexer will never return one
        /// of these from a `next` call.
38 preprocessor_command,
39 invalid,
40 eof,
41
42 pub fn nameForErrorDisplay(self: Id) []const u8 {
43 return switch (self) {
44 .literal => "<literal>",
45 .number => "<number>",
46 .quoted_ascii_string => "<quoted ascii string>",
47 .quoted_wide_string => "<quoted wide string>",
48 .operator => "<operator>",
49 .begin => "<'{' or BEGIN>",
50 .end => "<'}' or END>",
51 .comma => ",",
52 .open_paren => "(",
53 .close_paren => ")",
54 .preprocessor_command => "<preprocessor command>",
55 .invalid => unreachable,
56 .eof => "<eof>",
57 };
58 }
59 };
60
61 pub fn slice(self: Token, buffer: []const u8) []const u8 {
62 return buffer[self.start..self.end];
63 }
64
65 /// Returns 0-based column
66 pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
67 const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source);
68
69 var i: usize = line_start;
70 var column: usize = 0;
71 while (i < token.start) : (i += 1) {
72 column += columnWidth(column, source[i], tab_columns);
73 }
74 return column;
75 }
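
    // For example, with tab_columns = 8, a token that starts immediately after a
    // single leading tab is at column 8 (the tab advances from column 0 to the next
    // tab stop), while a token after two leading spaces is at column 2. (This assumes
    // `columnWidth` returns the distance to the next tab stop for tabs and 1 for
    // other bytes, which is how the loop above uses it.)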
76
77 // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
78 // (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
79 pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
80 const line_start = line_start: {
81 if (token.start != 0) {
82 // start checking at the byte before the token
83 var index = token.start - 1;
84 while (true) {
85 if (source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
86 if (index != 0) index -= 1 else break;
87 }
88 }
89 break :line_start 0;
90 };
91 return line_start;
92 }
93
94 pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
95 const line_start = line_start: {
96 if (token.start != 0) {
97 // start checking at the byte before the token
98 var index = token.start - 1;
99 while (true) {
100 if (source[index] == '\r' or source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
101 if (index != 0) index -= 1 else break;
102 }
103 }
104 break :line_start 0;
105 };
106 return line_start;
107 }
108
109 pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
110 const line_start = maybe_line_start orelse token.getLineStartForErrorDisplay(source);
111
112 var line_end = line_start;
113 while (line_end < source.len and source[line_end] != '\r' and source[line_end] != '\n') : (line_end += 1) {}
114 return source[line_start..line_end];
115 }
116
117 pub fn isStringLiteral(token: Token) bool {
118 return token.id == .quoted_ascii_string or token.id == .quoted_wide_string;
119 }
120};
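
// A minimal example of extracting the line a token lives on for error display.
test "Token.getLineForErrorDisplay" {
    const source = "line one\r\nline two\nline three";
    // The token spans the "two" in "line two".
    const token = Token{ .id = .literal, .start = 15, .end = 18, .line_number = 2 };
    try std.testing.expectEqual(@as(usize, 10), token.getLineStartForErrorDisplay(source));
    try std.testing.expectEqualStrings("line two", token.getLineForErrorDisplay(source, null));
}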
121
122pub const LineHandler = struct {
123 line_number: usize = 1,
124 buffer: []const u8,
125 last_line_ending_index: ?usize = null,
126
127 /// Like incrementLineNumber but checks that the current char is a line ending first.
128 /// Returns the new line number if it was incremented, null otherwise.
129 pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
130 const c = self.buffer[cur_index];
131 if (c == '\r' or c == '\n') {
132 return self.incrementLineNumber(cur_index);
133 }
134 return null;
135 }
136
137 /// Increments line_number appropriately (handling line ending pairs)
138 /// and returns the new line number if it was incremented, or null otherwise.
139 pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
140 if (self.currentIndexFormsLineEndingPair(cur_index)) {
141 self.last_line_ending_index = null;
142 return null;
143 } else {
144 self.line_number += 1;
145 self.last_line_ending_index = cur_index;
146 return self.line_number;
147 }
148 }
149
    /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r or \n\n).
    /// Expects `cur_index` and `last_line_ending_index` (if non-null) to point at line endings.
    ///
    /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    /// seems to drop all carriage returns during preprocessing and then replace all
    /// remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    /// Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    /// since Clang converts unpaired <CR> into unpaired <LF>.
158 pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
159 if (self.last_line_ending_index == null) return false;
160
        // The last line ending must immediately precede the current index. We know
        // cur_index must be >= 1 since last_line_ending_index is non-null (so if the
        // subtraction below overflows, it is a bug at the callsite of this function).
164 if (self.last_line_ending_index.? != cur_index - 1) return false;
165
166 const cur_line_ending = self.buffer[cur_index];
167 const last_line_ending = self.buffer[self.last_line_ending_index.?];
168
169 // sanity check
170 std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
171 std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');
172
173 // can't be \n\n or \r\r
174 if (last_line_ending == cur_line_ending) return false;
175
176 return true;
177 }
178};
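
// A small sanity check of the line ending pair handling described above:
// "\r\n" and "\n\r" count as a single line ending, while "\n\n" counts as two.
test "LineHandler: line ending pairs" {
    const source = "a\r\nb\n\nc";
    var handler = LineHandler{ .buffer = source };
    var i: usize = 0;
    while (i < source.len) : (i += 1) {
        _ = handler.maybeIncrementLineNumber(i);
    }
    // Lines: "a", "b", "", "c"
    try std.testing.expectEqual(@as(usize, 4), handler.line_number);
}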
179
180pub const LexError = error{
181 UnfinishedStringLiteral,
182 StringLiteralTooLong,
183 InvalidNumberWithExponent,
184 InvalidDigitCharacterInNumberLiteral,
185 IllegalByte,
186 IllegalByteOutsideStringLiterals,
187 IllegalCodepointOutsideStringLiterals,
188 IllegalByteOrderMark,
189 IllegalPrivateUseCharacter,
190 FoundCStyleEscapedQuote,
191 CodePagePragmaMissingLeftParen,
192 CodePagePragmaMissingRightParen,
193 /// Can be caught and ignored
194 CodePagePragmaInvalidCodePage,
195 CodePagePragmaNotInteger,
196 CodePagePragmaOverflow,
197 CodePagePragmaUnsupportedCodePage,
198 /// Can be caught and ignored
199 CodePagePragmaInIncludedFile,
200};
201
202pub const Lexer = struct {
203 const Self = @This();
204
205 buffer: []const u8,
206 index: usize,
207 line_handler: LineHandler,
208 at_start_of_line: bool = true,
209 error_context_token: ?Token = null,
210 current_code_page: SupportedCodePage,
211 default_code_page: SupportedCodePage,
212 source_mappings: ?*SourceMappings,
213 max_string_literal_codepoints: u15,
214 /// Needed to determine whether or not the output code page should
215 /// be set in the parser.
216 seen_pragma_code_pages: u2 = 0,
217 last_pragma_code_page_token: ?Token = null,
218
219 pub const Error = LexError;
220
221 pub const LexerOptions = struct {
222 default_code_page: SupportedCodePage = .windows1252,
223 source_mappings: ?*SourceMappings = null,
224 max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
225 };
226
227 pub fn init(buffer: []const u8, options: LexerOptions) Self {
228 return Self{
229 .buffer = buffer,
230 .index = 0,
231 .current_code_page = options.default_code_page,
232 .default_code_page = options.default_code_page,
233 .source_mappings = options.source_mappings,
234 .max_string_literal_codepoints = options.max_string_literal_codepoints,
235 .line_handler = .{ .buffer = buffer },
236 };
237 }
238
239 pub fn dump(self: *Self, token: *const Token) void {
240 std.debug.print("{s}:{d}: {f}\n", .{
241 @tagName(token.id), token.line_number, std.ascii.hexEscape(token.slice(self.buffer), .lower),
242 });
243 }
244
245 pub const LexMethod = enum {
246 whitespace_delimiter_only,
247 normal,
248 normal_expect_operator,
249 };
250
251 pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
252 switch (method) {
253 .whitespace_delimiter_only => return self.nextWhitespaceDelimeterOnly(),
254 .normal => return self.nextNormal(),
255 .normal_expect_operator => return self.nextNormalWithContext(.expect_operator),
256 }
257 }
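
    // A sketch of how the lexing methods differ, derived from the state machines below:
    // - `.normal` lexes `1-1` as two number tokens (`1` then `-1`).
    // - `.normal_expect_operator` lexes `1-1` as number, operator, number, since a
    //   leading `-` is treated as an operator when an operator is expected.
    // - `.whitespace_delimiter_only` only splits on whitespace/line endings, so
    //   something like `foo(1,2)` is returned as a single literal token.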
258
259 const StateWhitespaceDelimiterOnly = enum {
260 start,
261 literal,
262 preprocessor,
263 semicolon,
264 };
265
266 pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
267 const start_index = self.index;
268 var result = Token{
269 .id = .eof,
270 .start = start_index,
271 .end = undefined,
272 .line_number = self.line_handler.line_number,
273 };
274 var state = StateWhitespaceDelimiterOnly.start;
275
276 while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
277 const c = codepoint.value;
278 try self.checkForIllegalCodepoint(codepoint, false);
279 switch (state) {
280 .start => switch (c) {
281 '\r', '\n' => {
282 result.start = self.index + 1;
283 result.line_number = self.incrementLineNumber();
284 },
285 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
286 result.start = self.index + 1;
287 },
288 // NBSP only counts as whitespace at the start of a line (but
289 // can be intermixed with other whitespace). Who knows why.
290 // TODO: This should either be removed, or it should also include
291 // the codepoints listed in disjoint_code_page.zig
292 '\xA0' => if (self.at_start_of_line) {
293 result.start = self.index + codepoint.byte_len;
294 } else {
295 state = .literal;
296 self.at_start_of_line = false;
297 },
298 '#' => {
299 if (self.at_start_of_line) {
300 state = .preprocessor;
301 } else {
302 state = .literal;
303 }
304 self.at_start_of_line = false;
305 },
306 ';' => {
307 state = .semicolon;
308 self.at_start_of_line = false;
309 },
310 else => {
311 state = .literal;
312 self.at_start_of_line = false;
313 },
314 },
315 .literal => switch (c) {
316 '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
317 result.id = .literal;
318 break;
319 },
320 else => {},
321 },
322 .preprocessor => switch (c) {
323 '\r', '\n' => {
324 try self.evaluatePreprocessorCommand(result.start, self.index);
325 result.start = self.index + 1;
326 state = .start;
327 result.line_number = self.incrementLineNumber();
328 },
329 else => {},
330 },
331 .semicolon => switch (c) {
332 '\r', '\n' => {
333 result.start = self.index + 1;
334 state = .start;
335 result.line_number = self.incrementLineNumber();
336 },
337 else => {},
338 },
339 }
340 } else { // got EOF
341 switch (state) {
342 .start => {},
343 .semicolon => {
344 // Skip past everything up to the EOF
345 result.start = self.index;
346 },
347 .literal => {
348 result.id = .literal;
349 },
350 .preprocessor => {
351 try self.evaluatePreprocessorCommand(result.start, self.index);
352 result.start = self.index;
353 },
354 }
355 }
356
357 result.end = self.index;
358
359 // EOF tokens must have their start index match the end index
360 std.debug.assert(result.id != .eof or result.start == result.end);
361
362 return result;
363 }
364
365 const StateNormal = enum {
366 start,
367 literal_or_quoted_wide_string,
368 quoted_ascii_string,
369 quoted_wide_string,
370 quoted_ascii_string_escape,
371 quoted_wide_string_escape,
372 quoted_ascii_string_maybe_end,
373 quoted_wide_string_maybe_end,
374 literal,
375 number_literal,
376 preprocessor,
377 semicolon,
378 // end
379 e,
380 en,
381 // begin
382 b,
383 be,
384 beg,
385 begi,
386 };
387
388 /// TODO: A not-terrible name
389 pub fn nextNormal(self: *Self) LexError!Token {
390 return self.nextNormalWithContext(.any);
391 }
392
393 pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
394 const start_index = self.index;
395 var result = Token{
396 .id = .eof,
397 .start = start_index,
398 .end = undefined,
399 .line_number = self.line_handler.line_number,
400 };
401 var state = StateNormal.start;
402
        // Note: The Windows RC compiler uses a non-standard method of computing
        // length for its 'string literal too long' errors; it isn't easily
        // explained or intuitive (it's sort-of pre-parsed byte length, but with
        // a few exceptions/edge cases).
        //
        // It also behaves strangely with non-ASCII codepoints, e.g. even though the default
        // limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
        // and 2048 𐐷 codepoints (2 UTF-16 code units each).
        //
        // TODO: Understand this more, bring it more in line with how the Win32 limits work.
        // Alternatively, do something that makes more sense but may be more permissive.
414 var string_literal_length: usize = 0;
415 // Keeping track of the string literal column prevents pathological edge cases when
416 // there are tons of tab stop characters within a string literal.
417 var string_literal_column: usize = 0;
418 var string_literal_collapsing_whitespace: bool = false;
419 var still_could_have_exponent: bool = true;
420 var exponent_index: ?usize = null;
421 while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
422 const c = codepoint.value;
423 const in_string_literal = switch (state) {
424 .quoted_ascii_string,
425 .quoted_wide_string,
426 .quoted_ascii_string_escape,
427 .quoted_wide_string_escape,
428 .quoted_ascii_string_maybe_end,
429 .quoted_wide_string_maybe_end,
430 =>
431 // If the current line is not the same line as the start of the string literal,
432 // then we want to treat the current codepoint as 'not in a string literal'
433 // for the purposes of detecting illegal codepoints. This means that we will
434 // error on illegal-outside-string-literal characters that are outside string
435 // literals from the perspective of a C preprocessor, but that may be
436 // inside string literals from the perspective of the RC lexer. For example,
437 // "hello
438 // @"
439 // will be treated as a single string literal by the RC lexer but the Win32
440 // preprocessor will consider this an unclosed string literal followed by
441 // the character @ and ", and will therefore error since the Win32 RC preprocessor
442 // errors on the @ character outside string literals.
443 //
444 // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
445 // at lex-time, and avoid the need for a separate step that checks for this edge-case
446 // specifically.
447 result.line_number == self.line_handler.line_number,
448 else => false,
449 };
450 try self.checkForIllegalCodepoint(codepoint, in_string_literal);
451 switch (state) {
452 .start => switch (c) {
453 '\r', '\n' => {
454 result.start = self.index + 1;
455 result.line_number = self.incrementLineNumber();
456 },
457 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
458 result.start = self.index + 1;
459 },
460 // NBSP only counts as whitespace at the start of a line (but
461 // can be intermixed with other whitespace). Who knows why.
462 '\xA0' => if (self.at_start_of_line) {
463 result.start = self.index + codepoint.byte_len;
464 } else {
465 state = .literal;
466 self.at_start_of_line = false;
467 },
468 'L', 'l' => {
469 state = .literal_or_quoted_wide_string;
470 self.at_start_of_line = false;
471 },
472 'E', 'e' => {
473 state = .e;
474 self.at_start_of_line = false;
475 },
476 'B', 'b' => {
477 state = .b;
478 self.at_start_of_line = false;
479 },
480 '"' => {
481 state = .quoted_ascii_string;
482 self.at_start_of_line = false;
483 string_literal_collapsing_whitespace = false;
484 string_literal_length = 0;
485
486 var dummy_token = Token{
487 .start = self.index,
488 .end = self.index,
489 .line_number = self.line_handler.line_number,
490 .id = .invalid,
491 };
492 string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
493 },
494 '+', '&', '|' => {
495 self.index += 1;
496 result.id = .operator;
497 self.at_start_of_line = false;
498 break;
499 },
500 '-' => {
501 if (context == .expect_operator) {
502 self.index += 1;
503 result.id = .operator;
504 self.at_start_of_line = false;
505 break;
506 } else {
507 state = .number_literal;
508 still_could_have_exponent = true;
509 exponent_index = null;
510 self.at_start_of_line = false;
511 }
512 },
513 '0'...'9', '~' => {
514 state = .number_literal;
515 still_could_have_exponent = true;
516 exponent_index = null;
517 self.at_start_of_line = false;
518 },
519 '#' => {
520 if (self.at_start_of_line) {
521 state = .preprocessor;
522 } else {
523 state = .literal;
524 }
525 self.at_start_of_line = false;
526 },
527 ';' => {
528 state = .semicolon;
529 self.at_start_of_line = false;
530 },
531 '{', '}' => {
532 self.index += 1;
533 result.id = if (c == '{') .begin else .end;
534 self.at_start_of_line = false;
535 break;
536 },
537 '(', ')' => {
538 self.index += 1;
539 result.id = if (c == '(') .open_paren else .close_paren;
540 self.at_start_of_line = false;
541 break;
542 },
543 ',' => {
544 self.index += 1;
545 result.id = .comma;
546 self.at_start_of_line = false;
547 break;
548 },
549 else => {
550 if (isNonAsciiDigit(c)) {
551 self.error_context_token = .{
552 .id = .number,
553 .start = result.start,
554 .end = self.index + 1,
555 .line_number = self.line_handler.line_number,
556 };
557 return error.InvalidDigitCharacterInNumberLiteral;
558 }
559 state = .literal;
560 self.at_start_of_line = false;
561 },
562 },
563 .preprocessor => switch (c) {
564 '\r', '\n' => {
565 try self.evaluatePreprocessorCommand(result.start, self.index);
566 result.start = self.index + 1;
567 state = .start;
568 result.line_number = self.incrementLineNumber();
569 },
570 else => {},
571 },
                // A semicolon acts as a line terminator: everything after it is
                // skipped until the next line.
574 .semicolon => switch (c) {
575 '\r', '\n' => {
576 result.start = self.index + 1;
577 state = .start;
578 result.line_number = self.incrementLineNumber();
579 },
580 else => {},
581 },
582 .number_literal => switch (c) {
583 // zig fmt: off
584 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
585 '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
586 '\'', ';', '=',
587 => {
588 // zig fmt: on
589 result.id = .number;
590 break;
591 },
592 '0'...'9' => {
593 if (exponent_index) |exp_i| {
594 if (self.index - 1 == exp_i) {
595 // Note: This being an error is a quirk of the preprocessor used by
596 // the Win32 RC compiler.
597 self.error_context_token = .{
598 .id = .number,
599 .start = result.start,
600 .end = self.index + 1,
601 .line_number = self.line_handler.line_number,
602 };
603 return error.InvalidNumberWithExponent;
604 }
605 }
606 },
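                    // Derived from the exponent checks in this state: `1e1` is an
                    // error (a digit directly following the exponent 'e'), while a
                    // hex-style literal like `0x1e1` is accepted because the 'x'
                    // clears `still_could_have_exponent` before the 'e' is reached.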
607 'e', 'E' => {
608 if (still_could_have_exponent) {
609 exponent_index = self.index;
610 still_could_have_exponent = false;
611 }
612 },
613 else => {
614 if (isNonAsciiDigit(c)) {
615 self.error_context_token = .{
616 .id = .number,
617 .start = result.start,
618 .end = self.index + 1,
619 .line_number = self.line_handler.line_number,
620 };
621 return error.InvalidDigitCharacterInNumberLiteral;
622 }
623 still_could_have_exponent = false;
624 },
625 },
626 .literal_or_quoted_wide_string => switch (c) {
627 // zig fmt: off
628 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
629 '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
630 '\'', ';', '=',
631 // zig fmt: on
632 => {
633 result.id = .literal;
634 break;
635 },
636 '"' => {
637 state = .quoted_wide_string;
638 string_literal_collapsing_whitespace = false;
639 string_literal_length = 0;
640
641 var dummy_token = Token{
642 .start = self.index,
643 .end = self.index,
644 .line_number = self.line_handler.line_number,
645 .id = .invalid,
646 };
647 string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
648 },
649 else => {
650 state = .literal;
651 },
652 },
653 .literal => switch (c) {
654 // zig fmt: off
655 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
656 '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
657 '\'', ';', '=',
658 => {
659 // zig fmt: on
660 result.id = .literal;
661 break;
662 },
663 else => {},
664 },
665 .e => switch (c) {
666 'N', 'n' => {
667 state = .en;
668 },
669 else => {
670 state = .literal;
671 self.index -= 1;
672 },
673 },
674 .en => switch (c) {
675 'D', 'd' => {
676 result.id = .end;
677 self.index += 1;
678 break;
679 },
680 else => {
681 state = .literal;
682 self.index -= 1;
683 },
684 },
685 .b => switch (c) {
686 'E', 'e' => {
687 state = .be;
688 },
689 else => {
690 state = .literal;
691 self.index -= 1;
692 },
693 },
694 .be => switch (c) {
695 'G', 'g' => {
696 state = .beg;
697 },
698 else => {
699 state = .literal;
700 self.index -= 1;
701 },
702 },
703 .beg => switch (c) {
704 'I', 'i' => {
705 state = .begi;
706 },
707 else => {
708 state = .literal;
709 self.index -= 1;
710 },
711 },
712 .begi => switch (c) {
713 'N', 'n' => {
714 result.id = .begin;
715 self.index += 1;
716 break;
717 },
718 else => {
719 state = .literal;
720 self.index -= 1;
721 },
722 },
723 .quoted_ascii_string, .quoted_wide_string => switch (c) {
724 '"' => {
725 string_literal_column += 1;
726 state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
727 },
728 '\\' => {
729 string_literal_length += 1;
730 string_literal_column += 1;
731 state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
732 },
733 '\r' => {
734 string_literal_column = 0;
735 // \r doesn't count towards string literal length
736
737 // Increment line number but don't affect the result token's line number
738 _ = self.incrementLineNumber();
739 },
740 '\n' => {
741 string_literal_column = 0;
742 // first \n expands to <space><\n>
743 if (!string_literal_collapsing_whitespace) {
744 string_literal_length += 2;
745 string_literal_collapsing_whitespace = true;
746 }
747 // the rest are collapsed into the <space><\n>
748
749 // Increment line number but don't affect the result token's line number
750 _ = self.incrementLineNumber();
751 },
752 // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
753 '\t', ' ', '\x0b', '\x0c' => {
754 if (!string_literal_collapsing_whitespace) {
755 // Literal tab characters are counted as the number of space characters
756 // needed to reach the next 8-column tab stop.
757 const width = columnWidth(string_literal_column, @intCast(c), 8);
758 string_literal_length += width;
759 string_literal_column += width;
760 }
761 },
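                    // Example of the length bookkeeping above: string contents `ab<\n>   cd`
                    // contribute 2 (ab) + 2 (the first newline becomes <space><newline>)
                    // + 0 (the following whitespace is collapsed) + 2 (cd) = 6
                    // to string_literal_length.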
762 else => {
763 string_literal_collapsing_whitespace = false;
764 string_literal_length += 1;
765 string_literal_column += 1;
766 },
767 },
768 .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
769 '"' => {
770 self.error_context_token = .{
771 .id = .invalid,
772 .start = self.index - 1,
773 .end = self.index + 1,
774 .line_number = self.line_handler.line_number,
775 };
776 return error.FoundCStyleEscapedQuote;
777 },
778 else => {
779 string_literal_length += 1;
780 string_literal_column += 1;
781 state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
782 },
783 },
784 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
785 '"' => {
786 state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
787 // Escaped quotes count as 1 char for string literal length checks.
788 // Since we did not increment on the first " (because it could have been
789 // the end of the quoted string), we increment here
790 string_literal_length += 1;
791 string_literal_column += 1;
792 },
793 else => {
794 result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
795 break;
796 },
797 },
798 }
799 } else { // got EOF
800 switch (state) {
801 .start => {},
802 .semicolon => {
803 // Skip past everything up to the EOF
804 result.start = self.index;
805 },
806 .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
807 result.id = .literal;
808 },
809 .preprocessor => {
810 try self.evaluatePreprocessorCommand(result.start, self.index);
811 result.start = self.index;
812 },
813 .number_literal => {
814 result.id = .number;
815 },
816 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
817 result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
818 },
819 .quoted_ascii_string,
820 .quoted_wide_string,
821 .quoted_ascii_string_escape,
822 .quoted_wide_string_escape,
823 => {
824 self.error_context_token = .{
825 .id = .eof,
826 .start = self.index,
827 .end = self.index,
828 .line_number = self.line_handler.line_number,
829 };
830 return LexError.UnfinishedStringLiteral;
831 },
832 }
833 }
834
835 result.end = self.index;
836
837 if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
838 if (string_literal_length > self.max_string_literal_codepoints) {
839 self.error_context_token = result;
840 return LexError.StringLiteralTooLong;
841 }
842 }
843
844 // EOF tokens must have their start index match the end index
845 std.debug.assert(result.id != .eof or result.start == result.end);
846
847 return result;
848 }
849
850 /// Increments line_number appropriately (handling line ending pairs)
851 /// and returns the new line number.
852 fn incrementLineNumber(self: *Self) usize {
853 _ = self.line_handler.incrementLineNumber(self.index);
854 self.at_start_of_line = true;
855 return self.line_handler.line_number;
856 }
857
858 fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
859 const err = switch (codepoint.value) {
860 // 0x00 = NUL
861 // 0x1A = Substitute (treated as EOF)
862 // NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
863 // is run through the clang preprocessor it will no longer have 0x1A characters in it.
864 // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
865 0x00, 0x1A, 0x7F => error.IllegalByte,
866 // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
867 // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
868 0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
869 // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
870 // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
871 // outside of string literals. Not exactly sure why this would be the case, though.
872 // TODO: Make sure there aren't any exceptions
873 '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
874 // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
875 // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
876 // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
877 // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
878 // being 'missing' when included in a string literal (the Windows RC compiler acts as
879 // if the codepoint was never part of the string literal).
880 '\u{FEFF}' => error.IllegalByteOrderMark,
            // Similar deal with this private use codepoint: it gets skipped/ignored by the
            // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
            // enough of a footgun with no real use-cases that it's still worth erroring instead of
            // emulating the RC compiler's behavior, though.
885 '\u{E000}' => error.IllegalPrivateUseCharacter,
886 // These codepoints lead to strange errors when used outside of string literals,
887 // and miscompilations when used within string literals. We avoid the miscompilation
888 // within string literals and emit a warning, but outside of string literals it makes
889 // more sense to just disallow these codepoints.
890 0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
891 else => return,
892 };
893 self.error_context_token = .{
894 .id = .invalid,
895 .start = self.index,
896 .end = self.index + codepoint.byte_len,
897 .line_number = self.line_handler.line_number,
898 };
899 return err;
900 }
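
    // A minimal check of the rule above: `@` is an error outside of string
    // literals but is allowed inside them.
    test "@ outside vs. inside string literals" {
        var outside = Lexer.init("@", .{});
        try std.testing.expectError(error.IllegalByteOutsideStringLiterals, outside.nextNormal());

        var inside = Lexer.init("\"@\"", .{});
        const token = try inside.nextNormal();
        try std.testing.expectEqual(Token.Id.quoted_ascii_string, token.id);
    }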
901
902 fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
903 const token = Token{
904 .id = .preprocessor_command,
905 .start = start,
906 .end = end,
907 .line_number = self.line_handler.line_number,
908 };
909 errdefer self.error_context_token = token;
910 const full_command = self.buffer[start..end];
911
912 const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
913 error.NotPragma, error.NotCodePagePragma => return,
914 else => |e| return e,
915 }) orelse self.default_code_page;
916
917 // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
918 // > This pragma is not supported in an included resource file (.rc)
919 //
920 // Even though the Win32 behavior is to just ignore such directives silently,
921 // this is an error in the lexer to allow for emitting warnings/errors when
922 // such directives are found if that's wanted. The intention is for the lexer
923 // to still be able to work correctly after this error is returned.
924 if (self.source_mappings) |source_mappings| {
925 if (!source_mappings.isRootFile(token.line_number)) {
926 return error.CodePagePragmaInIncludedFile;
927 }
928 }
929
930 self.seen_pragma_code_pages +|= 1;
931 self.last_pragma_code_page_token = token;
932 self.current_code_page = code_page;
933 }
934
935 pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
936 const err = switch (lex_err) {
937 error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
938 error.StringLiteralTooLong => return .{
939 .err = .string_literal_too_long,
940 .code_page = self.current_code_page,
941 .token = self.error_context_token.?,
942 .extra = .{ .number = self.max_string_literal_codepoints },
943 },
944 error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
945 error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
946 error.IllegalByte => ErrorDetails.Error.illegal_byte,
947 error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
948 error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
949 error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
950 error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
951 error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
952 error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
953 error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
954 error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
955 error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
956 error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
957 error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
958 error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
959 };
960 return .{
961 .err = err,
962 .code_page = self.current_code_page,
963 .token = self.error_context_token.?,
964 };
965 }
966};
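
// A minimal sketch of the `#pragma code_page` handling in `evaluatePreprocessorCommand`:
// a valid pragma switches the code page used for the rest of the lex
// (this assumes 65001 maps to the supported UTF-8 code page).
test "code_page pragma switches the current code page" {
    var lexer = Lexer.init("#pragma code_page(65001)\n1", .{});
    const token = try lexer.nextNormal();
    try std.testing.expectEqual(Token.Id.number, token.id);
    try std.testing.expectEqual(SupportedCodePage.utf8, lexer.current_code_page);
    try std.testing.expectEqual(@as(u2, 1), lexer.seen_pragma_code_pages);
}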
967
968fn parseCodePageNum(str: []const u8) !u32 {
969 var x: u32 = 0;
970 for (str) |c| {
971 const digit = try std.fmt.charToDigit(c, 10);
972 if (x != 0) x = try std.math.mul(u32, x, 10);
973 x = try std.math.add(u32, x, digit);
974 }
975 return x;
976}
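
// Quick check of the hand-rolled digit parsing above.
test "parseCodePageNum" {
    try std.testing.expectEqual(@as(u32, 1252), try parseCodePageNum("1252"));
    // Leading zeroes do not change the parsed value; `parsePragmaCodePage` is what
    // rejects code pages written with leading zeroes.
    try std.testing.expectEqual(@as(u32, 1252), try parseCodePageNum("01252"));
    try std.testing.expectError(error.InvalidCharacter, parseCodePageNum("12a52"));
}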
977
978/// Returns `null` when the code_page is set to DEFAULT
979pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
980 var command = full_command;
981
982 // Anything besides exactly this is ignored by the Windows RC implementation
983 const expected_directive = "#pragma";
984 if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
985 command = command[expected_directive.len..];
986
987 if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
988 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
989 command = command[1..];
990 }
991
992 // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
993 // and it will error with 'Missing left parenthesis in code_page #pragma'
994 const expected_extension = "code_page";
995 if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
996 command = command[expected_extension.len..];
997
998 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
999 command = command[1..];
1000 }
1001
1002 if (command.len == 0 or command[0] != '(') {
1003 return error.CodePagePragmaMissingLeftParen;
1004 }
1005 command = command[1..];
1006
1007 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
1008 command = command[1..];
1009 }
1010
1011 var num_str: []u8 = command[0..0];
1012 while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
1013 command = command[1..];
1014 num_str.len += 1;
1015 }
1016
1017 if (num_str.len == 0) {
1018 return error.CodePagePragmaNotInteger;
1019 }
1020
1021 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
1022 command = command[1..];
1023 }
1024
1025 if (command.len == 0 or command[0] != ')') {
1026 return error.CodePagePragmaMissingRightParen;
1027 }
1028
1029 const code_page: ?SupportedCodePage = code_page: {
1030 if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
1031 break :code_page null;
1032 }
1033
1034 // The Win32 compiler behaves fairly strangely around maxInt(u32):
1035 // - If the overflowed u32 wraps and becomes a known code page ID, then
1036 // it will error/warn with "Codepage not valid: ignored" (depending on /w)
1037 // - If the overflowed u32 wraps and does not become a known code page ID,
1038 // then it will error with 'constant too big' and 'Codepage not integer'
1039 //
1040 // Instead of that, we just have a separate error specifically for overflow.
1041 const num = parseCodePageNum(num_str) catch |err| switch (err) {
1042 error.InvalidCharacter => return error.CodePagePragmaNotInteger,
1043 error.Overflow => return error.CodePagePragmaOverflow,
1044 };
1045
1046 // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
1047 if (num_str[0] == '0' and num != 0) {
1048 return error.CodePagePragmaInvalidCodePage;
1049 }
1050 // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
1051 else if (num == 0) {
1052 return error.CodePagePragmaNotInteger;
1053 }
1054 // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
1055 if (num > std.math.maxInt(u16)) {
1056 return error.CodePagePragmaInvalidCodePage;
1057 }
1058
1059 break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) {
1060 error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage,
1061 error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage,
1062 };
1063 };
1064
1065 return code_page;
1066}
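
// A few spot-checks of `parsePragmaCodePage` as implemented above.
test "parsePragmaCodePage" {
    // DEFAULT resolves to null (meaning: use the default code page).
    try std.testing.expectEqual(@as(?SupportedCodePage, null), try parsePragmaCodePage("#pragma code_page(DEFAULT)"));
    // Anything that isn't exactly `#pragma` is rejected with an error the caller can ignore.
    try std.testing.expectError(error.NotPragma, parsePragmaCodePage("#define FOO 1"));
    // Missing parentheses and leading zeroes mirror the Win32 RC behavior described above.
    try std.testing.expectError(error.CodePagePragmaMissingLeftParen, parsePragmaCodePage("#pragma code_page 1252"));
    try std.testing.expectError(error.CodePagePragmaInvalidCodePage, parsePragmaCodePage("#pragma code_page(01252)"));
}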
1067
1068fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void {
1069 var lexer = Lexer.init(source, .{});
1070 if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer});
1071 for (expected_tokens) |expected_token_id| {
1072 const token = try lexer.nextNormal();
1073 if (dumpTokensDuringTests) lexer.dump(&token);
1074 try std.testing.expectEqual(expected_token_id, token.id);
1075 }
1076 const last_token = try lexer.nextNormal();
1077 try std.testing.expectEqual(Token.Id.eof, last_token.id);
1078}
1079
1080fn expectLexError(expected: LexError, actual: anytype) !void {
1081 try std.testing.expectError(expected, actual);
1082 if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual});
1083}
1084
1085test "normal: numbers" {
1086 try testLexNormal("1", &.{.number});
1087 try testLexNormal("-1", &.{.number});
1088 try testLexNormal("- 1", &.{ .number, .number });
1089 try testLexNormal("-a", &.{.number});
1090}
1091
1092test "normal: string literals" {
1093 try testLexNormal("\"\"", &.{.quoted_ascii_string});
1094 // "" is an escaped "
1095 try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string});
1096}
1097
1098test "superscript chars and code pages" {
1099 const firstToken = struct {
1100 pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token {
1101 var lexer = Lexer.init(source, .{ .default_code_page = default_code_page });
1102 return lexer.next(lex_method);
1103 }
1104 }.firstToken;
1105 const utf8_source = "²";
1106 const windows1252_source = "\xB2";
1107
1108 const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal);
1109 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252);
1110
1111 const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal);
1112 try std.testing.expectEqual(Token{
1113 .id = .literal,
1114 .start = 0,
1115 .end = 2,
1116 .line_number = 1,
1117 }, utf8_encoded_as_windows1252);
1118
1119 const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal);
1120 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8);
1121
1122 const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal);
1123 try std.testing.expectEqual(Token{
1124 .id = .literal,
1125 .start = 0,
1126 .end = 1,
1127 .line_number = 1,
1128 }, windows1252_encoded_as_utf8);
1129}