1//! Expects to be run after the C preprocessor and after `removeComments`.
2//! This means that the lexer assumes that:
3//! - Splices ('\' at the end of a line) have been handled/collapsed.
4//! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
5//! - All comments have been removed.
6
7const std = @import("std");
8const ErrorDetails = @import("errors.zig").ErrorDetails;
9const columnWidth = @import("literals.zig").columnWidth;
10const code_pages = @import("code_pages.zig");
11const SupportedCodePage = code_pages.SupportedCodePage;
12const SourceMappings = @import("source_mapping.zig").SourceMappings;
13const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;
14
15const dumpTokensDuringTests = false;
16
17pub const default_max_string_literal_codepoints = 4097;
18
19pub const Token = struct {
20 id: Id,
21 start: usize,
22 end: usize,
23 line_number: usize,
24
25 pub const Id = enum {
26 literal,
27 number,
28 quoted_ascii_string,
29 quoted_wide_string,
30 operator,
31 begin,
32 end,
33 comma,
34 open_paren,
35 close_paren,
        /// This Id is only used for errors; the Lexer will never return one
        /// of these from a `next` call.
38 preprocessor_command,
39 invalid,
40 eof,
41
42 pub fn nameForErrorDisplay(self: Id) []const u8 {
43 return switch (self) {
44 .literal => "<literal>",
45 .number => "<number>",
46 .quoted_ascii_string => "<quoted ascii string>",
47 .quoted_wide_string => "<quoted wide string>",
48 .operator => "<operator>",
49 .begin => "<'{' or BEGIN>",
50 .end => "<'}' or END>",
51 .comma => ",",
52 .open_paren => "(",
53 .close_paren => ")",
54 .preprocessor_command => "<preprocessor command>",
55 .invalid => unreachable,
56 .eof => "<eof>",
57 };
58 }
59 };
60
61 pub fn slice(self: Token, buffer: []const u8) []const u8 {
62 return buffer[self.start..self.end];
63 }
64
65 /// Returns 0-based column
66 pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
67 const line_start = maybe_line_start orelse token.getLineStartForColumnCalc(source);
68
69 var i: usize = line_start;
70 var column: usize = 0;
71 while (i < token.start) : (i += 1) {
72 column += columnWidth(column, source[i], tab_columns);
73 }
74 return column;
75 }
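
    // For example, with tab_columns = 8, a token that starts immediately after a
    // single leading tab is at column 8 (the tab advances from column 0 to the next
    // tab stop), while a token after two leading spaces is at column 2. (This assumes
    // `columnWidth` returns the distance to the next tab stop for tabs and 1 for
    // other bytes, which is how the loop above uses it.)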
76
77 // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
78 // (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
79 pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
80 const line_start = line_start: {
81 if (token.start != 0) {
82 // start checking at the byte before the token
83 var index = token.start - 1;
84 while (true) {
85 if (source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
86 if (index != 0) index -= 1 else break;
87 }
88 }
89 break :line_start 0;
90 };
91 return line_start;
92 }
93
94 pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
95 const line_start = line_start: {
96 if (token.start != 0) {
97 // start checking at the byte before the token
98 var index = token.start - 1;
99 while (true) {
100 if (source[index] == '\r' or source[index] == '\n') break :line_start @min(source.len - 1, index + 1);
101 if (index != 0) index -= 1 else break;
102 }
103 }
104 break :line_start 0;
105 };
106 return line_start;
107 }
108
109 pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
110 const line_start = maybe_line_start orelse token.getLineStartForErrorDisplay(source);
111
112 var line_end = line_start;
113 while (line_end < source.len and source[line_end] != '\r' and source[line_end] != '\n') : (line_end += 1) {}
114 return source[line_start..line_end];
115 }
116
117 pub fn isStringLiteral(token: Token) bool {
118 return token.id == .quoted_ascii_string or token.id == .quoted_wide_string;
119 }
120};
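
// A minimal example of extracting the line a token lives on for error display.
test "Token.getLineForErrorDisplay" {
    const source = "line one\r\nline two\nline three";
    // The token spans the "two" in "line two".
    const token = Token{ .id = .literal, .start = 15, .end = 18, .line_number = 2 };
    try std.testing.expectEqual(@as(usize, 10), token.getLineStartForErrorDisplay(source));
    try std.testing.expectEqualStrings("line two", token.getLineForErrorDisplay(source, null));
}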
121
122pub const LineHandler = struct {
123 line_number: usize = 1,
124 buffer: []const u8,
125 last_line_ending_index: ?usize = null,
126
127 /// Like incrementLineNumber but checks that the current char is a line ending first.
128 /// Returns the new line number if it was incremented, null otherwise.
129 pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
130 const c = self.buffer[cur_index];
131 if (c == '\r' or c == '\n') {
132 return self.incrementLineNumber(cur_index);
133 }
134 return null;
135 }
136
137 /// Increments line_number appropriately (handling line ending pairs)
138 /// and returns the new line number if it was incremented, or null otherwise.
139 pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
140 if (self.currentIndexFormsLineEndingPair(cur_index)) {
141 self.last_line_ending_index = null;
142 return null;
143 } else {
144 self.line_number += 1;
145 self.last_line_ending_index = cur_index;
146 return self.line_number;
147 }
148 }
149
    /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r or \n\n).
    /// Expects `cur_index` and `last_line_ending_index` (if non-null) to point at line endings.
    ///
    /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    /// seems to drop all carriage returns during preprocessing and then replace all
    /// remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    /// Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    /// since Clang converts unpaired <CR> into unpaired <LF>.
158 pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
159 if (self.last_line_ending_index == null) return false;
160
        // The last line ending must immediately precede the current index. We know
        // cur_index must be >= 1 since last_line_ending_index is non-null (so if the
        // subtraction below overflows, it is a bug at the callsite of this function).
164 if (self.last_line_ending_index.? != cur_index - 1) return false;
165
166 const cur_line_ending = self.buffer[cur_index];
167 const last_line_ending = self.buffer[self.last_line_ending_index.?];
168
169 // sanity check
170 std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
171 std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');
172
173 // can't be \n\n or \r\r
174 if (last_line_ending == cur_line_ending) return false;
175
176 return true;
177 }
178};
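
// A small sanity check of the line ending pair handling described above:
// "\r\n" and "\n\r" count as a single line ending, while "\n\n" counts as two.
test "LineHandler: line ending pairs" {
    const source = "a\r\nb\n\nc";
    var handler = LineHandler{ .buffer = source };
    var i: usize = 0;
    while (i < source.len) : (i += 1) {
        _ = handler.maybeIncrementLineNumber(i);
    }
    // Lines: "a", "b", "", "c"
    try std.testing.expectEqual(@as(usize, 4), handler.line_number);
}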
179
180pub const LexError = error{
181 UnfinishedStringLiteral,
182 StringLiteralTooLong,
183 InvalidNumberWithExponent,
184 InvalidDigitCharacterInNumberLiteral,
185 IllegalByte,
186 IllegalByteOutsideStringLiterals,
187 IllegalCodepointOutsideStringLiterals,
188 IllegalByteOrderMark,
189 IllegalPrivateUseCharacter,
190 FoundCStyleEscapedQuote,
191 CodePagePragmaMissingLeftParen,
192 CodePagePragmaMissingRightParen,
193 /// Can be caught and ignored
194 CodePagePragmaInvalidCodePage,
195 CodePagePragmaNotInteger,
196 CodePagePragmaOverflow,
197 CodePagePragmaUnsupportedCodePage,
198 /// Can be caught and ignored
199 CodePagePragmaInIncludedFile,
200};
201
202pub const Lexer = struct {
203 const Self = @This();
204
205 buffer: []const u8,
206 index: usize,
207 line_handler: LineHandler,
208 at_start_of_line: bool = true,
209 error_context_token: ?Token = null,
210 current_code_page: SupportedCodePage,
211 default_code_page: SupportedCodePage,
212 source_mappings: ?*SourceMappings,
213 max_string_literal_codepoints: u15,
214 /// Needed to determine whether or not the output code page should
215 /// be set in the parser.
216 seen_pragma_code_pages: u2 = 0,
217 last_pragma_code_page_token: ?Token = null,
218
219 pub const Error = LexError;
220
221 pub const LexerOptions = struct {
222 default_code_page: SupportedCodePage = .windows1252,
223 source_mappings: ?*SourceMappings = null,
224 max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
225 };
226
227 pub fn init(buffer: []const u8, options: LexerOptions) Self {
228 return Self{
229 .buffer = buffer,
230 .index = 0,
231 .current_code_page = options.default_code_page,
232 .default_code_page = options.default_code_page,
233 .source_mappings = options.source_mappings,
234 .max_string_literal_codepoints = options.max_string_literal_codepoints,
235 .line_handler = .{ .buffer = buffer },
236 };
237 }
238
239 pub fn dump(self: *Self, token: *const Token) void {
240 std.debug.print("{s}:{d}: {f}\n", .{
241 @tagName(token.id), token.line_number, std.ascii.hexEscape(token.slice(self.buffer), .lower),
242 });
243 }
244
245 pub const LexMethod = enum {
246 whitespace_delimiter_only,
247 normal,
248 normal_expect_operator,
249 };
250
251 pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
252 switch (method) {
253 .whitespace_delimiter_only => return self.nextWhitespaceDelimeterOnly(),
254 .normal => return self.nextNormal(),
255 .normal_expect_operator => return self.nextNormalWithContext(.expect_operator),
256 }
257 }
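
    // A sketch of how the lexing methods differ, derived from the state machines below:
    // - `.normal` lexes `1-1` as two number tokens (`1` then `-1`).
    // - `.normal_expect_operator` lexes `1-1` as number, operator, number, since a
    //   leading `-` is treated as an operator when an operator is expected.
    // - `.whitespace_delimiter_only` only splits on whitespace/line endings, so
    //   something like `foo(1,2)` is returned as a single literal token.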
258
259 const StateWhitespaceDelimiterOnly = enum {
260 start,
261 literal,
262 preprocessor,
263 semicolon,
264 };
265
266 pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
267 const start_index = self.index;
268 var result = Token{
269 .id = .eof,
270 .start = start_index,
271 .end = undefined,
272 .line_number = self.line_handler.line_number,
273 };
274 var state = StateWhitespaceDelimiterOnly.start;
275
276 while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
277 const c = codepoint.value;
278 try self.checkForIllegalCodepoint(codepoint, false);
279 switch (state) {
280 .start => switch (c) {
281 '\r', '\n' => {
282 result.start = self.index + 1;
283 result.line_number = self.incrementLineNumber();
284 },
285 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
286 result.start = self.index + 1;
287 },
288 // NBSP only counts as whitespace at the start of a line (but
289 // can be intermixed with other whitespace). Who knows why.
290 // TODO: This should either be removed, or it should also include
291 // the codepoints listed in disjoint_code_page.zig
292 '\xA0' => if (self.at_start_of_line) {
293 result.start = self.index + codepoint.byte_len;
294 } else {
295 state = .literal;
296 self.at_start_of_line = false;
297 },
298 '#' => {
299 if (self.at_start_of_line) {
300 state = .preprocessor;
301 } else {
302 state = .literal;
303 }
304 self.at_start_of_line = false;
305 },
306 ';' => {
307 state = .semicolon;
308 self.at_start_of_line = false;
309 },
310 else => {
311 state = .literal;
312 self.at_start_of_line = false;
313 },
314 },
315 .literal => switch (c) {
316 '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
317 result.id = .literal;
318 break;
319 },
320 else => {},
321 },
322 .preprocessor => switch (c) {
323 '\r', '\n' => {
324 try self.evaluatePreprocessorCommand(result.start, self.index);
325 result.start = self.index + 1;
326 state = .start;
327 result.line_number = self.incrementLineNumber();
328 },
329 else => {},
330 },
331 .semicolon => switch (c) {
332 '\r', '\n' => {
333 result.start = self.index + 1;
334 state = .start;
335 result.line_number = self.incrementLineNumber();
336 },
337 else => {},
338 },
339 }
340 } else { // got EOF
341 switch (state) {
342 .start => {},
343 .semicolon => {
344 // Skip past everything up to the EOF
345 result.start = self.index;
346 },
347 .literal => {
348 result.id = .literal;
349 },
350 .preprocessor => {
351 try self.evaluatePreprocessorCommand(result.start, self.index);
352 result.start = self.index;
353 },
354 }
355 }
356
357 result.end = self.index;
358
359 // EOF tokens must have their start index match the end index
360 std.debug.assert(result.id != .eof or result.start == result.end);
361
362 return result;
363 }
364
365 const StateNormal = enum {
366 start,
367 literal_or_quoted_wide_string,
368 quoted_ascii_string,
369 quoted_wide_string,
370 quoted_ascii_string_escape,
371 quoted_wide_string_escape,
372 quoted_ascii_string_maybe_end,
373 quoted_wide_string_maybe_end,
374 literal,
375 number_literal,
376 preprocessor,
377 semicolon,
378 // end
379 e,
380 en,
381 // begin
382 b,
383 be,
384 beg,
385 begi,
386 };
387
388 /// TODO: A not-terrible name
389 pub fn nextNormal(self: *Self) LexError!Token {
390 return self.nextNormalWithContext(.any);
391 }
392
393 pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
394 const start_index = self.index;
395 var result = Token{
396 .id = .eof,
397 .start = start_index,
398 .end = undefined,
399 .line_number = self.line_handler.line_number,
400 };
401 var state = StateNormal.start;
402
        // Note: The Windows RC compiler uses a non-standard method of computing
        // length for its 'string literal too long' errors; it isn't easily
        // explained or intuitive (it's sort-of pre-parsed byte length, but with
        // a few exceptions/edge cases).
        //
        // It also behaves strangely with non-ASCII codepoints, e.g. even though the default
        // limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
        // and 2048 𐐷 codepoints (2 UTF-16 code units each).
        //
        // TODO: Understand this more, bring it more in line with how the Win32 limits work.
        // Alternatively, do something that makes more sense but may be more permissive.
414 var string_literal_length: usize = 0;
415 // Keeping track of the string literal column prevents pathological edge cases when
416 // there are tons of tab stop characters within a string literal.
417 var string_literal_column: usize = 0;
418 var string_literal_collapsing_whitespace: bool = false;
419 var still_could_have_exponent: bool = true;
420 var exponent_index: ?usize = null;
421 while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
422 const c = codepoint.value;
423 const in_string_literal = switch (state) {
424 .quoted_ascii_string,
425 .quoted_wide_string,
426 .quoted_ascii_string_escape,
427 .quoted_wide_string_escape,
428 .quoted_ascii_string_maybe_end,
429 .quoted_wide_string_maybe_end,
430 =>
431 // If the current line is not the same line as the start of the string literal,
432 // then we want to treat the current codepoint as 'not in a string literal'
433 // for the purposes of detecting illegal codepoints. This means that we will
434 // error on illegal-outside-string-literal characters that are outside string
435 // literals from the perspective of a C preprocessor, but that may be
436 // inside string literals from the perspective of the RC lexer. For example,
437 // "hello
438 // @"
439 // will be treated as a single string literal by the RC lexer but the Win32
440 // preprocessor will consider this an unclosed string literal followed by
441 // the character @ and ", and will therefore error since the Win32 RC preprocessor
442 // errors on the @ character outside string literals.
443 //
444 // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
445 // at lex-time, and avoid the need for a separate step that checks for this edge-case
446 // specifically.
447 result.line_number == self.line_handler.line_number,
448 else => false,
449 };
450 try self.checkForIllegalCodepoint(codepoint, in_string_literal);
451 switch (state) {
452 .start => switch (c) {
453 '\r', '\n' => {
454 result.start = self.index + 1;
455 result.line_number = self.incrementLineNumber();
456 },
457 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
458 result.start = self.index + 1;
459 },
460 // NBSP only counts as whitespace at the start of a line (but
461 // can be intermixed with other whitespace). Who knows why.
462 '\xA0' => if (self.at_start_of_line) {
463 result.start = self.index + codepoint.byte_len;
464 } else {
465 state = .literal;
466 self.at_start_of_line = false;
467 },
468 'L', 'l' => {
469 state = .literal_or_quoted_wide_string;
470 self.at_start_of_line = false;
471 },
472 'E', 'e' => {
473 state = .e;
474 self.at_start_of_line = false;
475 },
476 'B', 'b' => {
477 state = .b;
478 self.at_start_of_line = false;
479 },
480 '"' => {
481 state = .quoted_ascii_string;
482 self.at_start_of_line = false;
483 string_literal_collapsing_whitespace = false;
484 string_literal_length = 0;
485
486 var dummy_token = Token{
487 .start = self.index,
488 .end = self.index,
489 .line_number = self.line_handler.line_number,
490 .id = .invalid,
491 };
492 string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
493 },
494 '+', '&', '|' => {
495 self.index += 1;
496 result.id = .operator;
497 self.at_start_of_line = false;
498 break;
499 },
500 '-' => {
501 if (context == .expect_operator) {
502 self.index += 1;
503 result.id = .operator;
504 self.at_start_of_line = false;
505 break;
506 } else {
507 state = .number_literal;
508 still_could_have_exponent = true;
509 exponent_index = null;
510 self.at_start_of_line = false;
511 }
512 },
513 '0'...'9', '~' => {
514 state = .number_literal;
515 still_could_have_exponent = true;
516 exponent_index = null;
517 self.at_start_of_line = false;
518 },
519 '#' => {
520 if (self.at_start_of_line) {
521 state = .preprocessor;
522 } else {
523 state = .literal;
524 }
525 self.at_start_of_line = false;
526 },
527 ';' => {
528 state = .semicolon;
529 self.at_start_of_line = false;
530 },
531 '{', '}' => {
532 self.index += 1;
533 result.id = if (c == '{') .begin else .end;
534 self.at_start_of_line = false;
535 break;
536 },
537 '(', ')' => {
538 self.index += 1;
539 result.id = if (c == '(') .open_paren else .close_paren;
540 self.at_start_of_line = false;
541 break;
542 },
543 ',' => {
544 self.index += 1;
545 result.id = .comma;
546 self.at_start_of_line = false;
547 break;
548 },
549 else => {
550 if (isNonAsciiDigit(c)) {
551 self.error_context_token = .{
552 .id = .number,
553 .start = result.start,
554 .end = self.index + 1,
555 .line_number = self.line_handler.line_number,
556 };
557 return error.InvalidDigitCharacterInNumberLiteral;
558 }
559 state = .literal;
560 self.at_start_of_line = false;
561 },
562 },
563 .preprocessor => switch (c) {
564 '\r', '\n' => {
565 try self.evaluatePreprocessorCommand(result.start, self.index);
566 result.start = self.index + 1;
567 state = .start;
568 result.line_number = self.incrementLineNumber();
569 },
570 else => {},
571 },
                // A semicolon acts as a line terminator: everything after it is
                // skipped until the next line.
574 .semicolon => switch (c) {
575 '\r', '\n' => {
576 result.start = self.index + 1;
577 state = .start;
578 result.line_number = self.incrementLineNumber();
579 },
580 else => {},
581 },
582 .number_literal => switch (c) {
583 // zig fmt: off
584 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
585 '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
586 '\'', ';', '=',
587 => {
588 // zig fmt: on
589 result.id = .number;
590 break;
591 },
592 '0'...'9' => {
593 if (exponent_index) |exp_i| {
594 if (self.index - 1 == exp_i) {
595 // Note: This being an error is a quirk of the preprocessor used by
596 // the Win32 RC compiler.
597 self.error_context_token = .{
598 .id = .number,
599 .start = result.start,
600 .end = self.index + 1,
601 .line_number = self.line_handler.line_number,
602 };
603 return error.InvalidNumberWithExponent;
604 }
605 }
606 },
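                    // Derived from the exponent checks in this state: `1e1` is an
                    // error (a digit directly following the exponent 'e'), while a
                    // hex-style literal like `0x1e1` is accepted because the 'x'
                    // clears `still_could_have_exponent` before the 'e' is reached.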
607 'e', 'E' => {
608 if (still_could_have_exponent) {
609 exponent_index = self.index;
610 still_could_have_exponent = false;
611 }
612 },
613 else => {
614 if (isNonAsciiDigit(c)) {
615 self.error_context_token = .{
616 .id = .number,
617 .start = result.start,
618 .end = self.index + 1,
619 .line_number = self.line_handler.line_number,
620 };
621 return error.InvalidDigitCharacterInNumberLiteral;
622 }
623 still_could_have_exponent = false;
624 },
625 },
626 .literal_or_quoted_wide_string => switch (c) {
627 // zig fmt: off
628 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
629 '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
630 '\'', ';', '=',
631 // zig fmt: on
632 => {
633 result.id = .literal;
634 break;
635 },
636 '"' => {
637 state = .quoted_wide_string;
638 string_literal_collapsing_whitespace = false;
639 string_literal_length = 0;
640
641 var dummy_token = Token{
642 .start = self.index,
643 .end = self.index,
644 .line_number = self.line_handler.line_number,
645 .id = .invalid,
646 };
647 string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
648 },
649 else => {
650 state = .literal;
651 },
652 },
653 .literal => switch (c) {
654 // zig fmt: off
655 ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
656 '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
657 '\'', ';', '=',
658 => {
659 // zig fmt: on
660 result.id = .literal;
661 break;
662 },
663 else => {},
664 },
665 .e => switch (c) {
666 'N', 'n' => {
667 state = .en;
668 },
669 else => {
670 state = .literal;
671 self.index -= 1;
672 },
673 },
674 .en => switch (c) {
675 'D', 'd' => {
676 result.id = .end;
677 self.index += 1;
678 break;
679 },
680 else => {
681 state = .literal;
682 self.index -= 1;
683 },
684 },
685 .b => switch (c) {
686 'E', 'e' => {
687 state = .be;
688 },
689 else => {
690 state = .literal;
691 self.index -= 1;
692 },
693 },
694 .be => switch (c) {
695 'G', 'g' => {
696 state = .beg;
697 },
698 else => {
699 state = .literal;
700 self.index -= 1;
701 },
702 },
703 .beg => switch (c) {
704 'I', 'i' => {
705 state = .begi;
706 },
707 else => {
708 state = .literal;
709 self.index -= 1;
710 },
711 },
712 .begi => switch (c) {
713 'N', 'n' => {
714 result.id = .begin;
715 self.index += 1;
716 break;
717 },
718 else => {
719 state = .literal;
720 self.index -= 1;
721 },
722 },
723 .quoted_ascii_string, .quoted_wide_string => switch (c) {
724 '"' => {
725 string_literal_column += 1;
726 state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
727 },
728 '\\' => {
729 string_literal_length += 1;
730 string_literal_column += 1;
731 state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
732 },
733 '\r' => {
734 string_literal_column = 0;
735 // \r doesn't count towards string literal length
736
737 // Increment line number but don't affect the result token's line number
738 _ = self.incrementLineNumber();
739 },
740 '\n' => {
741 string_literal_column = 0;
742 // first \n expands to <space><\n>
743 if (!string_literal_collapsing_whitespace) {
744 string_literal_length += 2;
745 string_literal_collapsing_whitespace = true;
746 }
747 // the rest are collapsed into the <space><\n>
748
749 // Increment line number but don't affect the result token's line number
750 _ = self.incrementLineNumber();
751 },
752 // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
753 '\t', ' ', '\x0b', '\x0c' => {
754 if (!string_literal_collapsing_whitespace) {
755 // Literal tab characters are counted as the number of space characters
756 // needed to reach the next 8-column tab stop.
757 const width = columnWidth(string_literal_column, @intCast(c), 8);
758 string_literal_length += width;
759 string_literal_column += width;
760 }
761 },
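                    // Example of the length bookkeeping above: string contents `ab<\n>   cd`
                    // contribute 2 (ab) + 2 (the first newline becomes <space><newline>)
                    // + 0 (the following whitespace is collapsed) + 2 (cd) = 6
                    // to string_literal_length.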
762 else => {
763 string_literal_collapsing_whitespace = false;
764 string_literal_length += 1;
765 string_literal_column += 1;
766 },
767 },
768 .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
769 '"' => {
770 self.error_context_token = .{
771 .id = .invalid,
772 .start = self.index - 1,
773 .end = self.index + 1,
774 .line_number = self.line_handler.line_number,
775 };
776 return error.FoundCStyleEscapedQuote;
777 },
778 else => {
779 string_literal_length += 1;
780 string_literal_column += 1;
781 state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
782 },
783 },
784 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
785 '"' => {
786 state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
787 // Escaped quotes count as 1 char for string literal length checks.
788 // Since we did not increment on the first " (because it could have been
789 // the end of the quoted string), we increment here
790 string_literal_length += 1;
791 string_literal_column += 1;
792 },
793 else => {
794 result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
795 break;
796 },
797 },
798 }
799 } else { // got EOF
800 switch (state) {
801 .start => {},
802 .semicolon => {
803 // Skip past everything up to the EOF
804 result.start = self.index;
805 },
806 .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
807 result.id = .literal;
808 },
809 .preprocessor => {
810 try self.evaluatePreprocessorCommand(result.start, self.index);
811 result.start = self.index;
812 },
813 .number_literal => {
814 result.id = .number;
815 },
816 .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
817 result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
818 },
819 .quoted_ascii_string,
820 .quoted_wide_string,
821 .quoted_ascii_string_escape,
822 .quoted_wide_string_escape,
823 => {
824 self.error_context_token = .{
825 .id = .eof,
826 .start = self.index,
827 .end = self.index,
828 .line_number = self.line_handler.line_number,
829 };
830 return LexError.UnfinishedStringLiteral;
831 },
832 }
833 }
834
835 result.end = self.index;
836
837 if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
838 if (string_literal_length > self.max_string_literal_codepoints) {
839 self.error_context_token = result;
840 return LexError.StringLiteralTooLong;
841 }
842 }
843
844 // EOF tokens must have their start index match the end index
845 std.debug.assert(result.id != .eof or result.start == result.end);
846
847 return result;
848 }
849
850 /// Increments line_number appropriately (handling line ending pairs)
851 /// and returns the new line number.
852 fn incrementLineNumber(self: *Self) usize {
853 _ = self.line_handler.incrementLineNumber(self.index);
854 self.at_start_of_line = true;
855 return self.line_handler.line_number;
856 }
857
858 fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
859 const err = switch (codepoint.value) {
860 // 0x00 = NUL
861 // 0x1A = Substitute (treated as EOF)
862 // NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
863 // is run through the clang preprocessor it will no longer have 0x1A characters in it.
864 // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
865 0x00, 0x1A, 0x7F => error.IllegalByte,
866 // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
867 // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
868 0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
869 // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
870 // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
871 // outside of string literals. Not exactly sure why this would be the case, though.
872 // TODO: Make sure there aren't any exceptions
873 '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
874 // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
875 // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
876 // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
877 // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
878 // being 'missing' when included in a string literal (the Windows RC compiler acts as
879 // if the codepoint was never part of the string literal).
880 '\u{FEFF}' => error.IllegalByteOrderMark,
            // Similar deal with this private use codepoint: it gets skipped/ignored by the
            // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
            // enough of a footgun with no real use-cases that it's still worth erroring instead of
            // emulating the RC compiler's behavior, though.
885 '\u{E000}' => error.IllegalPrivateUseCharacter,
886 // These codepoints lead to strange errors when used outside of string literals,
887 // and miscompilations when used within string literals. We avoid the miscompilation
888 // within string literals and emit a warning, but outside of string literals it makes
889 // more sense to just disallow these codepoints.
890 0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
891 else => return,
892 };
893 self.error_context_token = .{
894 .id = .invalid,
895 .start = self.index,
896 .end = self.index + codepoint.byte_len,
897 .line_number = self.line_handler.line_number,
898 };
899 return err;
900 }
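
    // A minimal check of the rule above: `@` is an error outside of string
    // literals but is allowed inside them.
    test "@ outside vs. inside string literals" {
        var outside = Lexer.init("@", .{});
        try std.testing.expectError(error.IllegalByteOutsideStringLiterals, outside.nextNormal());

        var inside = Lexer.init("\"@\"", .{});
        const token = try inside.nextNormal();
        try std.testing.expectEqual(Token.Id.quoted_ascii_string, token.id);
    }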
901
902 fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
903 const token = Token{
904 .id = .preprocessor_command,
905 .start = start,
906 .end = end,
907 .line_number = self.line_handler.line_number,
908 };
909 errdefer self.error_context_token = token;
910 const full_command = self.buffer[start..end];
911
912 const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
913 error.NotPragma, error.NotCodePagePragma => return,
914 else => |e| return e,
915 }) orelse self.default_code_page;
916
917 // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
918 // > This pragma is not supported in an included resource file (.rc)
919 //
920 // Even though the Win32 behavior is to just ignore such directives silently,
921 // this is an error in the lexer to allow for emitting warnings/errors when
922 // such directives are found if that's wanted. The intention is for the lexer
923 // to still be able to work correctly after this error is returned.
924 if (self.source_mappings) |source_mappings| {
925 if (!source_mappings.isRootFile(token.line_number)) {
926 return error.CodePagePragmaInIncludedFile;
927 }
928 }
929
930 self.seen_pragma_code_pages +|= 1;
931 self.last_pragma_code_page_token = token;
932 self.current_code_page = code_page;
933 }
934
935 pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
936 const err = switch (lex_err) {
937 error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
938 error.StringLiteralTooLong => return .{
939 .err = .string_literal_too_long,
940 .code_page = self.current_code_page,
941 .token = self.error_context_token.?,
942 .extra = .{ .number = self.max_string_literal_codepoints },
943 },
944 error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
945 error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
946 error.IllegalByte => ErrorDetails.Error.illegal_byte,
947 error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
948 error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
949 error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
950 error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
951 error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
952 error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
953 error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
954 error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
955 error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
956 error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
957 error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
958 error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
959 };
960 return .{
961 .err = err,
962 .code_page = self.current_code_page,
963 .token = self.error_context_token.?,
964 };
965 }
966};
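
// A minimal sketch of the `#pragma code_page` handling in `evaluatePreprocessorCommand`:
// a valid pragma switches the code page used for the rest of the lex
// (this assumes 65001 maps to the supported UTF-8 code page).
test "code_page pragma switches the current code page" {
    var lexer = Lexer.init("#pragma code_page(65001)\n1", .{});
    const token = try lexer.nextNormal();
    try std.testing.expectEqual(Token.Id.number, token.id);
    try std.testing.expectEqual(SupportedCodePage.utf8, lexer.current_code_page);
    try std.testing.expectEqual(@as(u2, 1), lexer.seen_pragma_code_pages);
}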
967
968fn parseCodePageNum(str: []const u8) !u32 {
969 var x: u32 = 0;
970 for (str) |c| {
971 const digit = try std.fmt.charToDigit(c, 10);
972 if (x != 0) x = try std.math.mul(u32, x, 10);
973 x = try std.math.add(u32, x, digit);
974 }
975 return x;
976}
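
// Quick check of the hand-rolled digit parsing above.
test "parseCodePageNum" {
    try std.testing.expectEqual(@as(u32, 1252), try parseCodePageNum("1252"));
    // Leading zeroes do not change the parsed value; `parsePragmaCodePage` is what
    // rejects code pages written with leading zeroes.
    try std.testing.expectEqual(@as(u32, 1252), try parseCodePageNum("01252"));
    try std.testing.expectError(error.InvalidCharacter, parseCodePageNum("12a52"));
}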
977
978/// Returns `null` when the code_page is set to DEFAULT
979pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
980 var command = full_command;
981
982 // Anything besides exactly this is ignored by the Windows RC implementation
983 const expected_directive = "#pragma";
984 if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
985 command = command[expected_directive.len..];
986
987 if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
988 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
989 command = command[1..];
990 }
991
992 // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
993 // and it will error with 'Missing left parenthesis in code_page #pragma'
994 const expected_extension = "code_page";
995 if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
996 command = command[expected_extension.len..];
997
998 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
999 command = command[1..];
1000 }
1001
1002 if (command.len == 0 or command[0] != '(') {
1003 return error.CodePagePragmaMissingLeftParen;
1004 }
1005 command = command[1..];
1006
1007 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
1008 command = command[1..];
1009 }
1010
1011 var num_str: []u8 = command[0..0];
1012 while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
1013 command = command[1..];
1014 num_str.len += 1;
1015 }
1016
1017 if (num_str.len == 0) {
1018 return error.CodePagePragmaNotInteger;
1019 }
1020
1021 while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
1022 command = command[1..];
1023 }
1024
1025 if (command.len == 0 or command[0] != ')') {
1026 return error.CodePagePragmaMissingRightParen;
1027 }
1028
1029 const code_page: ?SupportedCodePage = code_page: {
1030 if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
1031 break :code_page null;
1032 }
1033
1034 // The Win32 compiler behaves fairly strangely around maxInt(u32):
1035 // - If the overflowed u32 wraps and becomes a known code page ID, then
1036 // it will error/warn with "Codepage not valid: ignored" (depending on /w)
1037 // - If the overflowed u32 wraps and does not become a known code page ID,
1038 // then it will error with 'constant too big' and 'Codepage not integer'
1039 //
1040 // Instead of that, we just have a separate error specifically for overflow.
1041 const num = parseCodePageNum(num_str) catch |err| switch (err) {
1042 error.InvalidCharacter => return error.CodePagePragmaNotInteger,
1043 error.Overflow => return error.CodePagePragmaOverflow,
1044 };
1045
1046 // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
1047 if (num_str[0] == '0' and num != 0) {
1048 return error.CodePagePragmaInvalidCodePage;
1049 }
1050 // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
1051 else if (num == 0) {
1052 return error.CodePagePragmaNotInteger;
1053 }
1054 // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
1055 if (num > std.math.maxInt(u16)) {
1056 return error.CodePagePragmaInvalidCodePage;
1057 }
1058
1059 break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) {
1060 error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage,
1061 error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage,
1062 };
1063 };
1064
1065 return code_page;
1066}
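
// A few spot-checks of `parsePragmaCodePage` as implemented above.
test "parsePragmaCodePage" {
    // DEFAULT resolves to null (meaning: use the default code page).
    try std.testing.expectEqual(@as(?SupportedCodePage, null), try parsePragmaCodePage("#pragma code_page(DEFAULT)"));
    // Anything that isn't exactly `#pragma` is rejected with an error the caller can ignore.
    try std.testing.expectError(error.NotPragma, parsePragmaCodePage("#define FOO 1"));
    // Missing parentheses and leading zeroes mirror the Win32 RC behavior described above.
    try std.testing.expectError(error.CodePagePragmaMissingLeftParen, parsePragmaCodePage("#pragma code_page 1252"));
    try std.testing.expectError(error.CodePagePragmaInvalidCodePage, parsePragmaCodePage("#pragma code_page(01252)"));
}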
1067
1068fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void {
1069 var lexer = Lexer.init(source, .{});
1070 if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer});
1071 for (expected_tokens) |expected_token_id| {
1072 const token = try lexer.nextNormal();
1073 if (dumpTokensDuringTests) lexer.dump(&token);
1074 try std.testing.expectEqual(expected_token_id, token.id);
1075 }
1076 const last_token = try lexer.nextNormal();
1077 try std.testing.expectEqual(Token.Id.eof, last_token.id);
1078}
1079
1080fn expectLexError(expected: LexError, actual: anytype) !void {
1081 try std.testing.expectError(expected, actual);
1082 if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual});
1083}
1084
1085test "normal: numbers" {
1086 try testLexNormal("1", &.{.number});
1087 try testLexNormal("-1", &.{.number});
1088 try testLexNormal("- 1", &.{ .number, .number });
1089 try testLexNormal("-a", &.{.number});
1090}
1091
1092test "normal: string literals" {
1093 try testLexNormal("\"\"", &.{.quoted_ascii_string});
1094 // "" is an escaped "
1095 try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string});
1096}
1097
1098test "superscript chars and code pages" {
1099 const firstToken = struct {
1100 pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token {
1101 var lexer = Lexer.init(source, .{ .default_code_page = default_code_page });
1102 return lexer.next(lex_method);
1103 }
1104 }.firstToken;
1105 const utf8_source = "²";
1106 const windows1252_source = "\xB2";
1107
1108 const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal);
1109 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252);
1110
1111 const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal);
1112 try std.testing.expectEqual(Token{
1113 .id = .literal,
1114 .start = 0,
1115 .end = 2,
1116 .line_number = 1,
1117 }, utf8_encoded_as_windows1252);
1118
1119 const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal);
1120 try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8);
1121
1122 const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal);
1123 try std.testing.expectEqual(Token{
1124 .id = .literal,
1125 .start = 0,
1126 .end = 1,
1127 .line_number = 1,
1128 }, windows1252_encoded_as_utf8);
1129}