master
1const std = @import("std");
2const code_pages = @import("code_pages.zig");
3const SupportedCodePage = code_pages.SupportedCodePage;
4const windows1252 = @import("windows1252.zig");
5const ErrorDetails = @import("errors.zig").ErrorDetails;
6const DiagnosticsContext = @import("errors.zig").DiagnosticsContext;
7const Token = @import("lex.zig").Token;
8
/// Reports whether `str` may be used as a number literal for a data value.
///
/// rc is maximally liberal about this: only the first byte is inspected, and
/// anything that begins with a decimal digit, `-`, or `~` is accepted. An
/// empty string is rejected.
pub fn isValidNumberDataLiteral(str: []const u8) bool {
    if (str.len == 0) return false;
    const first = str[0];
    return first == '~' or first == '-' or std.ascii.isDigit(first);
}
18
/// A run of source bytes paired with the code page used to decode it.
pub const SourceBytes = struct {
    /// The raw bytes. For the string parsing functions in this file this is
    /// the whole literal, including the surrounding quotes and any `L` prefix.
    slice: []const u8,
    /// The code page that `slice` is encoded in.
    code_page: SupportedCodePage,
};
23
/// The two kinds of quoted string literal: `"..."` and `L"..."`.
pub const StringType = enum {
    /// A narrow string literal with no prefix.
    ascii,
    /// A wide string literal, prefixed with `L` or `l`.
    wide,
};
25
/// Iterator that decodes the contents of a quoted string literal one
/// codepoint at a time, applying the escape rules described below.
///
/// Valid escapes:
/// "" -> "
/// \a, \A => 0x08 (not 0x07 like in C)
/// \n => 0x0A
/// \r => 0x0D
/// \t, \T => 0x09
/// \\ => \
/// \nnn => byte with numeric value given by nnn interpreted as octal
/// (wraps on overflow, number of digits can be 1-3 for ASCII strings
/// and 1-7 for wide strings)
/// \xhh => byte with numeric value given by hh interpreted as hex
/// (number of digits can be 0-2 for ASCII strings and 0-4 for
/// wide strings)
/// \<\r+> => \
/// \<[\r\n\t ]+> => <nothing>
///
/// Special cases:
/// <\t> => 1-8 spaces, dependent on columns in the source rc file itself
/// <\r> => <nothing>
/// <\n+><\w+?\n?> => <space><\n>
///
/// Special, especially weird case:
/// \"" => "
/// NOTE: This leads to footguns because the preprocessor can start parsing things
/// out-of-sync with the RC compiler, expanding macros within string literals, etc.
/// This parse function handles this case the same as the Windows RC compiler, but
/// \" within a string literal is treated as an error by the lexer, so the relevant
/// branches should never actually be hit during this function.
pub const IterativeStringParser = struct {
    /// The contents of the literal with the quotes (and `L` prefix, if any) removed.
    source: []const u8,
    /// Code page used to decode `source`; also attached to emitted diagnostics.
    code_page: SupportedCodePage,
    /// The type of the string inferred by the prefix (L"" or "")
    /// This is what matters for things like the maximum digits in an
    /// escape sequence, whether or not invalid escape sequences are skipped, etc.
    declared_string_type: StringType,
    /// A codepoint that must be returned by the next call before any more of
    /// `source` is consumed.
    pending_codepoint: ?u21 = null,
    /// Number of spaces still owed from a tab -> spaces expansion.
    num_pending_spaces: u8 = 0,
    /// Byte offset into `source` of the next codepoint to decode.
    index: usize = 0,
    /// Current source column, used to compute tab stops.
    column: usize = 0,
    /// Where warnings are reported; no warnings are emitted when this is null.
    diagnostics: ?DiagnosticsContext = null,
    /// Set once the "tab converted to spaces" warning has been emitted, so
    /// that it is only reported once per string.
    seen_tab: bool = false,

    const State = enum {
        normal,
        quote,
        newline,
        escaped,
        escaped_cr,
        escaped_newlines,
        escaped_octal,
        escaped_hex,
    };

    /// Creates a parser over `bytes.slice`, which must be the complete literal
    /// including its surrounding quotes and optional `L`/`l` prefix.
    /// The starting column is `options.start_column` advanced past the removed
    /// prefix characters.
    pub fn init(bytes: SourceBytes, options: StringParseOptions) IterativeStringParser {
        const declared_string_type: StringType = switch (bytes.slice[0]) {
            'L', 'l' => .wide,
            else => .ascii,
        };
        var source = bytes.slice[1 .. bytes.slice.len - 1]; // remove ""
        var column = options.start_column + 1; // for the removed "
        if (declared_string_type == .wide) {
            source = source[1..]; // remove L
            column += 1; // for the removed L
        }
        return .{
            .source = source,
            .code_page = bytes.code_page,
            .declared_string_type = declared_string_type,
            .column = column,
            .diagnostics = options.diagnostics,
        };
    }

    pub const ParsedCodepoint = struct {
        codepoint: u21,
        /// Note: If this is true, `codepoint` will have an effective maximum value
        /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16.
        /// If the value needs to be truncated to a smaller integer (e.g. for ASCII string
        /// literals), then that must be done by the caller.
        from_escaped_integer: bool = false,
        /// Denotes that the codepoint is:
        /// - Escaped (has a \ in front of it), and
        /// - Has a value >= U+10000, meaning it would be encoded as a surrogate
        ///   pair in UTF-16, and
        /// - Is part of a wide string literal
        ///
        /// Normally in wide string literals, invalid escapes are omitted
        /// during parsing (the codepoints are not returned at all during
        /// the `next` call), but this is a special case in which the
        /// escape only applies to the high surrogate pair of the codepoint.
        ///
        /// TODO: Maybe just return the low surrogate codepoint by itself in this case.
        escaped_surrogate_pair: bool = false,
    };

    /// Returns the next parsed codepoint, or null once the string is exhausted.
    ///
    /// Behaves like `nextUnchecked`, but when `diagnostics` is set it also
    /// appends `rc_would_miscompile_codepoint_*` warnings for certain codepoints
    /// that did not come from an escaped integer.
    pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        const result = try self.nextUnchecked();
        const parsed = result orelse return null;
        // Values produced by \nnn / \xhh escapes are never warned about.
        if (parsed.from_escaped_integer) return result;

        if (self.diagnostics) |*ctx| {
            switch (parsed.codepoint) {
                0x0900, 0x0A00, 0x0A0D, 0x2000, 0x0D00 => {
                    const err: ErrorDetails.Error = if (parsed.codepoint == 0x0D00)
                        .rc_would_miscompile_codepoint_skip
                    else
                        .rc_would_miscompile_codepoint_whitespace;
                    try ctx.diagnostics.append(ErrorDetails{
                        .err = err,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = ctx.token,
                        .extra = .{ .number = parsed.codepoint },
                    });
                },
                0xFFFE, 0xFFFF => {
                    try ctx.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = ctx.token,
                        .extra = .{ .number = parsed.codepoint },
                    });
                    try ctx.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .note,
                        .code_page = self.code_page,
                        .token = ctx.token,
                        .print_source_line = false,
                        .extra = .{ .number = parsed.codepoint },
                    });
                },
                else => {},
            }
        }
        return result;
    }

    /// Emits the "tab converted to spaces" warning plus its accompanying note.
    /// Does nothing when there is no diagnostics context or when the warning
    /// has already been emitted for this string.
    fn warnTabConvertedToSpaces(self: *IterativeStringParser) std.mem.Allocator.Error!void {
        if (self.seen_tab) return;
        if (self.diagnostics) |*ctx| {
            try ctx.diagnostics.append(ErrorDetails{
                .err = .tab_converted_to_spaces,
                .type = .warning,
                .code_page = self.code_page,
                .token = ctx.token,
            });
            try ctx.diagnostics.append(ErrorDetails{
                .err = .tab_converted_to_spaces,
                .type = .note,
                .code_page = self.code_page,
                .token = ctx.token,
                .print_source_line = false,
            });
            self.seen_tab = true;
        }
    }

    /// Returns the next parsed codepoint, or null once the string is exhausted,
    /// without emitting any of the per-codepoint warnings that `next` adds.
    /// The "tab converted to spaces" warning is still emitted from here when a
    /// diagnostics context is present.
    pub fn nextUnchecked(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        if (self.num_pending_spaces > 0) {
            // Ensure that we don't get into this predicament so we can ensure that
            // the order of processing any pending stuff doesn't matter
            std.debug.assert(self.pending_codepoint == null);
            self.num_pending_spaces -= 1;
            return .{ .codepoint = ' ' };
        }
        if (self.pending_codepoint) |pending_codepoint| {
            self.pending_codepoint = null;
            return .{ .codepoint = pending_codepoint };
        }
        if (self.index >= self.source.len) return null;

        var state: State = .normal;
        // Accumulated value and digit count of the \nnn or \xhh escape in progress.
        var string_escape_n: u16 = 0;
        var string_escape_i: u8 = 0;
        const max_octal_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 3,
            .wide => 7,
        };
        const max_hex_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 2,
            .wide => 4,
        };

        // When set during an iteration, the current codepoint is left unconsumed
        // (neither `index` nor `column` is advanced for it). It is reset at the
        // top of every iteration.
        var backtrack = false;
        while (self.code_page.codepointAt(self.index, self.source)) |codepoint| : ({
            if (!backtrack) self.index += codepoint.byte_len;
        }) {
            backtrack = false;
            const c = codepoint.value;
            // Runs on every exit from this iteration (including `return`), so the
            // column stays in sync with whatever was consumed.
            defer {
                if (!backtrack) {
                    if (c == '\t') {
                        self.column += columnsUntilTabStop(self.column, 8);
                    } else {
                        self.column += codepoint.byte_len;
                    }
                }
            }
            switch (state) {
                .normal => switch (c) {
                    '\\' => state = .escaped,
                    '"' => state = .quote,
                    '\r' => {},
                    '\n' => state = .newline,
                    '\t' => {
                        // Only warn about a tab getting converted to spaces once per string
                        try self.warnTabConvertedToSpaces();
                        const cols = columnsUntilTabStop(self.column, 8);
                        // One space is returned right now, the rest are queued up.
                        self.num_pending_spaces = @intCast(cols - 1);
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = ' ' };
                    },
                    else => {
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = c };
                    },
                },
                .quote => switch (c) {
                    '"' => {
                        // "" => "
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = '"' };
                    },
                    else => unreachable, // this is a bug in the lexer
                },
                .newline => switch (c) {
                    '\r', ' ', '\t', '\n', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // <space><newline>
                        self.pending_codepoint = '\n';
                        return .{ .codepoint = ' ' };
                    },
                },
                .escaped => switch (c) {
                    '\r' => state = .escaped_cr,
                    '\n' => state = .escaped_newlines,
                    '0'...'7' => {
                        string_escape_n = std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i = 1;
                        state = .escaped_octal;
                    },
                    'x', 'X' => {
                        string_escape_n = 0;
                        string_escape_i = 0;
                        state = .escaped_hex;
                    },
                    else => {
                        // Escapes that map directly to a single codepoint.
                        const simple_escape: ?u21 = switch (c) {
                            // might be a bug in RC, but matches its behavior
                            'a', 'A' => '\x08',
                            'n' => '\n',
                            'r' => '\r',
                            't', 'T' => '\t',
                            '\\' => '\\',
                            else => null,
                        };
                        if (simple_escape) |value| {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = value };
                        }

                        if (c == '"') {
                            // \" is a special case that doesn't get the \ included,
                            backtrack = true;
                        } else switch (self.declared_string_type) {
                            .wide => {
                                // All invalid escape sequences are skipped in wide strings,
                                // but there is a special case around \<tab> where the \
                                // is skipped but the tab character is processed.
                                // It's actually a bit weirder than that, though, since
                                // the preprocessor is the one that does the <tab> -> spaces
                                // conversion, so it goes something like this:
                                //
                                // Before preprocessing: L"\<tab>"
                                // After preprocessing: L"\ "
                                //
                                // So the parser only sees an escaped space character followed
                                // by some other number of spaces >= 0.
                                //
                                // However, our preprocessor keeps tab characters intact, so we emulate
                                // the above behavior by skipping the \ and then outputting one less
                                // space than normal for the <tab> character.
                                if (c == '\t') {
                                    // Only warn about a tab getting converted to spaces once per string
                                    try self.warnTabConvertedToSpaces();

                                    const cols = columnsUntilTabStop(self.column, 8);
                                    // If the tab character would only be converted to a single space,
                                    // then we can just skip both the \ and the <tab> and move on.
                                    if (cols > 1) {
                                        self.num_pending_spaces = @intCast(cols - 2);
                                        self.index += codepoint.byte_len;
                                        return .{ .codepoint = ' ' };
                                    }
                                }
                                // There's a second special case when the codepoint would be encoded
                                // as a surrogate pair in UTF-16, as the escape 'applies' to the
                                // high surrogate pair only in this instance. This is a side-effect
                                // of the Win32 RC compiler preprocessor outputting UTF-16 and the
                                // compiler itself seemingly working on code units instead of code points
                                // in this particular instance.
                                //
                                // We emulate this behavior by emitting the codepoint, but with a marker
                                // that indicates that it needs to be handled specially.
                                if (c >= 0x10000 and c != code_pages.Codepoint.invalid) {
                                    self.index += codepoint.byte_len;
                                    return .{ .codepoint = c, .escaped_surrogate_pair = true };
                                }
                            },
                            .ascii => {
                                // we intentionally avoid incrementing self.index
                                // to handle the current char in the next call,
                                // and we set backtrack so column count is handled correctly
                                backtrack = true;
                                return .{ .codepoint = '\\' };
                            },
                        }
                        state = .normal;
                    },
                },
                .escaped_cr => switch (c) {
                    '\r' => {},
                    '\n' => state = .escaped_newlines,
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;
                        return .{ .codepoint = '\\' };
                    },
                },
                .escaped_newlines => switch (c) {
                    '\r', '\n', '\t', ' ', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // backtrack so that we handle the current char properly
                        backtrack = true;
                        state = .normal;
                    },
                },
                .escaped_octal => switch (c) {
                    '0'...'7' => {
                        // Note: We use wrapping arithmetic on a u16 here since there's been no observed
                        // string parsing scenario where an escaped integer with a value >= the u16
                        // max is interpreted as anything but the truncated u16 value.
                        string_escape_n *%= 8;
                        string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i += 1;
                        if (string_escape_i == max_octal_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                    },
                },
                .escaped_hex => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        // Checked arithmetic is sufficient here: at most 4 hex digits
                        // are ever accumulated, and 4 hex digits always fit in a u16.
                        string_escape_n *= 16;
                        string_escape_n += std.fmt.charToDigit(@intCast(c), 16) catch unreachable;
                        string_escape_i += 1;
                        if (string_escape_i == max_hex_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        // (even with 0 actual digits, \x alone parses to 0)
                        const escaped_value = string_escape_n;
                        return .{ .codepoint = escaped_value, .from_escaped_integer = true };
                    },
                },
            }
        }

        // The source ended in the middle of a sequence; flush whatever the
        // current state still owes.
        switch (state) {
            .normal, .escaped_newlines => {},
            .newline => {
                // <space><newline>
                self.pending_codepoint = '\n';
                return .{ .codepoint = ' ' };
            },
            .escaped, .escaped_cr => return .{ .codepoint = '\\' },
            .escaped_octal, .escaped_hex => {
                return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
            },
            .quote => unreachable, // this is a bug in the lexer
        }

        return null;
    }
};
453
/// Options shared by the string parsing functions in this file.
pub const StringParseOptions = struct {
    /// Column at which the literal starts in the source file; it seeds the
    /// parser's column counter, which is used for tab stop calculations.
    start_column: usize = 0,
    /// Where warnings are reported. No warnings are emitted when this is null.
    diagnostics: ?DiagnosticsContext = null,
    /// The code page that parsed output is encoded for.
    output_code_page: SupportedCodePage,
};
459
/// Parses a complete quoted string literal (`bytes.slice` includes the quotes
/// and any `L` prefix) into a newly allocated buffer.
///
/// - `.ascii` returns `[]u8`, encoded according to `options.output_code_page`.
/// - `.wide` returns a 0-terminated slice of little-endian UTF-16 code units,
///   and requires that the literal is actually declared as a wide string.
///
/// The caller owns the returned memory.
pub fn parseQuotedString(
    comptime literal_type: StringType,
    allocator: std.mem.Allocator,
    bytes: SourceBytes,
    options: StringParseOptions,
) !(switch (literal_type) {
    .ascii => []u8,
    .wide => [:0]u16,
}) {
    const Unit = switch (literal_type) {
        .ascii => u8,
        .wide => u16,
    };
    std.debug.assert(bytes.slice.len >= 2); // must at least have 2 double quote chars

    var out = try std.ArrayList(Unit).initCapacity(allocator, bytes.slice.len);
    errdefer out.deinit(allocator);

    var parser = IterativeStringParser.init(bytes, options);

    while (try parser.next()) |parsed| {
        const cp = parsed.codepoint;
        const is_invalid = cp == code_pages.Codepoint.invalid;

        switch (literal_type) {
            .ascii => switch (options.output_code_page) {
                .windows1252 => {
                    // Escaped integers are written as-is, truncated to a byte.
                    if (parsed.from_escaped_integer) {
                        try out.append(allocator, @truncate(cp));
                        continue;
                    }
                    if (windows1252.bestFitFromCodepoint(cp)) |best_fit| {
                        try out.append(allocator, best_fit);
                        continue;
                    }
                    // No representation: one '?' per UTF-16 code unit the
                    // codepoint would have needed.
                    if (cp < 0x10000 or is_invalid) {
                        try out.append(allocator, '?');
                    } else {
                        try out.appendSlice(allocator, "??");
                    }
                },
                .utf8 => {
                    const use_replacement = is_invalid or
                        (parsed.from_escaped_integer and @as(Unit, @truncate(cp)) > 0x7F);
                    const codepoint_to_encode: u21 = if (use_replacement)
                        '�'
                    else if (parsed.from_escaped_integer)
                        @as(Unit, @truncate(cp))
                    else
                        cp;

                    var utf8_buf: [4]u8 = undefined;
                    const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
                    try out.appendSlice(allocator, utf8_buf[0..utf8_len]);
                },
            },
            .wide => {
                // Parsing any string type as a wide string is handled separately, see parseQuotedStringAsWideString
                std.debug.assert(parser.declared_string_type == .wide);

                if (parsed.from_escaped_integer) {
                    try out.append(allocator, std.mem.nativeToLittle(u16, @truncate(cp)));
                    continue;
                }
                if (is_invalid) {
                    try out.append(allocator, std.mem.nativeToLittle(u16, '�'));
                    continue;
                }
                if (cp < 0x10000) {
                    const code_unit: u16 = @intCast(cp);
                    try out.append(allocator, std.mem.nativeToLittle(u16, code_unit));
                    continue;
                }
                // Needs a surrogate pair; an escaped one only gets its low half written.
                if (!parsed.escaped_surrogate_pair) {
                    const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
                    try out.append(allocator, std.mem.nativeToLittle(u16, high));
                }
                const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
                try out.append(allocator, std.mem.nativeToLittle(u16, low));
            },
        }
    }

    switch (literal_type) {
        .ascii => return out.toOwnedSlice(allocator),
        .wide => return out.toOwnedSliceSentinel(allocator, 0),
    }
}
534
/// Parses a quoted string literal as an ASCII (narrow) string via
/// `parseQuotedString`. `bytes.slice` must contain at least the two
/// surrounding quotes. The caller owns the returned slice.
pub fn parseQuotedAsciiString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![]u8 {
    const min_len = 2; // ""
    std.debug.assert(bytes.slice.len >= min_len);
    return parseQuotedString(.ascii, allocator, bytes, options);
}
539
/// Parses an `L"..."` literal as a wide string via `parseQuotedString`.
/// `bytes.slice` must contain at least the `L` prefix and the two surrounding
/// quotes. The caller owns the returned 0-terminated slice.
pub fn parseQuotedWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    const min_len = 3; // L""
    std.debug.assert(bytes.slice.len >= min_len);
    return parseQuotedString(.wide, allocator, bytes, options);
}
544
/// Parses any string type into a wide string.
/// If the string is declared as a wide string (L""), then it is handled normally.
/// Otherwise, things are fairly normal with the exception of escaped integers.
/// Escaped integers are handled by:
/// - Truncating the escape to a u8
/// - Reinterpreting the u8 as a byte from the *output* code page
/// - Outputting the codepoint that corresponds to the interpreted byte, or � if no such
///   interpretation is possible
/// For example, if the code page is UTF-8, then while \x80 is a valid start byte, it's
/// interpreted as a single byte, so it ends up being seen as invalid and � is outputted.
/// If the code page is Windows-1252, then \x80 is interpreted to be € which has the
/// codepoint U+20AC, so the UTF-16 encoding of U+20AC is outputted.
///
/// The caller owns the returned 0-terminated slice.
pub fn parseQuotedStringAsWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    std.debug.assert(bytes.slice.len >= 2); // ""

    switch (bytes.slice[0]) {
        'l', 'L' => return parseQuotedWideString(allocator, bytes, options),
        else => {},
    }

    // Note: We're only handling the case of parsing an ASCII string into a wide string from here on out.
    // TODO: The logic below is similar to that in AcceleratorKeyCodepointTranslator, might be worth merging the two

    var code_units = try std.ArrayList(u16).initCapacity(allocator, bytes.slice.len);
    errdefer code_units.deinit(allocator);

    var parser = IterativeStringParser.init(bytes, options);

    while (try parser.next()) |parsed| {
        const cp = parsed.codepoint;

        if (parsed.from_escaped_integer) {
            std.debug.assert(cp != code_pages.Codepoint.invalid);
            // The escape is treated as a single byte of the output code page.
            const byte_to_interpret: u8 = @truncate(cp);
            const code_unit_to_encode: u16 = switch (options.output_code_page) {
                .windows1252 => windows1252.toCodepoint(byte_to_interpret),
                .utf8 => if (byte_to_interpret > 0x7F) '�' else byte_to_interpret,
            };
            try code_units.append(allocator, std.mem.nativeToLittle(u16, code_unit_to_encode));
            continue;
        }
        if (cp == code_pages.Codepoint.invalid) {
            try code_units.append(allocator, std.mem.nativeToLittle(u16, '�'));
            continue;
        }
        if (cp < 0x10000) {
            const code_unit: u16 = @intCast(cp);
            try code_units.append(allocator, std.mem.nativeToLittle(u16, code_unit));
            continue;
        }

        // Needs a surrogate pair; an escaped one only gets its low half written.
        if (!parsed.escaped_surrogate_pair) {
            const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
            try code_units.append(allocator, std.mem.nativeToLittle(u16, high));
        }
        const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
        try code_units.append(allocator, std.mem.nativeToLittle(u16, low));
    }

    return code_units.toOwnedSliceSentinel(allocator, 0);
}
599
test "parse quoted ascii string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Every case is parsed with Windows-1252 as both the input and output code page.
    const Case = struct {
        source: []const u8,
        expected: []const u8,
    };
    const cases = [_]Case{
        .{ .source = "\"hello\"", .expected = "hello" },
        // hex with 0 digits
        .{ .source = "\"\\x\"", .expected = "\x00" },
        // hex max of 2 digits
        .{ .source = "\"\\XfFf\"", .expected = "\xFFf" },
        // octal with invalid octal digit
        .{ .source = "\"\\19\"", .expected = "\x019" },
        // escaped quotes
        .{ .source = "\" \"\" \"", .expected = " \" " },
        // backslash right before escaped quotes
        .{ .source = "\"\\\"\"\"", .expected = "\"" },
        // octal overflow
        .{ .source = "\"\\401\"", .expected = "\x01" },
        // escapes
        .{ .source = "\"\\a\\n\\r\\t\\\\\"", .expected = "\x08\n\r\t\\" },
        // uppercase escapes
        .{ .source = "\"\\A\\N\\R\\T\\\\\"", .expected = "\x08\\N\\R\t\\" },
        // backslash on its own
        .{ .source = "\"\\\"", .expected = "\\" },
        // unrecognized escapes
        .{ .source = "\"\\b\"", .expected = "\\b" },
        // escaped carriage returns
        .{ .source = "\"\\\r\r\r\r\r\"", .expected = "\\" },
        // escaped newlines
        .{ .source = "\"\\\n\n\n\n\n\"", .expected = "" },
        // escaped CRLF pairs
        .{ .source = "\"\\\r\n\r\n\r\n\r\n\r\n\"", .expected = "" },
        // escaped newlines with other whitespace
        .{ .source = "\"\\\n \t\r\n \r\t\n \t\"", .expected = "" },
        // literal tab characters get converted to spaces (dependent on source file columns)
        // NOTE(review): the number of spaces expected in the next four cases depends on
        // columnsUntilTabStop, which is not visible in this section; confirm that
        // the whitespace inside these literals is intact.
        .{ .source = "\"\t\"", .expected = " " },
        .{ .source = "\"abc\t\"", .expected = "abc " },
        .{ .source = "\"abcdefg\t\"", .expected = "abcdefg " },
        .{ .source = "\"\\\t\"", .expected = "\\ " },
        // literal CR's get dropped
        .{ .source = "\"\r\r\r\r\r\"", .expected = "" },
        // contiguous newlines and whitespace get collapsed to <space><newline>
        .{ .source = "\"\n\r\r \r\n \t \"", .expected = " \n" },
    };

    for (cases) |case| {
        const actual = try parseQuotedAsciiString(
            arena,
            .{ .slice = case.source, .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSlices(u8, case.expected, actual);
    }
}
761
test "parse quoted ascii string with utf8 code page" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Every case is decoded as UTF-8; only the output code page varies.
    const Case = struct {
        source: []const u8,
        output_code_page: SupportedCodePage,
        expected: []const u8,
    };
    const cases = [_]Case{
        .{ .source = "\"\"", .output_code_page = .windows1252, .expected = "" },
        // Codepoints that don't have a Windows-1252 representation get converted to ?
        .{ .source = "\"кириллица\"", .output_code_page = .windows1252, .expected = "?????????" },
        // Codepoints that have a best fit mapping get converted accordingly,
        // these are box drawing codepoints
        .{ .source = "\"┌─┐\"", .output_code_page = .windows1252, .expected = "\x2b\x2d\x2b" },
        // Invalid UTF-8 gets converted to ? depending on well-formedness
        .{ .source = "\"\xf0\xf0\x80\x80\x80\"", .output_code_page = .windows1252, .expected = "????" },
        // Codepoints that would require a UTF-16 surrogate pair get converted to ??
        .{ .source = "\"\xF2\xAF\xBA\xB4\"", .output_code_page = .windows1252, .expected = "??" },

        // Output code page changes how invalid UTF-8 gets converted, since it
        // now encodes the result as UTF-8 so it can write replacement characters.
        .{ .source = "\"\xf0\xf0\x80\x80\x80\"", .output_code_page = .utf8, .expected = "����" },
        .{ .source = "\"\xF2\xAF\xBA\xB4\"", .output_code_page = .utf8, .expected = "\xF2\xAF\xBA\xB4" },

        // This used to cause integer overflow when reconsuming the 4-byte long codepoint
        // after the escaped CRLF pair.
        .{ .source = "\"\\\r\n\u{10348}\"", .output_code_page = .utf8, .expected = "\u{10348}" },
    };

    for (cases) |case| {
        const actual = try parseQuotedAsciiString(
            arena,
            .{ .slice = case.source, .code_page = .utf8 },
            .{ .output_code_page = case.output_code_page },
        );
        try std.testing.expectEqualSlices(u8, case.expected, actual);
    }
}
819
test "parse quoted string with different input/output code pages" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Windows-1252 input, UTF-8 output: the literal 0x80 byte is decoded and
    // re-encoded, while escaped integers above 0x7F (after truncation to a
    // byte) come out as the replacement character.
    const source: SourceBytes = .{
        .slice = "\"\x80\\x8a\\600\\612\\540\\577\"",
        .code_page = .windows1252,
    };
    const actual = try parseQuotedAsciiString(arena, source, .{ .output_code_page = .utf8 });
    try std.testing.expectEqualSlices(u8, "€���\x60\x7F", actual);
}
831
test "parse quoted wide string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Parses `source` with Windows-1252 as both the input and output code page
    // and compares the result against `expected`.
    const expectWide = struct {
        fn check(allocator: std.mem.Allocator, expected: [:0]const u16, source: []const u8) !void {
            const actual = try parseQuotedWideString(
                allocator,
                .{ .slice = source, .code_page = .windows1252 },
                .{ .output_code_page = .windows1252 },
            );
            try std.testing.expectEqualSentinel(u16, 0, expected, actual);
        }
    }.check;

    try expectWide(arena, std.unicode.utf8ToUtf16LeStringLiteral("hello"), "L\"hello\"");
    // hex with 0 digits
    try expectWide(arena, &[_:0]u16{0x0}, "L\"\\x\"");
    // hex max of 4 digits
    try expectWide(arena, &[_:0]u16{
        std.mem.nativeToLittle(u16, 0xFFFF),
        std.mem.nativeToLittle(u16, 'f'),
    }, "L\"\\XfFfFf\"");
    // octal max of 7 digits
    try expectWide(arena, &[_:0]u16{
        std.mem.nativeToLittle(u16, 0x9493),
        std.mem.nativeToLittle(u16, '3'),
        std.mem.nativeToLittle(u16, '3'),
    }, "L\"\\111222333\"");
    // octal overflow
    try expectWide(arena, &[_:0]u16{std.mem.nativeToLittle(u16, 0xFF01)}, "L\"\\777401\"");
    // literal tab characters get converted to spaces (dependent on source file columns)
    // NOTE(review): the number of spaces expected here depends on columnsUntilTabStop,
    // which is not visible in this section; confirm that the whitespace inside
    // this literal is intact.
    try expectWide(arena, std.unicode.utf8ToUtf16LeStringLiteral("abcdefg "), "L\"abcdefg\t\"");
    // Windows-1252 conversion
    try expectWide(arena, std.unicode.utf8ToUtf16LeStringLiteral("ðð€€€"), "L\"\xf0\xf0\x80\x80\x80\"");
    // Invalid escape sequences are skipped
    try expectWide(arena, std.unicode.utf8ToUtf16LeStringLiteral(""), "L\"\\H\"");
}
900
test "parse quoted wide string with utf8 code page" {
    var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    // An empty literal produces an empty (sentinel-only) result.
    const empty = try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{}, empty);

    // Well-formed multi-byte UTF-8 comes out as the matching UTF-16 code units.
    const cyrillic = try parseQuotedWideString(
        arena,
        .{ .slice = "L\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), cyrillic);

    // Invalid UTF-8 gets converted to � depending on well-formedness
    // (here the 5 invalid input bytes become 4 replacement characters).
    const invalid_utf8 = try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("����"), invalid_utf8);
}
923
test "parse quoted ascii string as wide string" {
    var arena_state = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_state.deinit();
    const arena = arena_state.allocator();

    // A literal without the L prefix is still converted to UTF-16.
    const cyrillic = try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), cyrillic);

    // Whether or not invalid escapes are skipped is still determined by the L prefix:
    // without it the escape is kept verbatim...
    const narrow_invalid_escape = try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"\\H\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("\\H"), narrow_invalid_escape);

    // ...and with it the escape is dropped.
    const wide_invalid_escape = try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "L\"\\H\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral(""), wide_invalid_escape);

    // Maximum escape sequence value is also determined by the L prefix:
    // without it only two hex digits are consumed and the rest stay literal...
    const narrow_hex_expected = [_:0]u16{
        std.mem.nativeToLittle(u16, 0x12),
        std.mem.nativeToLittle(u16, '3'),
        std.mem.nativeToLittle(u16, '4'),
    };
    const narrow_hex = try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "\"\\x1234\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, &narrow_hex_expected, narrow_hex);

    // ...and with it all four hex digits form a single code unit.
    const wide_hex_expected = [_:0]u16{std.mem.nativeToLittle(u16, 0x1234)};
    const wide_hex = try parseQuotedStringAsWideString(
        arena,
        .{ .slice = "L\"\\x1234\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    );
    try std.testing.expectEqualSentinel(u16, 0, &wide_hex_expected, wide_hex);
}
957
/// Returns the number of columns between `column` and the next tab stop,
/// where tab stops occur every `tab_columns` columns.
///
/// A column that already sits on a tab stop yields a full `tab_columns`,
/// never zero. With `tab_columns` = 8:
///   0 => 8, 1 => 7, 2 => 6, 3 => 5, 4 => 4
///   5 => 3, 6 => 2, 7 => 1, 8 => 8
///
/// `tab_columns` must be non-zero (it is used as a modulo divisor).
pub fn columnsUntilTabStop(column: usize, tab_columns: usize) usize {
    const columns_past_stop = column % tab_columns;
    return tab_columns - columns_past_stop;
}
963
/// Returns how many columns the byte `c` occupies when it appears at
/// `cur_column`: a tab advances to the next tab stop (as computed by
/// `columnsUntilTabStop`), every other byte counts as a single column.
pub fn columnWidth(cur_column: usize, c: u8, tab_columns: usize) usize {
    if (c == '\t') return columnsUntilTabStop(cur_column, tab_columns);
    return 1;
}
970
/// A 32-bit number value together with a flag recording whether it is 'long'.
pub const Number = struct {
    /// The numeric value; all arithmetic on it wraps at 32 bits.
    value: u32,
    /// Whether the number is flagged as long.
    is_long: bool = false,

    /// Returns the low 16 bits of `value`, discarding the high bits.
    pub fn asWord(self: Number) u16 {
        const low_word: u16 = @truncate(self.value);
        return low_word;
    }

    /// Applies the binary operator `operator_char` (one of `-`, `+`, `|`, `&`)
    /// to `lhs` and `rhs`. Addition and subtraction wrap on overflow.
    /// The result is long if either operand is long.
    ///
    /// Any other `operator_char` is illegal behavior (`unreachable`).
    pub fn evaluateOperator(lhs: Number, operator_char: u8, rhs: Number) Number {
        const a = lhs.value;
        const b = rhs.value;
        return .{
            .value = switch (operator_char) {
                '+' => a +% b,
                '-' => a -% b,
                '&' => a & b,
                '|' => a | b,
                else => unreachable, // invalid operator, this would be a lexer/parser bug
            },
            .is_long = lhs.is_long or rhs.is_long,
        };
    }
};
993
/// Parses a number literal into a `Number` whose value wraps at 32 bits.
///
/// The literal is read in this order:
///  - an optional `-` (negate) or `~` (bitwise complement) prefix
///  - an optional radix prefix: `0x`/`0X` for hex or `0o` for octal (lowercase
///    `o` only); the prefix is only recognized when at least one more byte
///    follows it
///  - digits of the selected radix, accumulated with wrapping arithmetic
///  - an optional `L`/`l` suffix, which marks the result as long
///
/// Parsing never fails: the first codepoint that is not a valid digit for the
/// radix ends the number, and anything after `L`/`l` is ignored.
///
/// `bytes.slice` must not be empty.
///
/// Assumes that number literals normally rejected by RC's preprocessor
/// are similarly rejected before being parsed.
///
/// Relevant RC preprocessor errors:
///  RC2021: expected exponent value, not '<digit>'
///   example that is rejected: 1e1
///   example that is accepted: 1ea
///   (this function will parse the two examples above the same)
pub fn parseNumberLiteral(bytes: SourceBytes) Number {
    std.debug.assert(bytes.slice.len > 0);

    // Leading unary operator, applied to the magnitude once it has been parsed.
    const Sign = enum { none, minus, complement };
    const sign: Sign = switch (bytes.slice[0]) {
        '-' => .minus,
        '~' => .complement,
        else => .none,
    };
    const after_sign = if (sign == .none) bytes.slice else bytes.slice[1..];

    // A radix prefix needs at least one byte after it to count as a prefix.
    const prefixed_radix: ?u8 = if (after_sign.len > 2 and after_sign[0] == '0')
        switch (after_sign[1]) {
            'o' => 8, // octal radix prefix is case-sensitive
            'x', 'X' => 16,
            else => null,
        }
    else
        null;
    const radix: u8 = prefixed_radix orelse 10;
    const digits = if (prefixed_radix != null) after_sign[2..] else after_sign;

    var magnitude: u32 = 0;
    var is_long = false;
    var offset: usize = 0;
    while (bytes.code_page.codepointAt(offset, digits)) |codepoint| : (offset += codepoint.byte_len) {
        const c = codepoint.value;
        if (c == 'L' or c == 'l') {
            is_long = true;
            break;
        }
        // On invalid digit for the radix, just stop parsing but don't fail
        if (c > 0x7F) break;
        const digit = std.fmt.charToDigit(@intCast(c), radix) catch break;

        magnitude = (magnitude *% radix) +% digit;
    }

    return .{
        .value = switch (sign) {
            .none => magnitude,
            .minus => 0 -% magnitude,
            .complement => ~magnitude,
        },
        .is_long = is_long,
    };
}
1063
test "parse number literal" {
    // Each case is parsed with the Windows-1252 code page and compared against
    // the expected value/long flag.
    const Case = struct { input: []const u8, expected: Number };
    const cases = [_]Case{
        // decimal, with and without the long suffix
        .{ .input = "0", .expected = .{ .value = 0, .is_long = false } },
        .{ .input = "1", .expected = .{ .value = 1, .is_long = false } },
        .{ .input = "1L", .expected = .{ .value = 1, .is_long = true } },
        .{ .input = "1l", .expected = .{ .value = 1, .is_long = true } },
        // parsing stops at the first invalid digit, so the trailing L is never seen
        .{ .input = "1garbageL", .expected = .{ .value = 1, .is_long = false } },
        // values wrap at 32 bits
        .{ .input = "4294967295", .expected = .{ .value = 4294967295, .is_long = false } },
        .{ .input = "4294967296", .expected = .{ .value = 0, .is_long = false } },
        .{ .input = "4294967297L", .expected = .{ .value = 1, .is_long = true } },

        // hex: digits and the long suffix are both case-insensitive
        .{ .input = "0x20", .expected = .{ .value = 0x20, .is_long = false } },
        .{ .input = "0x2AL", .expected = .{ .value = 0x2A, .is_long = true } },
        .{ .input = "0x2aL", .expected = .{ .value = 0x2A, .is_long = true } },
        .{ .input = "0x2al", .expected = .{ .value = 0x2A, .is_long = true } },

        // octal: only the lowercase `0o` prefix is recognized
        .{ .input = "0o20", .expected = .{ .value = 0o20, .is_long = false } },
        .{ .input = "0o20L", .expected = .{ .value = 0o20, .is_long = true } },
        .{ .input = "0o29", .expected = .{ .value = 0o2, .is_long = false } },
        .{ .input = "0O29", .expected = .{ .value = 0, .is_long = false } },

        // unary prefixes are applied after the (wrapped) magnitude is parsed
        .{ .input = "-1", .expected = .{ .value = 0xFFFFFFFF, .is_long = false } },
        .{ .input = "~1", .expected = .{ .value = 0xFFFFFFFE, .is_long = false } },
        .{ .input = "-4294967297L", .expected = .{ .value = 0xFFFFFFFF, .is_long = true } },
        .{ .input = "~4294967297L", .expected = .{ .value = 0xFFFFFFFE, .is_long = true } },
        .{ .input = "-0X3", .expected = .{ .value = 0xFFFFFFFD, .is_long = false } },

        // anything after L is ignored
        .{ .input = "0x2aL5", .expected = .{ .value = 0x2A, .is_long = true } },
    };
    for (cases) |case| {
        // Identify the offending input when an expectation fails.
        errdefer std.debug.print("failing number literal: {s}\n", .{case.input});
        try std.testing.expectEqual(case.expected, parseNumberLiteral(.{ .slice = case.input, .code_page = .windows1252 }));
    }

    // can handle any length of number, wraps on overflow appropriately
    const big_overflow = parseNumberLiteral(.{ .slice = "1000000000000000000000000000000000000000000000000000000000000000000000000000000090000000001", .code_page = .windows1252 });
    try std.testing.expectEqual(Number{ .value = 4100654081, .is_long = false }, big_overflow);
    try std.testing.expectEqual(@as(u16, 1025), big_overflow.asWord());
}