master
 1const std = @import("std");
 2const lex = @import("lex.zig");
 3const SourceMappings = @import("source_mapping.zig").SourceMappings;
 4const SupportedCodePage = @import("code_pages.zig").SupportedCodePage;
 5
 /// Reports whether `source` opens with a "disjoint" code page pragma, i.e. whether the
 /// first meaningful thing in the root file is a `#pragma code_page(...)` for which
 /// `lex.parsePragmaCodePage` yields a concrete (non-null) code page.
 ///
 /// Scanning rules, applied codepoint by codepoint (decoded using `default_code_page`):
 /// - Line terminators advance the line counter. If `source_mappings` is provided and the
 ///   new line does not belong to the root file, the answer is `false`.
 /// - Space, tab and NBSP are skipped.
 /// - A `#` starts a directive spanning up to (not including) the next `\r`/`\n`:
 ///   - not a pragma at all => `false`
 ///   - any other parse error (e.g. a pragma that is not `code_page`, or one naming an
 ///     unsupported code page) => the directive is skipped and scanning resumes
 ///   - a null result (the original comment attributes this to `DEFAULT`) => `false`
 ///   - a concrete code page => `true`
 /// - Any other codepoint => `false`.
 ///
 /// Returns `false` if the input ends (or a codepoint cannot be decoded) before a
 /// qualifying pragma is found.
 pub fn hasDisjointCodePage(source: []const u8, source_mappings: ?*const SourceMappings, default_code_page: SupportedCodePage) bool {
     var lines = lex.LineHandler{ .buffer = source };
     var pos: usize = 0;

     while (pos < source.len) {
         // An undecodable position ends the scan without having found a pragma.
         const decoded = default_code_page.codepointAt(pos, source) orelse return false;

         switch (decoded.value) {
             '\r', '\n' => {
                 _ = lines.incrementLineNumber(pos);
                 // Lines originating outside the root file interrupt the disjoint code page.
                 if (source_mappings) |mappings| {
                     if (!mappings.isRootFile(lines.line_number)) return false;
                 }
             },

             // Plain whitespace is skipped. NBSP arguably belongs with the codepoints listed
             // in the TODO below, but the lexer special-cases it as whitespace (see the TODO
             // in lex.zig), so it is skipped here as well.
             ' ', '\t', '\u{A0}' => {},

             // TODO: The Win32 RC preprocessor treats every codepoint listed below as
             //       whitespace and trims it from the file while preprocessing, so strictly
             //       speaking they should be skipped just like ' ' and '\t'. The resinator
             //       preprocessor neither treats them as whitespace nor trims whitespace, so
             //       files containing them are likely to error anyway. Emulating or rejecting
             //       the Win32 behavior may make special handling worthwhile later; until
             //       then they fall through to the `else` prong for simplicity:
             //         U+1680, U+180E, U+2001, U+2002, U+2003, U+2004, U+2005, U+2006,
             //         U+2007, U+2008, U+2009, U+200A, U+2028, U+2029, U+202F, U+205F,
             //         U+3000

             '#' => {
                 if (source_mappings) |mappings| {
                     if (!mappings.isRootFile(lines.line_number)) return false;
                 }

                 // Consume the directive; `pos` ends up on the line terminator (or at the
                 // end of the input), so a `continue` below resumes at the terminator and
                 // the line counter still gets updated.
                 const directive_start = pos;
                 while (pos < source.len) : (pos += 1) {
                     switch (source[pos]) {
                         '\r', '\n' => break,
                         else => {},
                     }
                 }
                 const directive = source[directive_start..pos];

                 const maybe_code_page = lex.parsePragmaCodePage(directive) catch |err| switch (err) {
                     // Some other preprocessor directive: the code page is not disjoint.
                     error.NotPragma => return false,
                     // Every other failure (including error.NotCodePagePragma and
                     // error.CodePagePragmaUnsupportedCodePage) just skips this line.
                     else => continue,
                 };

                 // A concrete code page makes this a disjoint code page pragma; a null
                 // result (DEFAULT) interrupts the disjoint code page.
                 return maybe_code_page != null;
             },

             // Any other character interrupts the disjoint code page.
             else => return false,
         }

         pos += decoded.byte_len;
     }

     return false;
 }
78
test hasDisjointCodePage {
    // One input per case; the code page used for decoding defaults to Windows-1252.
    const Case = struct {
        source: []const u8,
        code_page: SupportedCodePage = .windows1252,
    };

    // Inputs for which a disjoint code page must be detected.
    const disjoint = [_]Case{
        .{ .source = "#pragma code_page(65001)\n" },
        // NBSP is a special case (encoded once per code page)
        .{ .source = "\xA0\n#pragma code_page(65001)\n" },
        .{ .source = "\u{A0}\n#pragma code_page(1252)\n", .code_page = .utf8 },
        // other #pragma directives don't interrupt
        .{ .source = "#pragma foo\n#pragma code_page(65001)\n" },
        // an invalid code page doesn't interrupt
        .{ .source = "#pragma code_page(1234567)\n#pragma code_page(65001)\n" },
    };
    for (disjoint) |case| {
        try std.testing.expect(hasDisjointCodePage(case.source, null, case.code_page));
    }

    // Inputs where something precedes the pragma and interrupts it:
    // non-pragma directives and comments.
    const not_disjoint = [_]Case{
        .{ .source = "#if 1\n#endif\n#pragma code_page(65001)" },
        .{ .source = "// comment\n#pragma code_page(65001)" },
        .{ .source = "/* comment */\n#pragma code_page(65001)" },
    };
    for (not_disjoint) |case| {
        try std.testing.expect(!hasDisjointCodePage(case.source, null, case.code_page));
    }
}
93
test "multiline comment edge case" {
    // TODO: Enable once a block comment sharing a line with the pragma is handled.
    //       Presumably the leading `/` is currently treated as an interrupting
    //       character, so the expectation below would fail -- confirm before enabling.
    if (true) return error.SkipZigTest;

    const comment_then_pragma = "/* comment */#pragma code_page(65001)";
    try std.testing.expect(hasDisjointCodePage(comment_then_pragma, null, .windows1252));
}