zig/lib/compiler/resinator/comments.zig at master

  1//! Expects to run after a C preprocessor step that preserves comments.
  2//!
  3//! `rc` has a peculiar quirk where something like `blah/**/blah` will be
  4//! transformed into `blahblah` during parsing. However, `clang -E` will
  5//! transform it into `blah blah`, so in order to match `rc`, we need
  6//! to remove comments ourselves after the preprocessor runs.
  7//! Note: Multiline comments that actually span more than one line do
  8//!       get translated to a space character by `rc`.
  9//!
 10//! Removing comments before lexing also allows the lexer to not have to
 11//! deal with comments which would complicate its implementation (this is something
 12//! of a tradeoff, as removing comments in a separate pass means that we'll
 13//! need to iterate the source twice instead of once, but having to deal with
 14//! comments when lexing would be a pain).
 15
 16const std = @import("std");
 17const Allocator = std.mem.Allocator;
 18const UncheckedSliceWriter = @import("utils.zig").UncheckedSliceWriter;
 19const SourceMappings = @import("source_mapping.zig").SourceMappings;
 20const LineHandler = @import("lex.zig").LineHandler;
 21const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;
 22
 23/// `buf` must be at least as long as `source`
 24/// In-place transformation is supported (i.e. `source` and `buf` can be the same slice)
 25pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
 26    std.debug.assert(buf.len >= source.len);
 27    var result = UncheckedSliceWriter{ .slice = buf };
 28    const State = enum {
 29        start,
 30        forward_slash,
 31        line_comment,
 32        multiline_comment,
 33        multiline_comment_end,
 34        single_quoted,
 35        single_quoted_escape,
 36        double_quoted,
 37        double_quoted_escape,
 38    };
 39    var state: State = .start;
 40    var index: usize = 0;
 41    var pending_start: ?usize = null;
 42    var line_handler = LineHandler{ .buffer = source };
 43    while (index < source.len) : (index += 1) {
 44        const c = source[index];
 45        // TODO: Disallow \x1A, \x00, \x7F in comments. At least \x1A and \x00 can definitely
 46        //       cause errors or parsing weirdness in the Win32 RC compiler. These are disallowed
 47        //       in the lexer, but comments are stripped before getting to the lexer.
 48        switch (state) {
 49            .start => switch (c) {
 50                '/' => {
 51                    state = .forward_slash;
 52                    pending_start = index;
 53                },
 54                '\r', '\n' => {
 55                    _ = line_handler.incrementLineNumber(index);
 56                    result.write(c);
 57                },
 58                else => {
 59                    switch (c) {
 60                        '"' => state = .double_quoted,
 61                        '\'' => state = .single_quoted,
 62                        else => {},
 63                    }
 64                    result.write(c);
 65                },
 66            },
 67            .forward_slash => switch (c) {
 68                '/' => state = .line_comment,
 69                '*' => {
 70                    state = .multiline_comment;
 71                },
 72                else => {
 73                    _ = line_handler.maybeIncrementLineNumber(index);
 74                    result.writeSlice(source[pending_start.? .. index + 1]);
 75                    pending_start = null;
 76                    state = .start;
 77                },
 78            },
 79            .line_comment => switch (c) {
 80                '\r', '\n' => {
 81                    _ = line_handler.incrementLineNumber(index);
 82                    result.write(c);
 83                    state = .start;
 84                },
 85                else => {},
 86            },
 87            .multiline_comment => switch (c) {
 88                '\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
 89                '\n' => {
 90                    _ = line_handler.incrementLineNumber(index);
 91                    result.write(c);
 92                },
 93                '*' => state = .multiline_comment_end,
 94                else => {},
 95            },
 96            .multiline_comment_end => switch (c) {
 97                '\r' => {
 98                    try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
 99                    // We only want to treat this as a newline if it's part of a CRLF pair. If it's
100                    // not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
101                    // functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
102                    if (formsLineEndingPair(source, '\r', index + 1)) {
103                        state = .multiline_comment;
104                    }
105                },
106                '\n' => {
107                    _ = line_handler.incrementLineNumber(index);
108                    result.write(c);
109                    state = .multiline_comment;
110                },
111                '/' => {
112                    state = .start;
113                },
114                else => {
115                    state = .multiline_comment;
116                },
117            },
118            .single_quoted => switch (c) {
119                '\r', '\n' => {
120                    _ = line_handler.incrementLineNumber(index);
121                    state = .start;
122                    result.write(c);
123                },
124                '\\' => {
125                    state = .single_quoted_escape;
126                    result.write(c);
127                },
128                '\'' => {
129                    state = .start;
130                    result.write(c);
131                },
132                else => {
133                    result.write(c);
134                },
135            },
136            .single_quoted_escape => switch (c) {
137                '\r', '\n' => {
138                    _ = line_handler.incrementLineNumber(index);
139                    state = .start;
140                    result.write(c);
141                },
142                else => {
143                    state = .single_quoted;
144                    result.write(c);
145                },
146            },
147            .double_quoted => switch (c) {
148                '\r', '\n' => {
149                    _ = line_handler.incrementLineNumber(index);
150                    state = .start;
151                    result.write(c);
152                },
153                '\\' => {
154                    state = .double_quoted_escape;
155                    result.write(c);
156                },
157                '"' => {
158                    state = .start;
159                    result.write(c);
160                },
161                else => {
162                    result.write(c);
163                },
164            },
165            .double_quoted_escape => switch (c) {
166                '\r', '\n' => {
167                    _ = line_handler.incrementLineNumber(index);
168                    state = .start;
169                    result.write(c);
170                },
171                else => {
172                    state = .double_quoted;
173                    result.write(c);
174                },
175            },
176        }
177    } else {
178        switch (state) {
179            .start,
180            .line_comment,
181            .multiline_comment,
182            .multiline_comment_end,
183            .single_quoted,
184            .single_quoted_escape,
185            .double_quoted,
186            .double_quoted_escape,
187            => {},
188            .forward_slash => {
189                result.writeSlice(source[pending_start.?..index]);
190            },
191        }
192    }
193    return result.getWritten();
194}
195
/// Handles a `\r` found at `index` while inside a multiline comment.
///
/// `line_handler.incrementLineNumber` is always called for the `\r`. The `\r` is
/// written to `result` only when it is part of a CRLF or LFCR pair. Otherwise it
/// is dropped, `line_handler.line_number` is stepped back by one, and (when
/// `source_mappings` is non-null) the line is collapsed in the mappings.
inline fn handleMultilineCarriageReturn(
    source: []const u8,
    line_handler: *LineHandler,
    index: usize,
    result: *UncheckedSliceWriter,
    source_mappings: ?*SourceMappings,
) !void {
    // Pair detection has to happen before the line number is incremented below.
    // The backward-looking check finds LFCR pairs (we are only ever called on a CR)
    // but cannot see a CRLF pair, so a forward-looking check is used as a fallback
    // when the CR is not already part of an LFCR pair.
    const part_of_pair = line_handler.currentIndexFormsLineEndingPair(index) or
        formsLineEndingPair(source, '\r', index + 1);

    // Note: A bare \r within a multiline comment is *not* a line ending as far as
    // comment removal is concerned, but it *is* one for line counting/source mapping.
    _ = line_handler.incrementLineNumber(index);

    if (part_of_pair) {
        result.write('\r');
        return;
    }

    // The bare \r is dropped, which joins two lines in the output. Step the line
    // number back so that the next collapse acts on the first of the collapsed
    // line numbers, and record the collapse so we still know which line came from where.
    line_handler.line_number -= 1;
    if (source_mappings) |mappings| {
        try mappings.collapse(line_handler.line_number, 1);
    }
}
230
/// Allocating variant of `removeComments`.
///
/// Allocates a buffer of `source.len` bytes from `allocator`, strips the comments
/// into it, and resizes it to the length actually written. The caller owns the
/// returned slice and must free it with the same `allocator`. The buffer is freed
/// here if any step fails.
pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
    const scratch = try allocator.alloc(u8, source.len);
    errdefer allocator.free(scratch);

    const stripped = try removeComments(source, scratch, source_mappings);
    const final_len = stripped.len;
    return allocator.realloc(scratch, final_len);
}
237
/// Test helper: runs `removeCommentsAlloc` on `source` (without source mappings)
/// using the testing allocator and checks that the output equals `expected`.
fn testRemoveComments(expected: []const u8, source: []const u8) !void {
    const gpa = std.testing.allocator;

    const actual = try removeCommentsAlloc(gpa, source, null);
    defer gpa.free(actual);

    try std.testing.expectEqualStrings(expected, actual);
}
244
test "basic" {
    // A source consisting solely of a comment strips down to nothing.
    const sources = [_][]const u8{
        "// comment",
        "/* comment */",
    };
    for (sources) |source| {
        try testRemoveComments("", source);
    }
}
249
test "mixed" {
    // Comments next to (or in the middle of) other text leave only the text behind.
    const sources = [_][]const u8{
        "hello// comment",
        "hel/* comment */lo",
    };
    for (sources) |source| {
        try testRemoveComments("hello", source);
    }
}
254
test "within a string" {
    // Comment markers inside a quoted string must pass through untouched.
    // escaped " is \"
    const unchanged =
        \\blah"//som\"/*ething*/"BLAH
    ;
    try testRemoveComments(unchanged, unchanged);
}
263
test "line comments retain newlines" {
    // Three commented lines: only the two line endings between them remain.
    try testRemoveComments("\n\n", "// comment\n// comment\n// comment");

    // The same holds for a CRLF line ending.
    try testRemoveComments("\r\n", "//comment\r\n");
}
277
test "unfinished multiline comment" {
    // A multiline comment that is never closed swallows the rest of the source,
    // but the newline inside it is still emitted.
    const source = "unfinished/*\n";
    const expected = "unfinished\n";
    try testRemoveComments(expected, source);
}
287
test "crazy" {
    // Comment-like text inside the string stays; the real comment after it goes.
    {
        const source =
            \\blah"/*som*/\""/*ething*/BLAH
        ;
        const expected =
            \\blah"/*som*/\""BLAH
        ;
        try testRemoveComments(expected, source);
    }

    // A newline ends the quoted state, so the `//` lines that follow an
    // unterminated string are still treated as comments.
    {
        const source =
            \\blah"/*som*/"/*ething*/BLAH RCDATA "BEGIN END
            \\// comment
            \\//"blah blah" RCDATA {}
            \\hello
            \\"
        ;
        const expected =
            \\blah"/*som*/"BLAH RCDATA "BEGIN END
            \\
            \\
            \\hello
            \\"
        ;
        try testRemoveComments(expected, source);
    }
}
309
test "multiline comment with newlines" {
    const Case = struct {
        expected: []const u8,
        source: []const u8,
    };
    const cases = [_]Case{
        // bare \r is not treated as a newline
        .{ .expected = "blahblah", .source = "blah/*some\rthing*/blah" },
        // \n inside the comment is kept
        .{ .expected = "blah\nblah", .source = "blah/*some\nthing*/blah" },
        // \r\n inside the comment is kept
        .{ .expected = "blah\r\nblah", .source = "blah/*some\r\nthing*/blah" },
        // handle *<not /> correctly
        .{ .expected = "blah\n\n", .source = "blah/*some\nthing*\n/bl*ah*/" },
    };
    for (cases) |case| {
        try testRemoveComments(case.expected, case.source);
    }
}
337
test "comments appended to a line" {
    // The space before the `//` is not part of the comment, so it is kept.
    const line_endings = [_][]const u8{ "\n", "\r\n" };
    inline for (line_endings) |eol| {
        try testRemoveComments(
            "blah " ++ eol ++ "blah",
            "blah // line comment" ++ eol ++ "blah",
        );
    }
}
351
test "forward slash only" {
    // Lone slashes (including one at the very end of the source) are not
    // comments and must be preserved verbatim.
    const unchanged = "  /\n/";
    try testRemoveComments(unchanged, unchanged);
}
361
test "remove comments with mappings" {
    const allocator = std.testing.allocator;
    // Two bare \r characters inside the comment: neither is emitted, so the three
    // source lines end up on a single output line.
    var mut_source = "blah/*\rcommented line*\r/blah".*;
    var mappings = SourceMappings{};
    // Register cleanup before the first fallible call so that nothing is leaked
    // if populating the mappings fails partway through.
    defer mappings.deinit(allocator);
    _ = try mappings.files.put(allocator, "test.rc");
    try mappings.set(1, 1, 0);
    try mappings.set(2, 2, 0);
    try mappings.set(3, 3, 0);

    const result = try removeComments(&mut_source, &mut_source, &mappings);

    try std.testing.expectEqualStrings("blahblah", result);
    // The mappings are collapsed down to one line that spans original lines 1 through 3.
    try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
    try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
}
378
test "in place" {
    // The same array is used as both input and output buffer.
    var storage = "blah /* comment */ blah".*;
    const stripped = try removeComments(&storage, &storage, null);
    try std.testing.expectEqualStrings("blah  blah", stripped);
}