master
//! Expects to run after a C preprocessor step that preserves comments.
//!
//! `rc` has a peculiar quirk where something like `blah/**/blah` will be
//! transformed into `blahblah` during parsing. However, `clang -E` will
//! transform it into `blah blah`, so in order to match `rc`, we need
//! to remove comments ourselves after the preprocessor runs.
//! Note: Multiline comments that actually span more than one line do
//! get translated to a space character by `rc`.
//!
//! Removing comments before lexing also allows the lexer to not have to
//! deal with comments, which would complicate its implementation (this is something
//! of a tradeoff, as removing comments in a separate pass means that we'll
//! need to iterate the source twice instead of once, but having to deal with
//! comments when lexing would be a pain).
15
16const std = @import("std");
17const Allocator = std.mem.Allocator;
18const UncheckedSliceWriter = @import("utils.zig").UncheckedSliceWriter;
19const SourceMappings = @import("source_mapping.zig").SourceMappings;
20const LineHandler = @import("lex.zig").LineHandler;
21const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;
22
/// Removes `//` line comments and `/* ... */` multiline comments from `source`,
/// writing whatever remains into `buf` and returning the written part of `buf`.
///
/// `buf` must be at least as long as `source`.
/// In-place transformation is supported (i.e. `source` and `buf` can be the same slice).
///
/// The `\r`/`\n` that terminates a line comment is written to the output.
/// Inside a multiline comment, `\n` is written and `\r` is delegated to
/// `handleMultilineCarriageReturn` (which is also given `source_mappings`).
/// Comment markers inside single- or double-quoted text are left untouched;
/// quoted text ends at its closing quote or at the next `\r`/`\n`.
///
/// The only fallible call made here is `handleMultilineCarriageReturn`.
pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
    std.debug.assert(buf.len >= source.len);
    var result = UncheckedSliceWriter{ .slice = buf };
    const State = enum {
        start,
        forward_slash,
        line_comment,
        multiline_comment,
        multiline_comment_end,
        single_quoted,
        single_quoted_escape,
        double_quoted,
        double_quoted_escape,
    };
    var state: State = .start;
    var index: usize = 0;
    // Index of a `/` that has been read but not yet written, because it might
    // turn out to be the first byte of a comment.
    var pending_start: ?usize = null;
    var line_handler = LineHandler{ .buffer = source };
    while (index < source.len) : (index += 1) {
        const c = source[index];
        // TODO: Disallow \x1A, \x00, \x7F in comments. At least \x1A and \x00 can definitely
        //       cause errors or parsing weirdness in the Win32 RC compiler. These are disallowed
        //       in the lexer, but comments are stripped before getting to the lexer.
        switch (state) {
            .start => switch (c) {
                '/' => {
                    state = .forward_slash;
                    pending_start = index;
                },
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                '"' => {
                    state = .double_quoted;
                    result.write(c);
                },
                '\'' => {
                    state = .single_quoted;
                    result.write(c);
                },
                else => result.write(c),
            },
            .forward_slash => switch (c) {
                '/' => state = .line_comment,
                '*' => state = .multiline_comment,
                else => {
                    // Not a comment after all: write the held-back `/` together with this byte.
                    // NOTE(review): a quote directly after a lone `/` does not enter a quoted
                    // state here; confirm that this matches the Win32 RC compiler.
                    _ = line_handler.maybeIncrementLineNumber(index);
                    result.writeSlice(source[pending_start.? .. index + 1]);
                    pending_start = null;
                    state = .start;
                },
            },
            .line_comment => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .start;
                },
                else => {},
            },
            .multiline_comment => switch (c) {
                '\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                },
                '*' => state = .multiline_comment_end,
                else => {},
            },
            .multiline_comment_end => switch (c) {
                '\r' => {
                    try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
                    // We only want to treat this as a newline if it's part of a CRLF pair. If it's
                    // not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
                    // functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
                    if (formsLineEndingPair(source, '\r', index + 1)) {
                        state = .multiline_comment;
                    }
                },
                '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    result.write(c);
                    state = .multiline_comment;
                },
                '/' => state = .start,
                // Another `*` keeps the comment "about to end", so that `**/` (and
                // therefore `/***/`) closes the comment exactly like `*/` does.
                '*' => {},
                else => state = .multiline_comment,
            },
            .single_quoted => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .single_quoted_escape;
                    result.write(c);
                },
                '\'' => {
                    state = .start;
                    result.write(c);
                },
                else => result.write(c),
            },
            .single_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .single_quoted;
                    result.write(c);
                },
            },
            .double_quoted => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                '\\' => {
                    state = .double_quoted_escape;
                    result.write(c);
                },
                '"' => {
                    state = .start;
                    result.write(c);
                },
                else => result.write(c),
            },
            .double_quoted_escape => switch (c) {
                '\r', '\n' => {
                    _ = line_handler.incrementLineNumber(index);
                    state = .start;
                    result.write(c);
                },
                else => {
                    state = .double_quoted;
                    result.write(c);
                },
            },
        }
    }

    // End of input. The switch is kept exhaustive so that adding a state forces
    // a decision about what (if anything) must be flushed for it.
    switch (state) {
        .start,
        .line_comment,
        .multiline_comment,
        .multiline_comment_end,
        .single_quoted,
        .single_quoted_escape,
        .double_quoted,
        .double_quoted_escape,
        => {},
        // A trailing `/` was held back waiting for a possible comment; write it now.
        .forward_slash => result.writeSlice(source[pending_start.?..index]),
    }
    return result.getWritten();
}
195
/// Handles a `\r` at `index` that was encountered inside a multiline comment.
///
/// The line number in `line_handler` is always advanced for the `\r`. The byte
/// itself is written to `result` only when it belongs to an LFCR or CRLF pair.
/// For a bare `\r`, nothing is written; the line number is stepped back again
/// and, when `source_mappings` is non-null, the mapping for that line is
/// collapsed so the origin of the merged line is still known.
inline fn handleMultilineCarriageReturn(
    source: []const u8,
    line_handler: *LineHandler,
    index: usize,
    result: *UncheckedSliceWriter,
    source_mappings: ?*SourceMappings,
) !void {
    // Pair detection is done in two steps. The backward-looking check finds
    // LFCR (we are only ever called on a CR), but cannot see a CRLF because it
    // only considers the line ending before this CR. The forward-looking check
    // is therefore consulted only when the backward one came up empty, which
    // also guarantees the CR is not already claimed by another pair.
    const closes_lfcr = line_handler.currentIndexFormsLineEndingPair(index);
    const opens_crlf = !closes_lfcr and formsLineEndingPair(source, '\r', index + 1);

    // Note: a bare \r within a multiline comment is *not* a line ending as far
    // as comment removal is concerned, but it *is* one for line counting and
    // source mapping, so the line number is bumped unconditionally.
    _ = line_handler.incrementLineNumber(index);

    if (closes_lfcr or opens_crlf) {
        result.write('\r');
        return;
    }

    // Bare \r: the line is being collapsed into its predecessor. Step the line
    // number back so that a subsequent collapse acts on the first of the
    // collapsed line numbers.
    line_handler.line_number -= 1;
    const mappings = source_mappings orelse return;
    try mappings.collapse(line_handler.line_number, 1);
}
230
/// Allocating variant of `removeComments`.
///
/// Allocates a buffer of `source.len` bytes, strips comments into it, and then
/// resizes the allocation to the length of the stripped output. The caller
/// owns the returned slice and must free it with `allocator`.
pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
    const scratch = try allocator.alloc(u8, source.len);
    errdefer allocator.free(scratch);
    const stripped_len = (try removeComments(source, scratch, source_mappings)).len;
    return allocator.realloc(scratch, stripped_len);
}
237
/// Test helper: strips comments from `source` (without source mappings) and
/// checks that the output equals `expected`.
fn testRemoveComments(expected: []const u8, source: []const u8) !void {
    const gpa = std.testing.allocator;
    const actual = try removeCommentsAlloc(gpa, source, null);
    defer gpa.free(actual);

    try std.testing.expectEqualStrings(expected, actual);
}
244
test "basic" {
    // Input that consists solely of a comment yields empty output.
    const comment_only_inputs = [_][]const u8{ "// comment", "/* comment */" };
    for (comment_only_inputs) |input| {
        try testRemoveComments("", input);
    }
}
249
test "mixed" {
    // Comments that share a line with other text: only the comment goes away.
    const Case = struct { expected: []const u8, source: []const u8 };
    const cases = [_]Case{
        .{ .expected = "hello", .source = "hello// comment" },
        .{ .expected = "hello", .source = "hel/* comment */lo" },
    };
    for (cases) |case| {
        try testRemoveComments(case.expected, case.source);
    }
}
254
test "within a string" {
    // Comment markers inside a quoted string must pass through unchanged,
    // including after an escaped quote (escaped " is \").
    const quoted =
        \\blah"//som\"/*ething*/"BLAH
    ;
    try testRemoveComments(quoted, quoted);
}
263
test "line comments retain newlines" {
    const three_comment_lines =
        \\// comment
        \\// comment
        \\// comment
    ;
    // Only the two newlines that separate the three lines remain.
    try testRemoveComments("\n\n", three_comment_lines);

    try testRemoveComments("\r\n", "//comment\r\n");
}
277
test "unfinished multiline comment" {
    // A comment that is never closed still has its newline written.
    const source =
        \\unfinished/*
        \\
    ;
    const expected =
        \\unfinished
        \\
    ;
    try testRemoveComments(expected, source);
}
287
test "crazy" {
    // Comment-like text inside the string stays; the real comment after it goes.
    const single_line_source =
        \\blah"/*som*/\""/*ething*/BLAH
    ;
    const single_line_expected =
        \\blah"/*som*/\""BLAH
    ;
    try testRemoveComments(single_line_expected, single_line_source);

    // The string opened by `"BEGIN END` ends at the newline, so the two
    // following lines are ordinary line comments and are reduced to newlines.
    const multi_line_source =
        \\blah"/*som*/"/*ething*/BLAH RCDATA "BEGIN END
        \\// comment
        \\//"blah blah" RCDATA {}
        \\hello
        \\"
    ;
    const multi_line_expected =
        \\blah"/*som*/"BLAH RCDATA "BEGIN END
        \\
        \\
        \\hello
        \\"
    ;
    try testRemoveComments(multi_line_expected, multi_line_source);
}
309
test "multiline comment with newlines" {
    // bare \r is not treated as a newline
    try testRemoveComments("blahblah", "blah/*some\rthing*/blah");

    // a \n inside the comment is kept
    const lf_inside_comment =
        \\blah/*some
        \\thing*/blah
    ;
    try testRemoveComments("blah\nblah", lf_inside_comment);

    // so is a \r\n pair
    try testRemoveComments("blah\r\nblah", "blah/*some\r\nthing*/blah");

    // handle *<not /> correctly
    const star_before_newline =
        \\blah/*some
        \\thing*
        \\/bl*ah*/
    ;
    try testRemoveComments("blah\n\n", star_before_newline);
}
337
test "comments appended to a line" {
    // The space in front of `//` is not part of the comment, so it has to
    // survive. The expected values are written with escapes so that this
    // trailing space is visible and cannot be lost to whitespace trimming.
    const lf_source =
        \\blah // line comment
        \\blah
    ;
    try testRemoveComments("blah \nblah", lf_source);
    try testRemoveComments("blah \r\nblah", "blah // line comment\r\nblah");
}
351
test "forward slash only" {
    // A `/` that does not start a comment (followed by a newline, or at the
    // very end of the input) is passed through unchanged.
    const lone_slashes =
        \\ /
        \\/
    ;
    try testRemoveComments(lone_slashes, lone_slashes);
}
361
test "remove comments with mappings" {
    const allocator = std.testing.allocator;
    var mut_source = "blah/*\rcommented line*\r/blah".*;
    var mappings = SourceMappings{};
    // Register cleanup before the first fallible call so that nothing leaks
    // if setting up the mappings fails part-way through.
    defer mappings.deinit(allocator);
    _ = try mappings.files.put(allocator, "test.rc");
    try mappings.set(1, 1, 0);
    try mappings.set(2, 2, 0);
    try mappings.set(3, 3, 0);

    const result = try removeComments(&mut_source, &mut_source, &mappings);

    // Both bare \r bytes are inside the comment, so they are dropped and the
    // three mapped lines end up as a single line spanning source lines 1-3.
    try std.testing.expectEqualStrings("blahblah", result);
    try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
    try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
}
378
test "in place" {
    // `source` and `buf` are the same memory here.
    var mut_source = "blah /* comment */ blah".*;
    const result = try removeComments(&mut_source, &mut_source, null);
    // The spaces on either side of the removed comment both remain, so the
    // output contains two consecutive spaces. The concatenation keeps that
    // explicit instead of relying on a doubled space inside one literal.
    try std.testing.expectEqualStrings("blah " ++ " blah", result);
}
383}