master
1const std = @import("./std.zig");
2const builtin = @import("builtin");
3const assert = std.debug.assert;
4const testing = std.testing;
5const mem = std.mem;
6const native_endian = builtin.cpu.arch.endian();
7const Allocator = std.mem.Allocator;
8
/// Use this to replace an unknown, unrecognized, or unrepresentable character.
///
/// See also: https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
pub const replacement_character: u21 = 0xFFFD;
/// The UTF-8 encoding of `replacement_character`, produced at compile time
/// by `utf8EncodeComptime`.
pub const replacement_character_utf8: [3]u8 = utf8EncodeComptime(replacement_character);
14
/// Reports the number of bytes needed to encode the codepoint `c` as UTF-8.
/// Fails with `error.CodepointTooLarge` when `c` is 0x110000 or greater.
pub fn utf8CodepointSequenceLength(c: u21) !u3 {
    return switch (c) {
        0x00...0x7F => 1,
        0x80...0x7FF => 2,
        0x800...0xFFFF => 3,
        0x10000...0x10FFFF => 4,
        else => error.CodepointTooLarge,
    };
}
24
/// Inspects the leading byte of a UTF-8 sequence and reports how many bytes
/// (1 through 4) the whole sequence occupies.
/// Fails with `error.Utf8InvalidStartByte` when `first_byte` does not have the
/// form of a UTF-8 start byte.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    // Kept as a range switch on purpose: it optimizes much better than a
    // "smart" approach using @clz.
    return switch (first_byte) {
        0x00...0x7F => 1,
        0xC0...0xDF => 2,
        0xE0...0xEF => 3,
        0xF0...0xF7 => 4,
        else => error.Utf8InvalidStartByte,
    };
}
38
/// Encodes the given codepoint into a UTF-8 byte sequence.
/// c: the codepoint.
/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
/// Errors: if c cannot be encoded in UTF-8. `CodepointTooLarge` is returned for
/// codepoints of 0x110000 and above, `Utf8CannotEncodeSurrogateHalf` for
/// surrogate codepoints.
/// Returns: the number of bytes written to out.
pub fn utf8Encode(c: u21, out: []u8) error{ Utf8CannotEncodeSurrogateHalf, CodepointTooLarge }!u3 {
    // Delegates to the shared implementation with surrogate codepoints rejected.
    return utf8EncodeImpl(c, out, .cannot_encode_surrogate_half);
}
47
/// Compile-time switch passed to the shared encode/validate implementations
/// in this file to select how surrogate codepoints are treated.
const Surrogates = enum {
    /// Surrogate codepoints are rejected.
    cannot_encode_surrogate_half,
    /// Surrogate codepoints are accepted.
    can_encode_surrogate_half,
};
52
/// Shared encoder behind the public encode functions.
/// Writes the UTF-8 encoding of `c` to the start of `out` and returns the
/// number of bytes written. Asserts that `out` is large enough.
/// When `surrogates` is `.cannot_encode_surrogate_half`, surrogate codepoints
/// are refused with `error.Utf8CannotEncodeSurrogateHalf`.
fn utf8EncodeImpl(c: u21, out: []u8, comptime surrogates: Surrogates) !u3 {
    const len = try utf8CodepointSequenceLength(c);
    assert(out.len >= len);

    // Every multi-byte form follows one recipe: the lead byte carries the
    // length marker plus the topmost payload bits, and each following byte
    // carries the continuation marker 0x80 plus the next six payload bits.
    switch (len) {
        // ASCII maps straight through.
        1 => out[0] = @intCast(c),
        2 => {
            out[0] = @intCast(0xC0 | (c >> 6));
            out[1] = @intCast(0x80 | (c & 0x3F));
        },
        3 => {
            // The outer condition is comptime-known, so the error below only
            // becomes part of the error set when surrogates are disallowed.
            if (surrogates == .cannot_encode_surrogate_half) {
                if (isSurrogateCodepoint(c)) return error.Utf8CannotEncodeSurrogateHalf;
            }
            out[0] = @intCast(0xE0 | (c >> 12));
            out[1] = @intCast(0x80 | ((c >> 6) & 0x3F));
            out[2] = @intCast(0x80 | (c & 0x3F));
        },
        4 => {
            out[0] = @intCast(0xF0 | (c >> 18));
            out[1] = @intCast(0x80 | ((c >> 12) & 0x3F));
            out[2] = @intCast(0x80 | ((c >> 6) & 0x3F));
            out[3] = @intCast(0x80 | (c & 0x3F));
        },
        else => unreachable,
    }
    return len;
}
84
/// Encodes the comptime-known codepoint `c` as UTF-8 and returns the bytes
/// as an array of exactly the encoded length.
/// A codepoint that cannot be encoded produces a compile error named after
/// the underlying encoding error.
pub inline fn utf8EncodeComptime(comptime c: u21) [
    utf8CodepointSequenceLength(c) catch |err|
        @compileError(@errorName(err))
]u8 {
    // The return type above has already rejected an oversized codepoint.
    const len = comptime (utf8CodepointSequenceLength(c) catch unreachable);
    comptime var encoded: [len]u8 = undefined;
    const written = comptime (utf8Encode(c, &encoded) catch |err|
        @compileError(@errorName(err)));
    comptime assert(written == encoded.len);
    return encoded;
}
97
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;

/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
///
/// Decodes the single codepoint occupying all of `bytes`, choosing the decoder
/// from `bytes.len`. A length outside 1 through 4 is illegal (`unreachable`).
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
    switch (bytes.len) {
        1 => return bytes[0],
        2 => return utf8Decode2(bytes[0..2].*),
        3 => return utf8Decode3(bytes[0..3].*),
        4 => return utf8Decode4(bytes[0..4].*),
        else => unreachable,
    }
}
110
const Utf8Decode2Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};
/// Decodes a two-byte UTF-8 sequence into a codepoint.
/// Asserts that `bytes[0]` has the two-byte lead form `0b110xxxxx`.
/// Fails when the second byte is not a continuation byte, or when the decoded
/// value would have fit in a single byte (overlong encoding).
pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
    const lead = bytes[0];
    const trail = bytes[1];
    assert(lead & 0xE0 == 0xC0);

    if (trail & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;

    const value: u21 = (@as(u21, lead & 0x1F) << 6) | (trail & 0x3F);
    if (value < 0x80) return error.Utf8OverlongEncoding;
    return value;
}
127
const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
    Utf8EncodesSurrogateHalf,
};
/// Decodes a three-byte UTF-8 sequence into a codepoint, additionally
/// rejecting results in the surrogate range 0xD800 through 0xDFFF with
/// `error.Utf8EncodesSurrogateHalf`.
pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
    const value = try utf8Decode3AllowSurrogateHalf(bytes);
    return switch (value) {
        0xD800...0xDFFF => error.Utf8EncodesSurrogateHalf,
        else => value,
    };
}
138
const Utf8Decode3AllowSurrogateHalfError = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
};
/// Decodes a three-byte sequence into a codepoint without rejecting values
/// in the surrogate range.
/// Asserts that `bytes[0]` has the three-byte lead form `0b1110xxxx`.
/// Fails when a trailing byte is not a continuation byte, or when the decoded
/// value is below 0x800 (overlong encoding).
pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
    assert(bytes[0] & 0xF0 == 0xE0);

    // Start with the four payload bits of the lead byte, then fold in six
    // payload bits from each continuation byte, in order.
    var value: u21 = bytes[0] & 0x0F;
    for (bytes[1..]) |continuation| {
        if (continuation & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;
        value = (value << 6) | (continuation & 0x3F);
    }

    if (value < 0x800) return error.Utf8OverlongEncoding;
    return value;
}
159
const Utf8Decode4Error = error{
    Utf8ExpectedContinuation,
    Utf8OverlongEncoding,
    Utf8CodepointTooLarge,
};
/// Decodes a four-byte UTF-8 sequence into a codepoint.
/// Asserts that `bytes[0]` has the four-byte lead form `0b11110xxx`.
/// Fails when a trailing byte is not a continuation byte, when the decoded
/// value is below 0x10000 (overlong encoding), or when it exceeds 0x10FFFF.
pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
    assert(bytes[0] & 0xF8 == 0xF0);

    // Start with the three payload bits of the lead byte, then fold in six
    // payload bits from each continuation byte, in order.
    var value: u21 = bytes[0] & 0x07;
    for (bytes[1..]) |continuation| {
        if (continuation & 0xC0 != 0x80) return error.Utf8ExpectedContinuation;
        value = (value << 6) | (continuation & 0x3F);
    }

    if (value < 0x10000) return error.Utf8OverlongEncoding;
    if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;
    return value;
}
186
/// Reports whether `value` is a codepoint that UTF-8 is able to encode:
/// anything except the surrogate range and values above 0x10FFFF.
pub fn utf8ValidCodepoint(value: u21) bool {
    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
    const is_above_maximum = value > 0x10FFFF;
    return !(is_surrogate or is_above_maximum);
}
195
/// Counts the unicode codepoints in the UTF-8 string `s`.
/// Every non-ASCII sequence is validated on the way; an invalid start byte or
/// an ill-formed sequence yields the corresponding decoding error, and a
/// sequence cut off by the end of `s` yields `error.TruncatedInput`.
pub fn utf8CountCodepoints(s: []const u8) !usize {
    const word_size = @sizeOf(usize);
    // A usize-wide pattern with the top bit of every byte set (0x8080...80).
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var count: usize = 0;
    var pos: usize = 0;
    while (pos < s.len) {
        // Fast path: consume whole machine words for as long as they contain
        // nothing but ASCII bytes.
        while (pos + word_size <= s.len) {
            const word = mem.readInt(usize, s[pos..][0..word_size], native_endian);
            if (word & high_bits != 0) break;
            count += word_size;
            pos += word_size;
        }

        if (pos >= s.len) break;

        // Slow path: handle a single codepoint, then retry the fast path.
        const seq_len = try utf8ByteSequenceLength(s[pos]);
        if (pos + seq_len > s.len) return error.TruncatedInput;

        // ASCII needs no validation; everything else is decoded only to
        // check that it is well formed.
        if (seq_len != 1) _ = try utf8Decode(s[pos..][0..seq_len]);

        pos += seq_len;
        count += 1;
    }

    return count;
}
229
/// Returns true if the input consists entirely of UTF-8 codepoints.
/// Encoded surrogate codepoints are treated as invalid.
pub fn utf8ValidateSlice(input: []const u8) bool {
    // Delegates to the shared validator with surrogate codepoints rejected.
    return utf8ValidateSliceImpl(input, .cannot_encode_surrogate_half);
}
234
/// Shared validator behind the public validate functions.
///
/// Returns true when `input` consists entirely of well-formed sequences.
/// With `.cannot_encode_surrogate_half`, three-byte sequences that encode a
/// surrogate codepoint (lead byte 0xED followed by 0xA0...0xBF) are rejected;
/// with `.can_encode_surrogate_half` they are accepted. All other rules are
/// identical in both modes: overlong encodings, values above 0x10FFFF,
/// stray continuation bytes and truncated sequences are always invalid.
fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
    var remaining = input;

    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and skip ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const mask: Chunk = @splat(0x80);
            if (@reduce(.Or, chunk & mask == mask)) {
                // found a non ASCII byte
                break;
            }
            remaining = remaining[chunk_len..];
        }
    }

    // default lowest and highest continuation byte
    const lo_cb = 0b10000000;
    const hi_cb = 0b10111111;

    const min_non_ascii_codepoint = 0x80;

    // The first nibble is used to identify the continuation byte range to
    // accept. The second nibble is the size.
    const xx = 0xF1; // invalid: size 1
    const as = 0xF0; // ASCII: size 1
    const s1 = 0x02; // accept 0, size 2
    // Lead byte 0xE0: the second byte must be 0xA0...0xBF, otherwise the
    // sequence is an overlong encoding of a codepoint below 0x800. This has
    // nothing to do with surrogates, so it must not depend on `surrogates`.
    const s2 = 0x13; // accept 1, size 3
    const s3 = 0x03; // accept 0, size 3
    // Lead byte 0xED: second bytes 0xA0...0xBF encode the surrogate range, so
    // this is the only entry that is relaxed when surrogates are permitted.
    const s4 = switch (surrogates) {
        .cannot_encode_surrogate_half => 0x23, // accept 2, size 3
        .can_encode_surrogate_half => 0x03, // accept 0, size 3
    };
    const s5 = 0x34; // accept 3, size 4
    const s6 = 0x04; // accept 0, size 4
    const s7 = 0x44; // accept 4, size 4

    // Information about the first byte in a UTF-8 sequence.
    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
    };

    const n = remaining.len;
    var i: usize = 0;
    while (i < n) {
        const first_byte = remaining[i];
        if (first_byte < min_non_ascii_codepoint) {
            i += 1;
            continue;
        }

        const info = first[first_byte];
        if (info == xx) {
            return false; // Illegal starter byte.
        }

        const size = info & 7;
        if (i + size > n) {
            return false; // Short or invalid.
        }

        // Figure out the acceptable low and high continuation bytes, starting
        // with our defaults.
        var accept_lo: u8 = lo_cb;
        var accept_hi: u8 = hi_cb;

        // Only the byte immediately after the lead byte can have a narrowed
        // range; later continuation bytes always use the default range.
        switch (info >> 4) {
            0 => {},
            1 => accept_lo = 0xA0,
            2 => accept_hi = 0x9F,
            3 => accept_lo = 0x90,
            4 => accept_hi = 0x8F,
            else => unreachable,
        }

        const c1 = remaining[i + 1];
        if (c1 < accept_lo or accept_hi < c1) {
            return false;
        }

        switch (size) {
            2 => i += 2,
            3 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                i += 3;
            },
            4 => {
                const c2 = remaining[i + 2];
                if (c2 < lo_cb or hi_cb < c2) {
                    return false;
                }
                const c3 = remaining[i + 3];
                if (c3 < lo_cb or hi_cb < c3) {
                    return false;
                }
                i += 4;
            },
            else => unreachable,
        }
    }

    return true;
}
349
/// Utf8View iterates the code points of a utf-8 encoded string.
///
/// ```
/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator();
/// while (utf8.nextCodepointSlice()) |codepoint| {
///   std.debug.print("got codepoint {s}\n", .{codepoint});
/// }
/// ```
pub const Utf8View = struct {
    bytes: []const u8,

    /// Wraps `s` after checking it with `utf8ValidateSlice`.
    /// Fails with `error.InvalidUtf8` when the check does not pass.
    pub fn init(s: []const u8) !Utf8View {
        if (utf8ValidateSlice(s)) return initUnchecked(s);
        return error.InvalidUtf8;
    }

    /// Wraps `s` without performing any validation.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return .{ .bytes = s };
    }

    /// Wraps the comptime-known string `s`; invalid input is reported as a
    /// compile error.
    pub inline fn initComptime(comptime s: []const u8) Utf8View {
        return comptime (init(s) catch |err| switch (err) {
            error.InvalidUtf8 => @compileError("invalid utf8"),
        });
    }

    /// Creates an iterator positioned at the start of the view.
    pub fn iterator(s: Utf8View) Utf8Iterator {
        return .{ .bytes = s.bytes, .i = 0 };
    }
};
388
/// Walks the codepoints of `bytes`, starting from byte offset `i`.
/// The methods treat an invalid start byte or an undecodable sequence as
/// `unreachable`.
pub const Utf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them, or
    /// null once the end of the input has been reached.
    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
        const start = it.i;
        if (start >= it.bytes.len) return null;

        const cp_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        it.i = start + cp_len;
        return it.bytes[start..it.i];
    }

    /// Returns the next codepoint decoded to its numeric value and advances
    /// past it, or null once the end of the input has been reached.
    pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |slice| {
            return utf8Decode(slice) catch unreachable;
        }
        return null;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Utf8Iterator, n: usize) []const u8 {
        const start = it.i;
        // Whatever happens below, the iterator ends up where it started.
        defer it.i = start;

        var remaining = n;
        while (remaining > 0) : (remaining -= 1) {
            if (it.nextCodepointSlice() == null) return it.bytes[start..];
        }

        const end = it.i;
        return it.bytes[start..end];
    }
};
424
/// Reports whether `c` is a UTF-16 high (leading) surrogate code unit,
/// i.e. lies in 0xD800 through 0xDBFF.
pub fn utf16IsHighSurrogate(c: u16) bool {
    return switch (c) {
        0xD800...0xDBFF => true,
        else => false,
    };
}
428
/// Reports whether `c` is a UTF-16 low (trailing) surrogate code unit,
/// i.e. lies in 0xDC00 through 0xDFFF.
pub fn utf16IsLowSurrogate(c: u16) bool {
    return switch (c) {
        0xDC00...0xDFFF => true,
        else => false,
    };
}
432
/// Reports the number of UTF-16 code units needed to encode the codepoint `c`.
/// Fails with `error.CodepointTooLarge` when `c` exceeds 0x10FFFF.
pub fn utf16CodepointSequenceLength(c: u21) !u2 {
    return switch (c) {
        0x0000...0xFFFF => 1,
        0x10000...0x10FFFF => 2,
        else => error.CodepointTooLarge,
    };
}
440
test utf16CodepointSequenceLength {
    // Boundary values on either side of the one-unit / two-unit split.
    const Case = struct { codepoint: u21, len: u2 };
    const cases = [_]Case{
        .{ .codepoint = 'a', .len = 1 },
        .{ .codepoint = 0xFFFF, .len = 1 },
        .{ .codepoint = 0x10000, .len = 2 },
        .{ .codepoint = 0x10FFFF, .len = 2 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.len, try utf16CodepointSequenceLength(case.codepoint));
    }

    // One past the largest codepoint.
    try testing.expectError(error.CodepointTooLarge, utf16CodepointSequenceLength(0x110000));
}
448
/// Inspects the first code unit of a UTF-16 sequence and reports how many
/// code units (1 or 2) the whole codepoint occupies.
/// A low surrogate cannot start a sequence and yields
/// `error.Utf16InvalidStartCodeUnit`.
pub fn utf16CodeUnitSequenceLength(first_code_unit: u16) !u2 {
    return switch (first_code_unit) {
        // High surrogate: the first half of a surrogate pair.
        0xD800...0xDBFF => 2,
        // Low surrogate: only valid as the second half of a pair.
        0xDC00...0xDFFF => error.Utf16InvalidStartCodeUnit,
        else => 1,
    };
}
457
test utf16CodeUnitSequenceLength {
    const Case = struct { code_unit: u16, len: u2 };
    const cases = [_]Case{
        .{ .code_unit = 'a', .len = 1 },
        .{ .code_unit = 0xFFFF, .len = 1 },
        .{ .code_unit = 0xDBFF, .len = 2 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.len, try utf16CodeUnitSequenceLength(case.code_unit));
    }

    // A low surrogate may not start a sequence.
    try testing.expectError(error.Utf16InvalidStartCodeUnit, utf16CodeUnitSequenceLength(0xDFFF));
}
464
/// Combines the first two code units of `surrogate_pair` into the codepoint
/// they encode.
/// Asserts that `surrogate_pair.len >= 2` and that the first code unit is a high surrogate.
/// If the second code unit is not a low surrogate, error.ExpectedSecondSurrogateHalf is returned.
pub fn utf16DecodeSurrogatePair(surrogate_pair: []const u16) !u21 {
    assert(surrogate_pair.len >= 2);
    const high = surrogate_pair[0];
    assert(utf16IsHighSurrogate(high));

    const low = surrogate_pair[1];
    if (!utf16IsLowSurrogate(low)) return error.ExpectedSecondSurrogateHalf;

    // Each half contributes its low ten bits; the result is offset by 0x10000.
    const upper_bits: u21 = high & 0x03ff;
    const lower_bits: u21 = low & 0x03ff;
    return 0x10000 + ((upper_bits << 10) | lower_bits);
}
476
/// Iterates the codepoints of a little-endian UTF-16 string, reading it
/// through its byte representation. `i` is a byte offset into `bytes`.
pub const Utf16LeIterator = struct {
    bytes: []const u8,
    i: usize,

    /// Creates an iterator positioned at the start of `s`.
    pub fn init(s: []const u16) Utf16LeIterator {
        return .{ .bytes = mem.sliceAsBytes(s), .i = 0 };
    }

    pub const NextCodepointError = error{ DanglingSurrogateHalf, ExpectedSecondSurrogateHalf, UnexpectedSecondSurrogateHalf };

    /// Decodes and returns the next codepoint, or null at the end of input.
    /// Errors:
    /// - `UnexpectedSecondSurrogateHalf`: a low surrogate appears where a
    ///   codepoint should start.
    /// - `DanglingSurrogateHalf`: a high surrogate is the last code unit.
    /// - `ExpectedSecondSurrogateHalf`: a high surrogate is followed by
    ///   something other than a low surrogate.
    pub fn nextCodepoint(it: *Utf16LeIterator) NextCodepointError!?u21 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;

        const first = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        it.i += 2;

        if (utf16IsLowSurrogate(first)) return error.UnexpectedSecondSurrogateHalf;
        if (!utf16IsHighSurrogate(first)) return first;

        // surrogate pair
        if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
        const second = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        const pair = [2]u16{ first, second };
        const codepoint = try utf16DecodeSurrogatePair(&pair);
        // The second unit is consumed only after the pair decoded successfully.
        it.i += 2;
        return codepoint;
    }
};
510
/// Counts the unicode codepoints in the little-endian UTF-16 string
/// `utf16le`. Any error reported by `Utf16LeIterator.nextCodepoint` is
/// passed on to the caller.
pub fn utf16CountCodepoints(utf16le: []const u16) !usize {
    var it = Utf16LeIterator.init(utf16le);
    var count: usize = 0;
    while ((try it.nextCodepoint()) != null) {
        count += 1;
    }
    return count;
}
519
/// Checks `utf16CountCodepoints` against strings of known length covering
/// ASCII, Latin-1 letters and Japanese text.
fn testUtf16CountCodepoints() !void {
    const single_ascii = utf8ToUtf16LeStringLiteral("a");
    try testing.expectEqual(@as(usize, 1), try utf16CountCodepoints(single_ascii));

    const ascii = utf8ToUtf16LeStringLiteral("abcdefghij");
    try testing.expectEqual(@as(usize, 10), try utf16CountCodepoints(ascii));

    const latin = utf8ToUtf16LeStringLiteral("äåéëþüúíóö");
    try testing.expectEqual(@as(usize, 10), try utf16CountCodepoints(latin));

    const japanese = utf8ToUtf16LeStringLiteral("こんにちは");
    try testing.expectEqual(@as(usize, 5), try utf16CountCodepoints(japanese));
}

test "utf16 count codepoints" {
    // Raised for the compile-time run below.
    @setEvalBranchQuota(2000);
    try testUtf16CountCodepoints();
    try comptime testUtf16CountCodepoints();
}
544
test "utf8 encode" {
    try comptime testUtf8Encode();
    try testUtf8Encode();
}
/// Round-trips sample characters of every encoded length through
/// `utf8Decode` and `utf8Encode` and checks the exact output bytes.
fn testUtf8Encode() !void {
    // A few taken from wikipedia a few taken elsewhere
    var buf: [4]u8 = undefined;

    const euro_len = try utf8Encode(try utf8Decode("€"), &buf);
    try testing.expect(euro_len == 3);
    try testing.expect(mem.eql(u8, &[_]u8{ 0b11100010, 0b10000010, 0b10101100 }, buf[0..3]));

    const dollar_len = try utf8Encode(try utf8Decode("$"), &buf);
    try testing.expect(dollar_len == 1);
    try testing.expect(mem.eql(u8, &[_]u8{0b00100100}, buf[0..1]));

    const cent_len = try utf8Encode(try utf8Decode("¢"), &buf);
    try testing.expect(cent_len == 2);
    try testing.expect(mem.eql(u8, &[_]u8{ 0b11000010, 0b10100010 }, buf[0..2]));

    const hwair_len = try utf8Encode(try utf8Decode("𐍈"), &buf);
    try testing.expect(hwair_len == 4);
    try testing.expect(mem.eql(u8, &[_]u8{ 0b11110000, 0b10010000, 0b10001101, 0b10001000 }, buf[0..4]));
}
570
test "utf8 encode comptime" {
    // One sample for each encoded length from one to four bytes.
    const dollar = utf8EncodeComptime('$');
    const cent = utf8EncodeComptime('¢');
    const euro = utf8EncodeComptime('€');
    const hwair = utf8EncodeComptime('𐍈');

    try testing.expectEqualSlices(u8, "€", &euro);
    try testing.expectEqualSlices(u8, "$", &dollar);
    try testing.expectEqualSlices(u8, "¢", &cent);
    try testing.expectEqualSlices(u8, "𐍈", &hwair);
}
577
test "utf8 encode error" {
    try comptime testUtf8EncodeError();
    try testUtf8EncodeError();
}
/// Checks that `utf8Encode` refuses surrogate halves and oversized codepoints.
fn testUtf8EncodeError() !void {
    var array: [4]u8 = undefined;

    const surrogate_halves = [_]u21{ 0xd800, 0xdfff };
    for (surrogate_halves) |codepoint| {
        try testErrorEncode(codepoint, array[0..], error.Utf8CannotEncodeSurrogateHalf);
    }

    const too_large = [_]u21{ 0x110000, 0x1fffff };
    for (too_large) |codepoint| {
        try testErrorEncode(codepoint, array[0..], error.CodepointTooLarge);
    }
}

/// Expects that encoding `codePoint` into `array` fails with `expectedErr`.
fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) !void {
    const result = utf8Encode(codePoint, array);
    try testing.expectError(expectedErr, result);
}
593
test "utf8 iterator on ascii" {
    try comptime testUtf8IteratorOnAscii();
    try testUtf8IteratorOnAscii();
}
/// Walks "abc" once by slice and once by decoded codepoint.
fn testUtf8IteratorOnAscii() !void {
    const s = Utf8View.initComptime("abc");

    var by_slice = s.iterator();
    for ([_][]const u8{ "a", "b", "c" }) |expected| {
        try testing.expect(mem.eql(u8, expected, by_slice.nextCodepointSlice().?));
    }
    try testing.expect(by_slice.nextCodepointSlice() == null);

    var by_codepoint = s.iterator();
    for ([_]u21{ 'a', 'b', 'c' }) |expected| {
        try testing.expect(by_codepoint.nextCodepoint().? == expected);
    }
    try testing.expect(by_codepoint.nextCodepoint() == null);
}
613
test "utf8 view bad" {
    try comptime testUtf8ViewBad();
    try testUtf8ViewBad();
}
/// Checks that `Utf8View.init` refuses a string containing a stray
/// continuation byte.
fn testUtf8ViewBad() !void {
    // Compile-time error.
    // const s3 = Utf8View.initComptime("\xfe\xf2");
    const result = Utf8View.init("hel\xadlo");
    try testing.expectError(error.InvalidUtf8, result);
}
623
test "utf8 view ok" {
    try comptime testUtf8ViewOk();
    try testUtf8ViewOk();
}
/// Walks a string of three-byte characters once by slice and once by
/// decoded codepoint.
fn testUtf8ViewOk() !void {
    const s = Utf8View.initComptime("東京市");

    var by_slice = s.iterator();
    for ([_][]const u8{ "東", "京", "市" }) |expected| {
        try testing.expect(mem.eql(u8, expected, by_slice.nextCodepointSlice().?));
    }
    try testing.expect(by_slice.nextCodepointSlice() == null);

    var by_codepoint = s.iterator();
    for ([_]u21{ 0x6771, 0x4eac, 0x5e02 }) |expected| {
        try testing.expect(by_codepoint.nextCodepoint().? == expected);
    }
    try testing.expect(by_codepoint.nextCodepoint() == null);
}
643
test "validate slice" {
    try comptime testValidateSlice();
    try testValidateSlice();

    // We skip a variable (based on recommended vector size) chunks of
    // ASCII characters. Let's make sure we're chunking correctly.
    const str = [_]u8{'a'} ** 550 ++ "\xc0";
    var start: usize = 0;
    while (start < str.len - 3) : (start += 1) {
        try testing.expect(!utf8ValidateSlice(str[start..]));
    }
}
/// Runs `utf8ValidateSlice` over a list of strings that must be accepted
/// and a list of strings that must be rejected.
fn testValidateSlice() !void {
    const well_formed = [_][]const u8{
        "abc",
        "abc\xdf\xbf",
        "",
        "a",
        "abc",
        "Ж",
        "ЖЖ",
        "брэд-ЛГТМ",
        "☺☻☹",
        "a\u{fffdb}",
        "\xf4\x8f\xbf\xbf",
        "abc\xdf\xbf",
    };
    for (well_formed) |input| {
        try testing.expect(utf8ValidateSlice(input));
    }

    const ill_formed = [_][]const u8{
        "abc\xc0",
        "abc\xc0abc",
        "aa\xe2",
        "\x42\xfa",
        "\x42\xfa\x43",
        "abc\xc0",
        "abc\xc0abc",
        "\xf4\x90\x80\x80",
        "\xf7\xbf\xbf\xbf",
        "\xfb\xbf\xbf\xbf\xbf",
        "\xc0\x80",
        "\xed\xa0\x80",
        "\xed\xbf\xbf",
    };
    for (ill_formed) |input| {
        try testing.expect(!utf8ValidateSlice(input));
    }
}
683
test "valid utf8" {
    try comptime testValidUtf8();
    try testValidUtf8();
}
/// Decodes boundary values of every encoded length and checks the result.
fn testValidUtf8() !void {
    const Case = struct { bytes: []const u8, codepoint: u21 };
    const cases = [_]Case{
        .{ .bytes = "\x00", .codepoint = 0x0 },
        .{ .bytes = "\x20", .codepoint = 0x20 },
        .{ .bytes = "\x7f", .codepoint = 0x7f },
        .{ .bytes = "\xc2\x80", .codepoint = 0x80 },
        .{ .bytes = "\xdf\xbf", .codepoint = 0x7ff },
        .{ .bytes = "\xe0\xa0\x80", .codepoint = 0x800 },
        .{ .bytes = "\xe1\x80\x80", .codepoint = 0x1000 },
        .{ .bytes = "\xef\xbf\xbf", .codepoint = 0xffff },
        .{ .bytes = "\xf0\x90\x80\x80", .codepoint = 0x10000 },
        .{ .bytes = "\xf1\x80\x80\x80", .codepoint = 0x40000 },
        .{ .bytes = "\xf3\xbf\xbf\xbf", .codepoint = 0xfffff },
        .{ .bytes = "\xf4\x8f\xbf\xbf", .codepoint = 0x10ffff },
    };
    for (cases) |case| {
        try testValid(case.bytes, case.codepoint);
    }
}
702
test "invalid utf8 continuation bytes" {
    try comptime testInvalidUtf8ContinuationBytes();
    try testInvalidUtf8ContinuationBytes();
}
/// Checks the error reported for bad start bytes, truncated sequences and
/// missing continuation bytes.
fn testInvalidUtf8ContinuationBytes() !void {
    const Case = struct { bytes: []const u8, err: anyerror };
    const cases = [_]Case{
        // unexpected continuation
        .{ .bytes = "\x80", .err = error.Utf8InvalidStartByte },
        .{ .bytes = "\xbf", .err = error.Utf8InvalidStartByte },
        // too many leading 1's
        .{ .bytes = "\xf8", .err = error.Utf8InvalidStartByte },
        .{ .bytes = "\xff", .err = error.Utf8InvalidStartByte },
        // expected continuation for 2 byte sequences
        .{ .bytes = "\xc2", .err = error.UnexpectedEof },
        .{ .bytes = "\xc2\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xc2\xc0", .err = error.Utf8ExpectedContinuation },
        // expected continuation for 3 byte sequences
        .{ .bytes = "\xe0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xa0", .err = error.UnexpectedEof },
        .{ .bytes = "\xe0\xa0\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xe0\xa0\xc0", .err = error.Utf8ExpectedContinuation },
        // expected continuation for 4 byte sequences
        .{ .bytes = "\xf0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\x00", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\xc0", .err = error.UnexpectedEof },
        .{ .bytes = "\xf0\x90\x80\x00", .err = error.Utf8ExpectedContinuation },
        .{ .bytes = "\xf0\x90\x80\xc0", .err = error.Utf8ExpectedContinuation },
    };
    for (cases) |case| {
        try testError(case.bytes, case.err);
    }
}
734
test "overlong utf8 codepoint" {
    try comptime testOverlongUtf8Codepoint();
    try testOverlongUtf8Codepoint();
}
/// Checks that the smallest and largest overlong form of every multi-byte
/// length is reported as `Utf8OverlongEncoding`.
fn testOverlongUtf8Codepoint() !void {
    const overlong = [_][]const u8{
        "\xc0\x80",
        "\xc1\xbf",
        "\xe0\x80\x80",
        "\xe0\x9f\xbf",
        "\xf0\x80\x80\x80",
        "\xf0\x8f\xbf\xbf",
    };
    for (overlong) |bytes| {
        try testError(bytes, error.Utf8OverlongEncoding);
    }
}
747
test "misc invalid utf8" {
    try comptime testMiscInvalidUtf8();
    try testMiscInvalidUtf8();
}
/// Checks out-of-range four-byte sequences and the edges of the surrogate
/// range.
fn testMiscInvalidUtf8() !void {
    // codepoint out of bounds
    for ([_][]const u8{ "\xf4\x90\x80\x80", "\xf7\xbf\xbf\xbf" }) |bytes| {
        try testError(bytes, error.Utf8CodepointTooLarge);
    }

    // surrogate halves: the codepoints just outside the range are fine...
    try testValid("\xed\x9f\xbf", 0xd7ff);
    try testValid("\xee\x80\x80", 0xe000);
    // ...while the first and last codepoint inside it are rejected.
    for ([_][]const u8{ "\xed\xa0\x80", "\xed\xbf\xbf" }) |bytes| {
        try testError(bytes, error.Utf8EncodesSurrogateHalf);
    }
}
762
test "utf8 iterator peeking" {
    try comptime testUtf8Peeking();
    try testUtf8Peeking();
}

/// Checks that `peek` looks ahead by codepoints, clamps at the end of the
/// string and leaves the iterator position untouched.
fn testUtf8Peeking() !void {
    const s = Utf8View.initComptime("noël");
    var it = s.iterator();

    try testing.expect(mem.eql(u8, "n", it.nextCodepointSlice().?));

    const Peek = struct { count: usize, expected: []const u8 };
    const peeks = [_]Peek{
        .{ .count = 1, .expected = "o" },
        .{ .count = 2, .expected = "oë" },
        .{ .count = 3, .expected = "oël" },
        .{ .count = 4, .expected = "oël" },
        .{ .count = 10, .expected = "oël" },
    };
    for (peeks) |p| {
        try testing.expect(mem.eql(u8, p.expected, it.peek(p.count)));
    }

    // Peeking must not have consumed anything.
    for ([_][]const u8{ "o", "ë", "l" }) |expected| {
        try testing.expect(mem.eql(u8, expected, it.nextCodepointSlice().?));
    }
    try testing.expect(it.nextCodepointSlice() == null);

    try testing.expect(mem.eql(u8, &[_]u8{}, it.peek(1)));
}
787
/// Expects that decoding `bytes` fails with `expected_err`.
fn testError(bytes: []const u8, expected_err: anyerror) !void {
    const result = testDecode(bytes);
    try testing.expectError(expected_err, result);
}

/// Expects that `bytes` decodes to exactly `expected_codepoint`; a decoding
/// failure is treated as unreachable.
fn testValid(bytes: []const u8, expected_codepoint: u21) !void {
    const decoded = testDecode(bytes) catch unreachable;
    try testing.expect(decoded == expected_codepoint);
}

/// Decodes `bytes` as one codepoint. Input shorter than its start byte
/// announces is reported as `error.UnexpectedEof`; otherwise the input must
/// hold exactly one sequence.
fn testDecode(bytes: []const u8) !u21 {
    const expected_len = try utf8ByteSequenceLength(bytes[0]);
    if (expected_len > bytes.len) return error.UnexpectedEof;
    try testing.expect(expected_len == bytes.len);
    return utf8Decode(bytes);
}
802
/// Print the given `utf8` string, encoded as UTF-8 bytes.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// Output is staged in a fixed stack buffer and handed to `writer` in
/// batches; any error from `writer.writeAll` is returned to the caller.
fn formatUtf8(utf8: []const u8, writer: *std.Io.Writer) std.Io.Writer.Error!void {
    var buf: [300]u8 = undefined; // just an arbitrary size
    var u8len: usize = 0; // number of staged bytes in `buf`

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    //
    // Decoder state for the multi-byte sequence currently in progress.
    // `cont_bytes_needed == 0` means no sequence is in progress.
    // NOTE(review): `codepoint` is accumulated but never read for output in
    // this function; well-formed sequences are copied from the input bytes.
    var codepoint: u21 = 0;
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range accepted for the next continuation byte. Some lead
    // bytes narrow it for the byte that immediately follows them.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            // Expecting the first byte of a sequence.
            switch (byte) {
                0x00...0x7F => {
                    // ASCII is passed through unchanged.
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                    codepoint = byte & 0b00011111;
                },
                0xE0...0xEF => {
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                    codepoint = byte & 0b00001111;
                },
                0xF0...0xF4 => {
                    if (byte == 0xF0) lower_boundary = 0x90;
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                    codepoint = byte & 0b00000111;
                },
                else => {
                    // Not a valid lead byte: it is replaced on its own.
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else if (byte < lower_boundary or byte > upper_boundary) {
            // The sequence in progress is broken: reset the state and emit
            // one replacement character for the part seen so far.
            codepoint = 0;
            cont_bytes_needed = 0;
            cont_bytes_seen = 0;
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
            // do not consume the current byte, it should now be treated as a possible start byte
        } else {
            // Valid continuation byte. Any narrowed range applied only to
            // this byte, so restore the defaults for the ones that follow.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            codepoint <<= 6;
            codepoint |= byte & 0b00111111;
            cont_bytes_seen += 1;
            // consume the byte
            i += 1;

            if (cont_bytes_seen == cont_bytes_needed) {
                // Sequence complete: copy its bytes verbatim from the input.
                const codepoint_len = cont_bytes_seen + 1;
                const codepoint_start_i = i - codepoint_len;
                @memcpy(buf[u8len..][0..codepoint_len], utf8[codepoint_start_i..][0..codepoint_len]);
                u8len += codepoint_len;

                codepoint = 0;
                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // The input ended in the middle of a sequence.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}
891
/// Return a Formatter for a (potentially ill-formed) UTF-8 string.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// The returned value only wraps `utf8`; the substitution is performed by
/// `formatUtf8` when the value is formatted.
pub fn fmtUtf8(utf8: []const u8) std.fmt.Alt([]const u8, formatUtf8) {
    return .{ .data = utf8 };
}
899
test fmtUtf8 {
    const Case = struct { expected: []const u8, input: []const u8 };
    const cases = [_]Case{
        .{ .expected = "", .input = "" },
        .{ .expected = "foo", .input = "foo" },
        .{ .expected = "𐐷", .input = "𐐷" },

        // Table 3-8. U+FFFD for Non-Shortest Form Sequences
        .{ .expected = "��������A", .input = "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82A" },

        // Table 3-9. U+FFFD for Ill-Formed Sequences for Surrogates
        .{ .expected = "��������A", .input = "\xED\xA0\x80\xED\xBF\xBF\xED\xAFA" },

        // Table 3-10. U+FFFD for Other Ill-Formed Sequences
        .{ .expected = "�����A��B", .input = "\xF4\x91\x92\x93\xFFA\x80\xBFB" },

        // Table 3-11. U+FFFD for Truncated Sequences
        .{ .expected = "����A", .input = "\xE1\x80\xE2\xF0\x91\x92\xF1\xBFA" },
    };
    for (cases) |case| {
        try testing.expectFmt(case.expected, "{f}", .{fmtUtf8(case.input)});
    }
}
918
/// Appends the UTF-8 (or, with `.can_encode_surrogate_half`, WTF-8) encoding of
/// `utf16le` to `result`.
/// Asserts that `result` already has at least `utf16le.len` bytes of unused capacity;
/// anything beyond that is allocated on demand.
fn utf16LeToUtf8ArrayListImpl(
    result: *std.array_list.Managed(u8),
    utf16le: []const u16,
    comptime surrogates: Surrogates,
) (switch (surrogates) {
    .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
    .can_encode_surrogate_half => Allocator.Error,
})!void {
    assert(result.unusedCapacitySlice().len >= utf16le.len);

    var rest = utf16le;
    ascii_prefix: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :ascii_prefix;
        const Units = @Vector(lanes, u16);
        const Bytes = @Vector(lanes, u8);
        // 0x7F as it appears in memory for a little-endian code unit.
        const ascii_bits: Units = @splat(mem.nativeToLittle(u16, 0x7F));

        // Fast path: copy whole vectors for as long as the input starts with ASCII.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const units: Units = rest[0..lanes].*;
            // Any bit set outside the low seven means a non-ASCII code unit.
            if (@reduce(.Or, (units | ascii_bits) != ascii_bits)) break;

            const narrowed: Bytes = @truncate(mem.nativeToLittle(Units, units));
            // The capacity asserted above covers one byte per code unit, which is
            // exactly what an ASCII-only prefix consumes.
            result.addManyAsArrayAssumeCapacity(lanes).* = narrowed;
        }
    }

    // Slow path: decode the remainder one codepoint at a time.
    switch (surrogates) {
        .cannot_encode_surrogate_half => {
            var codepoints = Utf16LeIterator.init(rest);
            while (try codepoints.nextCodepoint()) |cp| {
                const needed = utf8CodepointSequenceLength(cp) catch unreachable;
                const dest = try result.addManyAsSlice(needed);
                const written = utf8Encode(cp, dest) catch unreachable;
                assert(written == needed);
            }
        },
        .can_encode_surrogate_half => {
            var codepoints = Wtf16LeIterator.init(rest);
            while (codepoints.nextCodepoint()) |cp| {
                const needed = utf8CodepointSequenceLength(cp) catch unreachable;
                const dest = try result.addManyAsSlice(needed);
                const written = wtf8Encode(cp, dest) catch unreachable;
                assert(written == needed);
            }
        },
    }
}
968
/// Error set of the allocating UTF-16LE to UTF-8 conversions:
/// a conversion error or an allocation failure.
pub const Utf16LeToUtf8AllocError = Utf16LeToUtf8Error || Allocator.Error;

/// Appends the UTF-8 encoding of `utf16le` to `result`.
pub fn utf16LeToUtf8ArrayList(result: *std.array_list.Managed(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
    // The implementation asserts room for at least one byte per code unit.
    try result.ensureUnusedCapacity(utf16le.len);
    try utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
}
975
/// Converts `utf16le` to a newly allocated UTF-8 slice.
/// Caller owns returned memory.
pub fn utf16LeToUtf8Alloc(allocator: Allocator, utf16le: []const u16) Utf16LeToUtf8AllocError![]u8 {
    const List = std.array_list.Managed(u8);

    // Start with one byte per code unit, i.e. optimistically assume pure ASCII.
    var bytes = try List.initCapacity(allocator, utf16le.len);
    errdefer bytes.deinit();

    try utf16LeToUtf8ArrayListImpl(&bytes, utf16le, .cannot_encode_surrogate_half);
    return bytes.toOwnedSlice();
}
985
/// Converts `utf16le` to a newly allocated, 0-terminated UTF-8 slice.
/// Caller owns returned memory.
pub fn utf16LeToUtf8AllocZ(allocator: Allocator, utf16le: []const u16) Utf16LeToUtf8AllocError![:0]u8 {
    const List = std.array_list.Managed(u8);

    // One byte per code unit (optimistically assuming pure ASCII),
    // plus one for the terminator.
    var bytes = try List.initCapacity(allocator, utf16le.len + 1);
    errdefer bytes.deinit();

    try utf16LeToUtf8ArrayListImpl(&bytes, utf16le, .cannot_encode_surrogate_half);
    return bytes.toOwnedSliceSentinel(0);
}
995
/// Alias of `Utf16LeIterator.NextCodepointError`.
pub const Utf16LeToUtf8Error = Utf16LeIterator.NextCodepointError;

/// Encodes `utf16le` into `utf8` and returns the number of bytes written,
/// i.e. the end byte index into `utf8`.
/// Asserts that the output buffer is big enough.
fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surrogates) (switch (surrogates) {
    .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
    .can_encode_surrogate_half => error{},
})!usize {
    var written: usize = 0;
    var rest = utf16le;

    ascii_prefix: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :ascii_prefix;
        const Units = @Vector(lanes, u16);
        const Bytes = @Vector(lanes, u8);
        // 0x7F as it appears in memory for a little-endian code unit.
        const ascii_bits: Units = @splat(mem.nativeToLittle(u16, 0x7F));

        // Fast path: copy whole vectors for as long as the input starts with ASCII.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const units: Units = rest[0..lanes].*;
            // Any bit set outside the low seven means a non-ASCII code unit.
            if (@reduce(.Or, (units | ascii_bits) != ascii_bits)) break;

            const narrowed: Bytes = @truncate(mem.nativeToLittle(Units, units));
            utf8[written..][0..lanes].* = narrowed;
            written += lanes;
        }
    }

    // Slow path: decode the remainder one codepoint at a time.
    switch (surrogates) {
        .cannot_encode_surrogate_half => {
            var codepoints = Utf16LeIterator.init(rest);
            while (try codepoints.nextCodepoint()) |cp| {
                const n = utf8Encode(cp, utf8[written..]) catch |err| switch (err) {
                    // UTF-16 cannot encode anything above U+10FFFF, and the iterator
                    // only yields codepoints that were valid UTF-16, so the value is
                    // neither too large nor an unpaired surrogate.
                    error.CodepointTooLarge, error.Utf8CannotEncodeSurrogateHalf => unreachable,
                };
                written += n;
            }
        },
        .can_encode_surrogate_half => {
            var codepoints = Wtf16LeIterator.init(rest);
            while (codepoints.nextCodepoint()) |cp| {
                const n = wtf8Encode(cp, utf8[written..]) catch |err| switch (err) {
                    // UTF-16 cannot encode anything above U+10FFFF.
                    error.CodepointTooLarge => unreachable,
                };
                written += n;
            }
        },
    }
    return written;
}
1053
/// Encodes `utf16le` as UTF-8 into `utf8` and returns the number of bytes written.
/// The output buffer must be big enough for the whole result.
pub fn utf16LeToUtf8(utf8: []u8, utf16le: []const u16) Utf16LeToUtf8Error!usize {
    const end_index = try utf16LeToUtf8Impl(utf8, utf16le, .cannot_encode_surrogate_half);
    return end_index;
}
1057
test utf16LeToUtf8 {
    // Each case is a pair of UTF-16 code units (given as plain integers) and the
    // UTF-8 bytes they must convert to.
    const Case = struct { units: [2]u16, utf8: []const u8 };
    const cases = [_]Case{
        .{ .units = .{ 'A', 'a' }, .utf8 = "Aa" },
        .{ .units = .{ 0x80, 0xffff }, .utf8 = "\xc2\x80" ++ "\xef\xbf\xbf" },
        // the values just outside the surrogate half range
        .{ .units = .{ 0xd7ff, 0xe000 }, .utf8 = "\xed\x9f\xbf" ++ "\xee\x80\x80" },
        // smallest surrogate pair
        .{ .units = .{ 0xd800, 0xdc00 }, .utf8 = "\xf0\x90\x80\x80" },
        // largest surrogate pair
        .{ .units = .{ 0xdbff, 0xdfff }, .utf8 = "\xf4\x8f\xbf\xbf" },
        .{ .units = .{ 0xdbff, 0xdc00 }, .utf8 = "\xf4\x8f\xb0\x80" },
    };

    for (cases) |case| {
        // Store the code units in little-endian byte order regardless of the host.
        const utf16le = [2]u16{
            mem.nativeToLittle(u16, case.units[0]),
            mem.nativeToLittle(u16, case.units[1]),
        };
        const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);
        defer testing.allocator.free(utf8);
        try testing.expect(mem.eql(u8, utf8, case.utf8));
    }

    // A second (low) surrogate half with no preceding first half is rejected.
    {
        const lone_low = mem.nativeToLittle(u16, 0xdcdc);
        const utf16le = [2]u16{ lone_low, lone_low };
        const result = utf16LeToUtf8Alloc(testing.allocator, &utf16le);
        try testing.expectError(error.UnexpectedSecondSurrogateHalf, result);
    }
}
1120
/// Appends the UTF-16LE (or, with `.can_encode_surrogate_half`, WTF-16LE) encoding
/// of `utf8` to `result`.
/// Asserts that `result` already has at least `utf8.len` code units of unused capacity.
/// Fails if the part of the input left after the ASCII fast path does not validate.
fn utf8ToUtf16LeArrayListImpl(result: *std.array_list.Managed(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
    assert(result.unusedCapacitySlice().len >= utf8.len);

    var rest = utf8;
    ascii_prefix: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :ascii_prefix;
        const Bytes = @Vector(lanes, u8);
        const Units = @Vector(lanes, u16);
        const high_bit: Bytes = @splat(0x80);

        // Fast path: widen whole vectors for as long as the input starts with ASCII.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const bytes: Bytes = rest[0..lanes].*;
            // A set high bit marks a non-ASCII byte.
            if (@reduce(.Or, (bytes & high_bit) == high_bit)) break;

            result.addManyAsArrayAssumeCapacity(lanes).* = mem.nativeToLittle(Units, bytes);
        }
    }

    // Slow path: validate the remainder, then transcode codepoint by codepoint.
    const view = switch (surrogates) {
        .cannot_encode_surrogate_half => try Utf8View.init(rest),
        .can_encode_surrogate_half => try Wtf8View.init(rest),
    };
    var codepoints = view.iterator();
    while (codepoints.nextCodepoint()) |cp| {
        if (cp >= 0x10000) {
            // Outside the Basic Multilingual Plane: emit a surrogate pair.
            const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
            try result.appendSlice(&.{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) });
        } else {
            try result.append(mem.nativeToLittle(u16, @intCast(cp)));
        }
    }
}
1158
/// Appends the UTF-16LE encoding of `utf8` to `result`.
pub fn utf8ToUtf16LeArrayList(result: *std.array_list.Managed(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
    // The implementation asserts room for at least one code unit per input byte.
    try result.ensureUnusedCapacity(utf8.len);
    try utf8ToUtf16LeArrayListImpl(result, utf8, .cannot_encode_surrogate_half);
}
1163
/// Converts `utf8` to a newly allocated UTF-16LE slice.
/// Caller owns returned memory.
pub fn utf8ToUtf16LeAlloc(allocator: Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![]u16 {
    const List = std.array_list.Managed(u16);

    // Start with one code unit per input byte, optimistically assuming that
    // no surrogate pairs will be required.
    var units = try List.initCapacity(allocator, utf8.len);
    errdefer units.deinit();

    try utf8ToUtf16LeArrayListImpl(&units, utf8, .cannot_encode_surrogate_half);
    return units.toOwnedSlice();
}
1172
/// Converts `utf8` to a newly allocated, 0-terminated UTF-16LE slice.
/// Caller owns returned memory.
pub fn utf8ToUtf16LeAllocZ(allocator: Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![:0]u16 {
    const List = std.array_list.Managed(u16);

    // One code unit per input byte (optimistically assuming that no surrogate
    // pairs will be required), plus one for the terminator.
    var units = try List.initCapacity(allocator, utf8.len + 1);
    errdefer units.deinit();

    try utf8ToUtf16LeArrayListImpl(&units, utf8, .cannot_encode_surrogate_half);
    return units.toOwnedSliceSentinel(0);
}
1181
/// Encodes `utf8` as UTF-16LE into `utf16le`.
/// Returns the index one past the last code unit written; when the output fits
/// exactly, that index equals `utf16le.len`.
/// Assumes there is enough space for the output.
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) error{InvalidUtf8}!usize {
    const end_index = try utf8ToUtf16LeImpl(utf16le, utf8, .cannot_encode_surrogate_half);
    return end_index;
}
1187
/// Encodes `utf8` into `utf16le` as UTF-16LE (or, with `.can_encode_surrogate_half`,
/// WTF-16LE) and returns the number of code units written.
/// The output buffer is indexed without any capacity check, so it must be large enough.
/// Fails if the part of the input left after the ASCII fast path does not validate.
pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {
    var written: usize = 0;
    var rest = utf8;

    ascii_prefix: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :ascii_prefix;
        const Bytes = @Vector(lanes, u8);
        const Units = @Vector(lanes, u16);
        const high_bit: Bytes = @splat(0x80);

        // Fast path: widen whole vectors for as long as the input starts with ASCII.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const bytes: Bytes = rest[0..lanes].*;
            // A set high bit marks a non-ASCII byte.
            if (@reduce(.Or, (bytes & high_bit) == high_bit)) break;

            utf16le[written..][0..lanes].* = mem.nativeToLittle(Units, bytes);
            written += lanes;
        }
    }

    // Slow path: validate the remainder, then transcode codepoint by codepoint.
    const view = switch (surrogates) {
        .cannot_encode_surrogate_half => try Utf8View.init(rest),
        .can_encode_surrogate_half => try Wtf8View.init(rest),
    };
    var codepoints = view.iterator();
    while (codepoints.nextCodepoint()) |cp| {
        if (cp >= 0x10000) {
            // Outside the Basic Multilingual Plane: emit a surrogate pair.
            const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
            utf16le[written..][0..2].* = .{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
            written += 2;
        } else {
            utf16le[written] = mem.nativeToLittle(u16, @intCast(cp));
            written += 1;
        }
    }
    return written;
}
1229
test utf8ToUtf16Le {
    var buf: [128]u16 = undefined;

    // Codepoints above U+FFFF become surrogate pairs.
    {
        const n = try utf8ToUtf16Le(&buf, "𐐷");
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(buf[0..n]));
    }
    {
        const n = try utf8ToUtf16Le(&buf, "\u{10FFFF}");
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(buf[0..n]));
    }
    // This four-byte sequence is rejected as invalid UTF-8.
    {
        const result = utf8ToUtf16Le(&buf, "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, result);
    }
    // A long ASCII run followed by one non-ASCII character exercises the
    // vectorized path together with the scalar tail.
    {
        const ascii = "This string has been designed to test the vectorized implementat" ++
            "ion by beginning with one hundred twenty-seven ASCII characters";
        // Expected little-endian bytes: every character followed by a zero byte.
        const expected = comptime blk: {
            var bytes: [2 * (ascii.len + 1)]u8 = undefined;
            for (ascii, 0..) |c, i| {
                bytes[2 * i] = c;
                bytes[2 * i + 1] = 0;
            }
            bytes[2 * ascii.len] = '¡';
            bytes[2 * ascii.len + 1] = 0;
            break :blk bytes;
        };
        const n = try utf8ToUtf16Le(&buf, ascii ++ "¡");
        try testing.expectEqualSlices(u8, &expected, mem.sliceAsBytes(buf[0..n]));
    }
}
1259
test utf8ToUtf16LeArrayList {
    const List = std.array_list.Managed(u16);

    // Codepoints above U+FFFF become surrogate pairs.
    {
        var units = List.init(testing.allocator);
        defer units.deinit();
        try utf8ToUtf16LeArrayList(&units, "𐐷");
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(units.items));
    }
    {
        var units = List.init(testing.allocator);
        defer units.deinit();
        try utf8ToUtf16LeArrayList(&units, "\u{10FFFF}");
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(units.items));
    }
    // This four-byte sequence is rejected as invalid UTF-8.
    {
        var units = List.init(testing.allocator);
        defer units.deinit();
        try testing.expectError(error.InvalidUtf8, utf8ToUtf16LeArrayList(&units, "\xf4\x90\x80\x80"));
    }
}
1280
test utf8ToUtf16LeAlloc {
    const gpa = testing.allocator;

    // Codepoints above U+FFFF become surrogate pairs.
    {
        const units = try utf8ToUtf16LeAlloc(gpa, "𐐷");
        defer gpa.free(units);
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(units));
    }
    {
        const units = try utf8ToUtf16LeAlloc(gpa, "\u{10FFFF}");
        defer gpa.free(units);
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(units));
    }
    // This four-byte sequence is rejected as invalid UTF-8.
    {
        try testing.expectError(error.InvalidUtf8, utf8ToUtf16LeAlloc(gpa, "\xf4\x90\x80\x80"));
    }
}
1297
test utf8ToUtf16LeAllocZ {
    const gpa = testing.allocator;

    // Codepoints above U+FFFF become surrogate pairs; the terminator follows them.
    {
        const units = try utf8ToUtf16LeAllocZ(gpa, "𐐷");
        defer gpa.free(units);
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(units));
        try testing.expect(units[2] == 0);
    }
    {
        const units = try utf8ToUtf16LeAllocZ(gpa, "\u{10FFFF}");
        defer gpa.free(units);
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(units));
        try testing.expect(units[2] == 0);
    }
    // This four-byte sequence is rejected as invalid UTF-8.
    {
        try testing.expectError(error.InvalidUtf8, utf8ToUtf16LeAllocZ(gpa, "\xf4\x90\x80\x80"));
    }
    // A long ASCII run followed by one non-ASCII character exercises the
    // vectorized path together with the scalar tail.
    {
        const ascii = "This string has been designed to test the vectorized implementat" ++
            "ion by beginning with one hundred twenty-seven ASCII characters";
        // Expected little-endian bytes: every character followed by a zero byte.
        const expected = comptime blk: {
            var bytes: [2 * (ascii.len + 1)]u8 = undefined;
            for (ascii, 0..) |c, i| {
                bytes[2 * i] = c;
                bytes[2 * i + 1] = 0;
            }
            bytes[2 * ascii.len] = '¡';
            bytes[2 * ascii.len + 1] = 0;
            break :blk bytes;
        };
        const units = try utf8ToUtf16LeAllocZ(gpa, ascii ++ "¡");
        defer gpa.free(units);
        try testing.expectEqualSlices(u8, &expected, mem.sliceAsBytes(units));
    }
}
1331
test "ArrayList functions on a re-used list" {
    // Every section seeds a list that has exactly enough capacity for its initial
    // contents, then appends a conversion result to it.
    const seed_utf8 = "abcdefg";
    const tail_utf8 = "hijklmnopqrstuvwyxz";
    const whole_utf8 = "abcdefghijklmnopqrstuvwyxz";
    const U8List = std.array_list.Managed(u8);
    const U16List = std.array_list.Managed(u16);

    // utf8ToUtf16LeArrayList
    {
        var units = U16List.init(testing.allocator);
        defer units.deinit();

        const seed = utf8ToUtf16LeStringLiteral(seed_utf8);
        try units.ensureTotalCapacityPrecise(seed.len);
        units.appendSliceAssumeCapacity(seed);

        try utf8ToUtf16LeArrayList(&units, tail_utf8);

        try testing.expectEqualSlices(u16, utf8ToUtf16LeStringLiteral(whole_utf8), units.items);
    }

    // utf16LeToUtf8ArrayList
    {
        var bytes = U8List.init(testing.allocator);
        defer bytes.deinit();

        try bytes.ensureTotalCapacityPrecise(seed_utf8.len);
        bytes.appendSliceAssumeCapacity(seed_utf8);

        try utf16LeToUtf8ArrayList(&bytes, utf8ToUtf16LeStringLiteral(tail_utf8));

        try testing.expectEqualStrings(whole_utf8, bytes.items);
    }

    // wtf8ToWtf16LeArrayList
    {
        var units = U16List.init(testing.allocator);
        defer units.deinit();

        const seed = utf8ToUtf16LeStringLiteral(seed_utf8);
        try units.ensureTotalCapacityPrecise(seed.len);
        units.appendSliceAssumeCapacity(seed);

        try wtf8ToWtf16LeArrayList(&units, tail_utf8);

        try testing.expectEqualSlices(u16, utf8ToUtf16LeStringLiteral(whole_utf8), units.items);
    }

    // wtf16LeToWtf8ArrayList
    {
        var bytes = U8List.init(testing.allocator);
        defer bytes.deinit();

        try bytes.ensureTotalCapacityPrecise(seed_utf8.len);
        bytes.appendSliceAssumeCapacity(seed_utf8);

        try wtf16LeToWtf8ArrayList(&bytes, utf8ToUtf16LeStringLiteral(tail_utf8));

        try testing.expectEqualStrings(whole_utf8, bytes.items);
    }
}
1389
/// Converts the comptime string `utf8` into a 0-terminated UTF-16LE (or, with
/// `.can_encode_surrogate_half`, WTF-16LE) array and returns a pointer to it.
/// Input that cannot be converted is a compile error whose message is the name
/// of the error that was hit.
fn utf8ToUtf16LeStringLiteralImpl(comptime utf8: []const u8, comptime surrogates: Surrogates) *const [calcUtf16LeLenImpl(utf8, surrogates) catch |err| @compileError(@errorName(err)):0]u16 {
    return comptime blk: {
        // Cannot fail here: the same call already succeeded while the return type
        // was being computed.
        const len: usize = calcUtf16LeLenImpl(utf8, surrogates) catch unreachable;
        var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
        // `@compileError` takes a string, so report the error by name.
        const utf16le_len = utf8ToUtf16LeImpl(&utf16le, utf8[0..], surrogates) catch |err| @compileError(@errorName(err));
        assert(len == utf16le_len);
        // Copy into a constant so the returned pointer refers to immutable data.
        const final = utf16le;
        break :blk &final;
    };
}
1400
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
/// Input for which `calcUtf16LeLen` fails is a compile error whose message is
/// the name of that error.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(utf8, .cannot_encode_surrogate_half);
}
1405
/// Converts a WTF-8 string literal into a WTF-16LE string literal.
/// Input for which `calcWtf16LeLen` fails is a compile error whose message is
/// the name of that error.
pub fn wtf8ToWtf16LeStringLiteral(comptime wtf8: []const u8) *const [calcWtf16LeLen(wtf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(wtf8, .can_encode_surrogate_half);
}
1410
/// Returns how many UTF-16 code units are needed to encode `utf8`.
/// With `.can_encode_surrogate_half` the input is decoded as WTF-8 instead of UTF-8.
/// Errors: an invalid start byte, any error of the selected decoder, or
/// `error.Utf8ExpectedContinuation` when the input ends in the middle of a sequence.
pub fn calcUtf16LeLenImpl(utf8: []const u8, comptime surrogates: Surrogates) !usize {
    const utf8DecodeImpl = switch (surrogates) {
        .cannot_encode_surrogate_half => utf8Decode,
        .can_encode_surrogate_half => wtf8Decode,
    };
    var src_i: usize = 0;
    var dest_len: usize = 0;
    while (src_i < utf8.len) {
        const n = try utf8ByteSequenceLength(utf8[src_i]);
        const next_src_i = src_i + n;
        // A truncated trailing sequence must be reported as an error rather than
        // slicing past the end of the input.
        if (next_src_i > utf8.len) return error.Utf8ExpectedContinuation;
        const codepoint = try utf8DecodeImpl(utf8[src_i..next_src_i]);
        // Codepoints above U+FFFF need a surrogate pair, i.e. two code units.
        const units_needed: usize = if (codepoint < 0x10000) 1 else 2;
        dest_len += units_needed;
        src_i = next_src_i;
    }
    return dest_len;
}
1431
/// Errors `calcUtf16LeLen` can return.
const CalcUtf16LeLenError = error{Utf8InvalidStartByte} || Utf8DecodeError;

/// Returns the length of `utf8` once encoded as UTF-16LE, counted in `u16` code units
/// (the length of the resulting `[]u16`). The length in bytes is twice that.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
    const len16 = try calcUtf16LeLenImpl(utf8, .cannot_encode_surrogate_half);
    return len16;
}
1439
/// Errors `calcWtf16LeLen` can return.
const CalcWtf16LeLenError = error{Utf8InvalidStartByte} || Wtf8DecodeError;

/// Returns the length of `wtf8` once encoded as WTF-16LE, counted in `u16` code units
/// (the length of the resulting `[]u16`). The length in bytes is twice that.
pub fn calcWtf16LeLen(wtf8: []const u8) CalcWtf16LeLenError!usize {
    const len16 = try calcUtf16LeLenImpl(wtf8, .can_encode_surrogate_half);
    return len16;
}
1447
/// Shared checks for the length calculators; `calcUtf16LeLenImpl_` is the function
/// under test and is called with a single string argument.
fn testCalcUtf16LeLenImpl(calcUtf16LeLenImpl_: anytype) !void {
    const Case = struct { text: []const u8, len16: usize };
    const cases = [_]Case{
        .{ .text = "a", .len16 = 1 },
        .{ .text = "abcdefghij", .len16 = 10 },
        .{ .text = "äåéëþüúíóö", .len16 = 10 },
        .{ .text = "こんにちは", .len16 = 5 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.len16, try calcUtf16LeLenImpl_(case.text));
    }
}
1454
test calcUtf16LeLen {
    // Run the shared checks both at compile time and at run time.
    try comptime testCalcUtf16LeLenImpl(calcUtf16LeLen);
    try testCalcUtf16LeLenImpl(calcUtf16LeLen);
}
1459
test calcWtf16LeLen {
    // Run the shared checks both at compile time and at run time.
    try comptime testCalcUtf16LeLenImpl(calcWtf16LeLen);
    try testCalcUtf16LeLenImpl(calcWtf16LeLen);
}
1464
/// Writes `utf16le` to `writer` as UTF-8.
/// A codepoint that cannot be decoded or encoded is written as the replacement
/// character (U+FFFD); this is how unpaired surrogates are handled.
fn formatUtf16Le(utf16le: []const u16, writer: *std.Io.Writer) std.Io.Writer.Error!void {
    // Longest UTF-8 encoding of a single codepoint.
    const max_encoded_len = 4;

    var staging: [300]u8 = undefined; // just an arbitrary size
    var used: usize = 0;
    var codepoints = Utf16LeIterator.init(utf16le);

    while (true) {
        const next: ?u21 = codepoints.nextCodepoint() catch replacement_character;
        const codepoint = next orelse break;

        const tail = staging[used..];
        const n = utf8Encode(codepoint, tail) catch
            (utf8Encode(replacement_character, tail) catch unreachable);
        used += n;

        // Flush early so the next codepoint always has room, whatever its length.
        if (used + max_encoded_len > staging.len) {
            try writer.writeAll(staging[0..used]);
            used = 0;
        }
    }
    try writer.writeAll(staging[0..used]);
}
1482
/// Wraps a (potentially ill-formed) UTF-16 LE string so that printing it with `{f}`
/// goes through `formatUtf16Le`, which converts it to UTF-8 while formatting.
/// Unpaired surrogates are replaced by the replacement character (U+FFFD).
pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Alt([]const u16, formatUtf16Le) {
    const Formatter = std.fmt.Alt([]const u16, formatUtf16Le);
    return Formatter{ .data = utf16le };
}
1489
test fmtUtf16Le {
    const expectFmt = testing.expectFmt;

    // Well-formed input converts to the same text in UTF-8.
    try expectFmt("", "{f}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
    try expectFmt("", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral(""))});
    try expectFmt("foo", "{f}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
    try expectFmt("foo", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("foo"))});
    try expectFmt("𐐷", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("𐐷"))});

    // The code units on either side of the surrogate range (U+D7FF and U+E000)
    // are ordinary codepoints. They do not render visibly, so the expected text
    // is spelled with escapes.
    try expectFmt("\u{D7FF}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
    try expectFmt("\u{E000}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});

    // Every unpaired surrogate is printed as U+FFFD.
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xdc", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdf", native_endian)})});
}
1504
/// Shared checks for the string-literal converters; `utf8ToUtf16LeStringLiteral_` is
/// the function under test. Each case checks the code units and the 0 terminator
/// that follows them.
fn testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral_: anytype) !void {
    {
        const literal = utf8ToUtf16LeStringLiteral_("A");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x41)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    {
        const literal = utf8ToUtf16LeStringLiteral_("𐐷");
        const expected = [_]u16{
            mem.nativeToLittle(u16, 0xD801),
            mem.nativeToLittle(u16, 0xDC37),
        };
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[2] == 0);
    }
    {
        const literal = utf8ToUtf16LeStringLiteral_("\u{02FF}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x02FF)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    {
        const literal = utf8ToUtf16LeStringLiteral_("\u{7FF}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x7FF)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    {
        const literal = utf8ToUtf16LeStringLiteral_("\u{801}");
        const expected = [_]u16{mem.nativeToLittle(u16, 0x801)};
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[1] == 0);
    }
    {
        const literal = utf8ToUtf16LeStringLiteral_("\u{10FFFF}");
        const expected = [_]u16{
            mem.nativeToLittle(u16, 0xDBFF),
            mem.nativeToLittle(u16, 0xDFFF),
        };
        try testing.expectEqualSlices(u16, &expected, literal);
        try testing.expect(literal[2] == 0);
    }
}
1557
test utf8ToUtf16LeStringLiteral {
    // Run the shared literal checks against the UTF-8 entry point.
    const function_under_test = utf8ToUtf16LeStringLiteral;
    try testUtf8ToUtf16LeStringLiteral(function_under_test);
}
1561
test wtf8ToWtf16LeStringLiteral {
    // Run the shared literal checks against the WTF-8 entry point.
    const function_under_test = wtf8ToWtf16LeStringLiteral;
    try testUtf8ToUtf16LeStringLiteral(function_under_test);
}
1565
/// Checks `utf8CountCodepoints` on ASCII, two-byte and three-byte text.
fn testUtf8CountCodepoints() !void {
    const Case = struct { text: []const u8, count: usize };
    const cases = [_]Case{
        .{ .text = "abcdefghij", .count = 10 },
        .{ .text = "äåéëþüúíóö", .count = 10 },
        .{ .text = "こんにちは", .count = 5 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.count, try utf8CountCodepoints(case.text));
    }
    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
}
1572
test "utf8 count codepoints" {
    // Run the checks both at compile time and at run time.
    try comptime testUtf8CountCodepoints();
    try testUtf8CountCodepoints();
}
1577
/// Checks `utf8ValidCodepoint` on accepted codepoints and on rejected ones
/// (both ends of the surrogate range and the first value above U+10FFFF).
fn testUtf8ValidCodepoint() !void {
    const accepted = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
    const rejected = [_]u21{ 0xd800, 0xdfff, 0x110000 };

    for (accepted) |codepoint| {
        try testing.expect(utf8ValidCodepoint(codepoint));
    }
    for (rejected) |codepoint| {
        try testing.expect(!utf8ValidCodepoint(codepoint));
    }
}
1588
test "utf8 valid codepoint" {
    // Run the checks both at compile time and at run time.
    try comptime testUtf8ValidCodepoint();
    try testUtf8ValidCodepoint();
}
1593
/// Returns true if the codepoint is a surrogate (U+D800 to U+DFFF).
/// That range covers both the high surrogates (U+D800 to U+DBFF) and the
/// low surrogates (U+DC00 to U+DFFF).
pub fn isSurrogateCodepoint(c: u21) bool {
    return switch (c) {
        0xD800...0xDFFF => true,
        else => false,
    };
}
1601
/// Writes the WTF-8 encoding of codepoint `c` to `out`; surrogate halves are accepted.
/// c: the codepoint.
/// out: the destination buffer. Must have a len >= utf8CodepointSequenceLength(c).
/// Errors: `error.CodepointTooLarge` if c cannot be encoded in WTF-8.
/// Returns: the number of bytes written to out.
pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
    const written = try utf8EncodeImpl(c, out, .can_encode_surrogate_half);
    return written;
}
1610
/// Errors `wtf8Decode` can return: the union of the errors of the two-, three-
/// (surrogate halves allowed) and four-byte decoders.
const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;

/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
/// Decodes the single codepoint stored in `bytes`; `bytes.len` selects the decoder
/// and must be between 1 and 4.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
    switch (bytes.len) {
        1 => return bytes[0],
        2 => return utf8Decode2(bytes[0..2].*),
        3 => return utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
        4 => return utf8Decode4(bytes[0..4].*),
        else => unreachable,
    }
}
1623
/// Returns true if `input` consists entirely of WTF-8 codepoints, i.e. it obeys
/// all the restrictions of UTF-8 except that the surrogate codepoints
/// U+D800 to U+DFFF are allowed.
/// This is not a check for well-formed WTF-8: it does not check that all
/// surrogate halves are unpaired.
pub fn wtf8ValidateSlice(input: []const u8) bool {
    const is_valid = utf8ValidateSliceImpl(input, .can_encode_surrogate_half);
    return is_valid;
}
1632
test "validate WTF-8 slice" {
    try testValidateWtf8Slice();
    try comptime testValidateWtf8Slice();

    // Validation skips chunks of ASCII whose size depends on the recommended
    // vector size. Check every starting offset of a long ASCII run that ends in
    // an invalid byte to make sure the chunking is correct.
    const input = [_]u8{'a'} ** 550 ++ "\xc0";
    var start: usize = 0;
    while (start < input.len - 3) : (start += 1) {
        try testing.expect(!wtf8ValidateSlice(input[start..]));
    }
}
/// Checks `wtf8ValidateSlice` on inputs that UTF-8 and WTF-8 treat alike, and on
/// encoded surrogate codepoints, which only WTF-8 accepts.
fn testValidateWtf8Slice() !void {
    // These are valid under both UTF-8 and WTF-8 rules.
    const valid = [_][]const u8{
        "abc",
        "abc\xdf\xbf",
        "",
        "a",
        "Ж",
        "ЖЖ",
        "брэд-ЛГТМ",
        "☺☻☹",
        "a\u{fffdb}",
        "\xf4\x8f\xbf\xbf",
    };
    for (valid) |input| {
        try testing.expect(wtf8ValidateSlice(input));
    }

    // These are invalid under both UTF-8 and WTF-8 rules.
    const invalid = [_][]const u8{
        "abc\xc0",
        "abc\xc0abc",
        "aa\xe2",
        "\x42\xfa",
        "\x42\xfa\x43",
        "\xf4\x90\x80\x80",
        "\xf7\xbf\xbf\xbf",
        "\xfb\xbf\xbf\xbf\xbf",
        "\xc0\x80",
    };
    for (invalid) |input| {
        try testing.expect(!wtf8ValidateSlice(input));
    }

    // But surrogate codepoints are only valid in WTF-8.
    const surrogates = [_][]const u8{ "\xed\xa0\x80", "\xed\xbf\xbf" };
    for (surrogates) |input| {
        try testing.expect(wtf8ValidateSlice(input));
    }
}
1675
/// A view over a WTF-8 encoded string whose code points, surrogate halves
/// included, can be walked with a `Wtf8Iterator`.
///
/// ```
/// var wtf8 = (try std.unicode.Wtf8View.init("hi there")).iterator();
/// while (wtf8.nextCodepointSlice()) |codepoint| {
///     // note: codepoint could be a surrogate half which is invalid
///     // UTF-8, avoid printing or otherwise sending/emitting this directly
/// }
/// ```
pub const Wtf8View = struct {
    bytes: []const u8,

    /// Wraps `s` after checking it with `wtf8ValidateSlice`.
    /// Returns `error.InvalidWtf8` when that check fails.
    pub fn init(s: []const u8) error{InvalidWtf8}!Wtf8View {
        return if (wtf8ValidateSlice(s)) initUnchecked(s) else error.InvalidWtf8;
    }

    /// Wraps `s` without performing any validation.
    pub fn initUnchecked(s: []const u8) Wtf8View {
        return .{ .bytes = s };
    }

    /// Compile-time variant of `init`: a string that fails validation becomes
    /// a compile error instead of an error value.
    pub inline fn initComptime(comptime s: []const u8) Wtf8View {
        return comptime init(s) catch |err| switch (err) {
            error.InvalidWtf8 => @compileError("invalid wtf8"),
        };
    }

    /// Returns an iterator positioned at the first byte of the view.
    pub fn iterator(s: Wtf8View) Wtf8Iterator {
        return .{ .bytes = s.bytes, .i = 0 };
    }
};
1716
/// Iterates over the encoded code points of `bytes`.
/// Asserts that `bytes` is valid WTF-8: decoding failures are `unreachable`.
pub const Wtf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next encoded code point and advances past
    /// them, or `null` once the end of `bytes` has been reached.
    pub fn nextCodepointSlice(it: *Wtf8Iterator) ?[]const u8 {
        const start = it.i;
        if (start >= it.bytes.len) return null;

        // The first byte alone determines how long the sequence is.
        const seq_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        it.i = start + seq_len;
        return it.bytes[start..it.i];
    }

    /// Returns the next code point decoded with `wtf8Decode` and advances
    /// past it, or `null` once the end of `bytes` has been reached.
    pub fn nextCodepoint(it: *Wtf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |encoded| {
            return wtf8Decode(encoded) catch unreachable;
        }
        return null;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Wtf8Iterator, n: usize) []const u8 {
        const start = it.i;
        // Peeking must not consume input: put the position back on every exit path.
        defer it.i = start;

        var byte_count: usize = 0;
        for (0..n) |_| {
            const encoded = it.nextCodepointSlice() orelse return it.bytes[start..];
            byte_count += encoded.len;
        }
        return it.bytes[start..][0..byte_count];
    }
};
1753
/// Converts `utf16le` into `result` via `utf16LeToUtf8ArrayListImpl`, with
/// surrogate halves permitted.
/// Returns an allocation error if `result` cannot grow.
pub fn wtf16LeToWtf8ArrayList(result: *std.array_list.Managed(u8), utf16le: []const u16) Allocator.Error!void {
    // Reserve room for one byte per input code unit before converting.
    const min_unused = utf16le.len;
    try result.ensureUnusedCapacity(min_unused);
    try utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
}
1758
/// Converts `wtf16le` into a newly allocated byte slice, with surrogate
/// halves permitted.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8Alloc(allocator: Allocator, wtf16le: []const u16) Allocator.Error![]u8 {
    const ByteList = std.array_list.Managed(u8);

    // Optimistic starting capacity: one byte per code unit, which is exact
    // when the input is all ASCII.
    var list = try ByteList.initCapacity(allocator, wtf16le.len);
    errdefer list.deinit();

    try utf16LeToUtf8ArrayListImpl(&list, wtf16le, .can_encode_surrogate_half);
    return list.toOwnedSlice();
}
1768
/// Converts `wtf16le` into a newly allocated, 0-terminated byte slice, with
/// surrogate halves permitted.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8AllocZ(allocator: Allocator, wtf16le: []const u16) Allocator.Error![:0]u8 {
    const ByteList = std.array_list.Managed(u8);

    // Optimistic starting capacity: one byte per code unit (the all-ASCII
    // case) plus one for the terminator.
    var list = try ByteList.initCapacity(allocator, wtf16le.len + 1);
    errdefer list.deinit();

    try utf16LeToUtf8ArrayListImpl(&list, wtf16le, .can_encode_surrogate_half);
    return list.toOwnedSliceSentinel(0);
}
1778
/// Converts `wtf16le` into the caller-provided buffer `wtf8`, with surrogate
/// halves permitted, and returns the `usize` produced by `utf16LeToUtf8Impl`.
/// This conversion cannot fail.
pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {
    // An empty switch is only exhaustive for an empty error set, so this line
    // stops compiling if the implementation ever gains an error in this mode.
    const written = utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};
    return written;
}
1782
/// Converts `wtf8` into `result` via `utf8ToUtf16LeArrayListImpl`, with
/// surrogate halves permitted.
/// Fails with `error.InvalidWtf8` or `error.OutOfMemory`.
pub fn wtf8ToWtf16LeArrayList(result: *std.array_list.Managed(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
    // Reserve room for one code unit per input byte before converting.
    const min_unused = wtf8.len;
    try result.ensureUnusedCapacity(min_unused);
    try utf8ToUtf16LeArrayListImpl(result, wtf8, .can_encode_surrogate_half);
}
1787
/// Converts `wtf8` into a newly allocated slice of 16-bit code units, with
/// surrogate halves permitted.
/// Caller must free returned memory.
pub fn wtf8ToWtf16LeAlloc(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u16 {
    const UnitList = std.array_list.Managed(u16);

    // Optimistic starting capacity: assume no surrogate pairs will be needed.
    var list = try UnitList.initCapacity(allocator, wtf8.len);
    errdefer list.deinit();

    try utf8ToUtf16LeArrayListImpl(&list, wtf8, .can_encode_surrogate_half);
    return list.toOwnedSlice();
}
1796
/// Converts `wtf8` into a newly allocated, 0-terminated slice of 16-bit code
/// units, with surrogate halves permitted.
/// Caller must free returned memory.
pub fn wtf8ToWtf16LeAllocZ(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![:0]u16 {
    const UnitList = std.array_list.Managed(u16);

    // Optimistic starting capacity: assume no surrogate pairs will be needed,
    // plus one slot for the terminator.
    var list = try UnitList.initCapacity(allocator, wtf8.len + 1);
    errdefer list.deinit();

    try utf8ToUtf16LeArrayListImpl(&list, wtf8, .can_encode_surrogate_half);
    return list.toOwnedSliceSentinel(0);
}
1805
/// Converts `wtf8` into the caller-provided buffer `wtf16le`, with surrogate
/// halves permitted.
/// Returns index of next character. If exact fit, returned index equals output slice length.
/// Assumes there is enough space for the output.
pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) error{InvalidWtf8}!usize {
    const end_index = try utf8ToUtf16LeImpl(wtf16le, wtf8, .can_encode_surrogate_half);
    return end_index;
}
1811
/// Copies `wtf8` into `utf8`, writing the Unicode replacement character
/// (U+FFFD) in place of every surrogate codepoint (U+D800 to U+DFFF).
/// A surrogate codepoint and the replacement character both take three
/// bytes, so the output always has the same length as the input.
/// Asserts that `utf8` is at least as long as `wtf8`.
/// In-place conversion is supported when `utf8` and `wtf8` refer to the same slice.
/// Note: If `wtf8` is entirely composed of well-formed UTF-8, then no conversion is necessary.
/// `utf8ValidateSlice` can be used to check if lossy conversion is worthwhile.
/// If `wtf8` is not valid WTF-8, then `error.InvalidWtf8` is returned.
pub fn wtf8ToUtf8Lossy(utf8: []u8, wtf8: []const u8) error{InvalidWtf8}!void {
    assert(utf8.len >= wtf8.len);

    // U+FFFD is encoded once at compile time; the assert pins its length to 3.
    const replacement_char_bytes = comptime blk: {
        var encoded: [3]u8 = undefined;
        const encoded_len = utf8Encode(replacement_character, &encoded) catch unreachable;
        assert(encoded_len == 3);
        break :blk encoded;
    };
    const shares_storage = utf8.ptr == wtf8.ptr;

    var write_index: usize = 0;
    var it = (try Wtf8View.init(wtf8)).iterator();
    // Every sequence occupies the same number of bytes in the output as in
    // the input, so the write position always advances by the sequence length.
    while (it.nextCodepointSlice()) |sequence| : (write_index += sequence.len) {
        // All surrogate codepoints are encoded as 3 bytes, so only sequences
        // of that length need to be decoded.
        const is_surrogate = sequence.len == 3 and
            isSurrogateCodepoint(wtf8Decode(sequence) catch unreachable);

        if (is_surrogate) {
            @memcpy(utf8[write_index..][0..replacement_char_bytes.len], &replacement_char_bytes);
        } else if (!shares_storage) {
            // When converting in place the bytes are already where they belong.
            @memcpy(utf8[write_index..][0..sequence.len], sequence);
        }
    }
}
1849
/// Allocating variant of `wtf8ToUtf8Lossy`: the result is written into a new
/// buffer of `wtf8.len` bytes.
/// Caller must free returned memory.
pub fn wtf8ToUtf8LossyAlloc(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u8 {
    const converted = try allocator.alloc(u8, wtf8.len);
    // Release the buffer again if the input turns out not to be valid WTF-8.
    errdefer allocator.free(converted);

    try wtf8ToUtf8Lossy(converted, wtf8);
    return converted;
}
1858
/// Allocating variant of `wtf8ToUtf8Lossy` that returns a 0-terminated slice:
/// the result is written into a new sentinel-terminated buffer of `wtf8.len` bytes.
/// Caller must free returned memory.
pub fn wtf8ToUtf8LossyAllocZ(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![:0]u8 {
    const converted = try allocator.allocSentinel(u8, wtf8.len, 0);
    // Release the buffer again if the input turns out not to be valid WTF-8.
    errdefer allocator.free(converted);

    try wtf8ToUtf8Lossy(converted, wtf8);
    return converted;
}
1867
test wtf8ToUtf8Lossy {
    var buf: [32]u8 = undefined;

    // Bytes that are not valid WTF-8 are rejected rather than converted.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8Lossy(&buf, "\xff"));

    const Case = struct { input: []const u8, expected: []const u8 };
    const cases = [_]Case{
        // plain ASCII passes through unchanged
        .{ .input = "abcd", .expected = "abcd" },
        // lone high surrogate half
        .{ .input = "ab\xed\xa0\xbdcd", .expected = "ab\u{FFFD}cd" },
        // lone low surrogate half
        .{ .input = "ab\xed\xb2\xa9cd", .expected = "ab\u{FFFD}cd" },
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        .{ .input = "ab\xed\xa0\xbd\xed\xb2\xa9cd", .expected = "ab\u{FFFD}\u{FFFD}cd" },
    };
    for (cases) |case| {
        try wtf8ToUtf8Lossy(&buf, case.input);
        // The output always has exactly as many bytes as the input.
        try testing.expectEqualStrings(case.expected, buf[0..case.input.len]);
    }

    // In place: the same slice serves as both destination and source.
    const low_surrogate_half = "ab\xed\xb2\xa9cd";
    const slice = buf[0..low_surrogate_half.len];
    @memcpy(slice, low_surrogate_half);
    try wtf8ToUtf8Lossy(slice, slice);
    try testing.expectEqualStrings("ab\u{FFFD}cd", slice);
}
1898
test wtf8ToUtf8LossyAlloc {
    // Bytes that are not valid WTF-8 are rejected; the testing allocator
    // additionally reports any buffer leaked on that error path.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAlloc(testing.allocator, "\xff"));

    const inputs = [_][]const u8{
        // plain ASCII
        "abcd",
        // lone surrogate half
        "ab\xed\xa0\xbdcd",
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        "ab\xed\xa0\xbd\xed\xb2\xa9cd",
    };
    const expected = [_][]const u8{
        "abcd",
        "ab\u{FFFD}cd",
        "ab\u{FFFD}\u{FFFD}cd",
    };
    for (inputs, expected) |input, want| {
        const utf8 = try wtf8ToUtf8LossyAlloc(testing.allocator, input);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings(want, utf8);
    }
}
1926
test wtf8ToUtf8LossyAllocZ {
    // Bytes that are not valid WTF-8 are rejected; the testing allocator
    // additionally reports any buffer leaked on that error path.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAllocZ(testing.allocator, "\xff"));

    const inputs = [_][]const u8{
        // plain ASCII
        "abcd",
        // lone surrogate half
        "ab\xed\xa0\xbdcd",
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        "ab\xed\xa0\xbd\xed\xb2\xa9cd",
    };
    const expected = [_][]const u8{
        "abcd",
        "ab\u{FFFD}cd",
        "ab\u{FFFD}\u{FFFD}cd",
    };
    for (inputs, expected) |input, want| {
        const utf8 = try wtf8ToUtf8LossyAllocZ(testing.allocator, input);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings(want, utf8);
    }
}
1954
/// Iterates over the code points of a slice of little-endian 16-bit code
/// units, tolerating unpaired surrogates.
pub const Wtf16LeIterator = struct {
    bytes: []const u8,
    i: usize,

    /// Starts iteration at the beginning of `s`, which is viewed as raw bytes.
    pub fn init(s: []const u16) Wtf16LeIterator {
        return .{ .bytes = mem.sliceAsBytes(s), .i = 0 };
    }

    /// If the next codepoint is encoded by a surrogate pair, returns the
    /// codepoint that the surrogate pair represents.
    /// If the next codepoint is an unpaired surrogate, returns the codepoint
    /// of the unpaired surrogate.
    /// Returns `null` once all input has been consumed.
    pub fn nextCodepoint(it: *Wtf16LeIterator) ?u21 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;

        var code_units: [2]u16 = undefined;
        code_units[0] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        it.i += 2;

        // Only a high surrogate that is followed by more input can begin a pair.
        const may_start_pair = utf16IsHighSurrogate(code_units[0]) and it.i < it.bytes.len;
        if (!may_start_pair) return code_units[0];

        // Look at the following unit without consuming it: if the two do not
        // decode as a pair, the first unit is returned on its own and the
        // second is left for the next call.
        code_units[1] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        const codepoint = utf16DecodeSurrogatePair(&code_units) catch return code_units[0];
        it.i += 2;
        return codepoint;
    }
};
1988
test "non-well-formed WTF-8 does not roundtrip" {
    // The halves of the surrogate pair U+D83D U+DCA9, each encoded separately.
    // The well-formed version of this would be U+1F4A9 which is \xF0\x9F\x92\xA9.
    const non_well_formed_wtf8 = "\xed\xa0\xbd\xed\xb2\xa9";

    var wtf16_buf: [2]u16 = undefined;
    const wtf16 = wtf16_buf[0..try wtf8ToWtf16Le(&wtf16_buf, non_well_formed_wtf8)];

    const expected_wtf16 = [_]u16{
        mem.nativeToLittle(u16, 0xD83D), // high surrogate
        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
    };
    try testing.expectEqualSlices(u16, &expected_wtf16, wtf16);

    var wtf8_buf: [4]u8 = undefined;
    const wtf8 = wtf8_buf[0..wtf16LeToWtf8(&wtf8_buf, wtf16)];

    // Going through WTF-16 and back yields well-formed WTF-8, which is why
    // the result differs from the bytes we started with.
    try testing.expectEqualSlices(u8, "\xf0\x9f\x92\xa9", wtf8);
}
2011
/// Converts `wtf8` to WTF-16 LE and back through each API family (fixed
/// buffers, allocating, allocating with sentinel) and expects the original
/// bytes to come out again.
fn testRoundtripWtf8(wtf8: []const u8) !void {
    const gpa = testing.allocator;

    // Caller-provided buffers
    {
        var units_buf: [32]u16 = undefined;
        const unit_count = try wtf8ToWtf16Le(&units_buf, wtf8);
        // The length calculation must agree with what the conversion produced.
        try testing.expectEqual(unit_count, calcWtf16LeLen(wtf8));
        const units = units_buf[0..unit_count];

        var bytes_buf: [32]u8 = undefined;
        const byte_count = wtf16LeToWtf8(&bytes_buf, units);

        try testing.expectEqualSlices(u8, wtf8, bytes_buf[0..byte_count]);
    }
    // Allocating
    {
        const units = try wtf8ToWtf16LeAlloc(gpa, wtf8);
        defer gpa.free(units);

        const bytes = try wtf16LeToWtf8Alloc(gpa, units);
        defer gpa.free(bytes);

        try testing.expectEqualSlices(u8, wtf8, bytes);
    }
    // Allocating, 0-terminated
    {
        const units = try wtf8ToWtf16LeAllocZ(gpa, wtf8);
        defer gpa.free(units);

        const bytes = try wtf16LeToWtf8AllocZ(gpa, units);
        defer gpa.free(bytes);

        try testing.expectEqualSlices(u8, wtf8, bytes);
    }
}
2047
test "well-formed WTF-8 roundtrips" {
    const cases = [_][]const u8{
        "\xed\x9f\xbf", // not a surrogate half
        "\xed\xa0\xbd", // high surrogate
        "\xed\xb2\xa9", // low surrogate
        "\xed\xa0\xbd \xed\xb2\xa9", // <high surrogate><space><low surrogate>
        "\xed\xa0\x80\xed\xaf\xbf", // <high surrogate><high surrogate>
        "\xed\xa0\x80\xee\x80\x80", // <high surrogate><not surrogate>
        "\xed\x9f\xbf\xed\xb0\x80", // <not surrogate><low surrogate>
        "a\xed\xb0\x80", // <not surrogate><low surrogate>
        "\xf0\x9f\x92\xa9", // U+1F4A9, encoded as a surrogate pair in WTF-16
    };
    for (cases) |bytes| {
        try testRoundtripWtf8(bytes);
    }
}
2059
/// Converts `wtf16le` to WTF-8 and back through each API family (fixed
/// buffers, allocating, allocating with sentinel) and expects the original
/// code units to come out again.
fn testRoundtripWtf16(wtf16le: []const u16) !void {
    const gpa = testing.allocator;

    // Caller-provided buffers
    {
        var bytes_buf: [32]u8 = undefined;
        const byte_count = wtf16LeToWtf8(&bytes_buf, wtf16le);
        const bytes = bytes_buf[0..byte_count];

        var units_buf: [32]u16 = undefined;
        const unit_count = try wtf8ToWtf16Le(&units_buf, bytes);

        try testing.expectEqualSlices(u16, wtf16le, units_buf[0..unit_count]);
    }
    // Allocating
    {
        const bytes = try wtf16LeToWtf8Alloc(gpa, wtf16le);
        defer gpa.free(bytes);

        const units = try wtf8ToWtf16LeAlloc(gpa, bytes);
        defer gpa.free(units);

        try testing.expectEqualSlices(u16, wtf16le, units);
    }
    // Allocating, 0-terminated
    {
        const bytes = try wtf16LeToWtf8AllocZ(gpa, wtf16le);
        defer gpa.free(bytes);

        const units = try wtf8ToWtf16LeAllocZ(gpa, bytes);
        defer gpa.free(units);

        try testing.expectEqualSlices(u16, wtf16le, units);
    }
}
2094
test "well-formed WTF-16 roundtrips" {
    // Code units used below, stored in little-endian byte order.
    const high_d83d = mem.nativeToLittle(u16, 0xD83D); // high surrogate
    const high_d800 = mem.nativeToLittle(u16, 0xD800); // high surrogate
    const high_dbff = mem.nativeToLittle(u16, 0xDBFF); // high surrogate
    const low_dca9 = mem.nativeToLittle(u16, 0xDCA9); // low surrogate
    const low_dc00 = mem.nativeToLittle(u16, 0xDC00); // low surrogate
    const space = mem.nativeToLittle(u16, ' '); // not surrogate
    const plain_e000 = mem.nativeToLittle(u16, 0xE000); // not surrogate
    const plain_d7ff = mem.nativeToLittle(u16, 0xD7FF); // not surrogate
    const plain_61 = mem.nativeToLittle(u16, 0x61); // not surrogate

    const cases = [_][]const u16{
        &[_]u16{ high_d83d, low_dca9 },
        &[_]u16{ high_d83d, space, low_dca9 },
        &[_]u16{ high_d800, high_dbff },
        &[_]u16{ high_d800, plain_e000 },
        &[_]u16{ plain_d7ff, low_dc00 },
        &[_]u16{ plain_61, low_dc00 },
        &[_]u16{low_dc00},
    };
    for (cases) |units| {
        try testRoundtripWtf16(units);
    }
}
2125
/// Computes how many bytes the WTF-8 encoding of the given WTF-16 LE slice
/// would occupy, by summing the encoded length of each code point that
/// `Wtf16LeIterator` yields.
pub fn calcWtf8Len(wtf16le: []const u16) usize {
    var total: usize = 0;
    var it = Wtf16LeIterator.init(wtf16le);
    while (it.nextCodepoint()) |codepoint| {
        // Note: If utf8CodepointSequenceLength is ever changed to error on surrogate
        // codepoints, then it would no longer be eligible to be used in this context.
        const sequence_len = utf8CodepointSequenceLength(codepoint) catch |err| switch (err) {
            error.CodepointTooLarge => unreachable,
        };
        total += sequence_len;
    }
    return total;
}
2140
/// Shared assertions for `calcWtf8Len`; written so that it can be evaluated
/// both at runtime and at compile time.
fn testCalcWtf8Len() !void {
    const L = utf8ToUtf16LeStringLiteral;
    const expectLen = struct {
        fn check(expected: usize, wtf16le: []const u16) !void {
            try testing.expectEqual(expected, calcWtf8Len(wtf16le));
        }
    }.check;

    try expectLen(1, L("a"));
    try expectLen(10, L("abcdefghij"));
    // unpaired surrogate
    try expectLen(3, &[_]u16{mem.nativeToLittle(u16, 0xD800)});
    try expectLen(15, L("こんにちは"));
    // First codepoints that are encoded as 1, 2, 3, and 4 bytes
    try expectLen(1 + 2 + 3 + 4, L("\u{0}\u{80}\u{800}\u{10000}"));
}
2153
test "calculate wtf8 string length of given wtf16 string" {
    // The same assertions are evaluated by the compiler...
    try comptime testCalcWtf8Len();
    // ...and again when the test runs.
    try testCalcWtf8Len();
}