master
   1const std = @import("./std.zig");
   2const builtin = @import("builtin");
   3const assert = std.debug.assert;
   4const testing = std.testing;
   5const mem = std.mem;
   6const native_endian = builtin.cpu.arch.endian();
   7const Allocator = std.mem.Allocator;
   8
   9/// Use this to replace an unknown, unrecognized, or unrepresentable character.
  10///
  11/// See also: https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
  12pub const replacement_character: u21 = 0xFFFD;
  13pub const replacement_character_utf8: [3]u8 = utf8EncodeComptime(replacement_character);
  14
  15/// Returns how many bytes the UTF-8 representation would require
  16/// for the given codepoint.
  17pub fn utf8CodepointSequenceLength(c: u21) !u3 {
  18    if (c < 0x80) return @as(u3, 1);
  19    if (c < 0x800) return @as(u3, 2);
  20    if (c < 0x10000) return @as(u3, 3);
  21    if (c < 0x110000) return @as(u3, 4);
  22    return error.CodepointTooLarge;
  23}
  24
  25/// Given the first byte of a UTF-8 codepoint,
  26/// returns a number 1-4 indicating the total length of the codepoint in bytes.
  27/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
  28pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
  29    // The switch is optimized much better than a "smart" approach using @clz
  30    return switch (first_byte) {
  31        0b0000_0000...0b0111_1111 => 1,
  32        0b1100_0000...0b1101_1111 => 2,
  33        0b1110_0000...0b1110_1111 => 3,
  34        0b1111_0000...0b1111_0111 => 4,
  35        else => error.Utf8InvalidStartByte,
  36    };
  37}
  38
  39/// Encodes the given codepoint into a UTF-8 byte sequence.
  40/// c: the codepoint.
  41/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
  42/// Errors: if c cannot be encoded in UTF-8.
  43/// Returns: the number of bytes written to out.
  44pub fn utf8Encode(c: u21, out: []u8) error{ Utf8CannotEncodeSurrogateHalf, CodepointTooLarge }!u3 {
  45    return utf8EncodeImpl(c, out, .cannot_encode_surrogate_half);
  46}
  47
  48const Surrogates = enum {
  49    cannot_encode_surrogate_half,
  50    can_encode_surrogate_half,
  51};
  52
  53fn utf8EncodeImpl(c: u21, out: []u8, comptime surrogates: Surrogates) !u3 {
  54    const length = try utf8CodepointSequenceLength(c);
  55    assert(out.len >= length);
  56    switch (length) {
  57        // The pattern for each is the same
  58        // - Increasing the initial shift by 6 each time
  59        // - Each time after the first shorten the shifted
  60        //   value to a max of 0b111111 (63)
  61        1 => out[0] = @as(u8, @intCast(c)), // Can just do 0 + codepoint for initial range
  62        2 => {
  63            out[0] = @as(u8, @intCast(0b11000000 | (c >> 6)));
  64            out[1] = @as(u8, @intCast(0b10000000 | (c & 0b111111)));
  65        },
  66        3 => {
  67            if (surrogates == .cannot_encode_surrogate_half and isSurrogateCodepoint(c)) {
  68                return error.Utf8CannotEncodeSurrogateHalf;
  69            }
  70            out[0] = @as(u8, @intCast(0b11100000 | (c >> 12)));
  71            out[1] = @as(u8, @intCast(0b10000000 | ((c >> 6) & 0b111111)));
  72            out[2] = @as(u8, @intCast(0b10000000 | (c & 0b111111)));
  73        },
  74        4 => {
  75            out[0] = @as(u8, @intCast(0b11110000 | (c >> 18)));
  76            out[1] = @as(u8, @intCast(0b10000000 | ((c >> 12) & 0b111111)));
  77            out[2] = @as(u8, @intCast(0b10000000 | ((c >> 6) & 0b111111)));
  78            out[3] = @as(u8, @intCast(0b10000000 | (c & 0b111111)));
  79        },
  80        else => unreachable,
  81    }
  82    return length;
  83}
  84
  85pub inline fn utf8EncodeComptime(comptime c: u21) [
  86    utf8CodepointSequenceLength(c) catch |err|
  87        @compileError(@errorName(err))
  88]u8 {
  89    comptime var result: [
  90        utf8CodepointSequenceLength(c) catch
  91            unreachable
  92    ]u8 = undefined;
  93    comptime assert((utf8Encode(c, &result) catch |err|
  94        @compileError(@errorName(err))) == result.len);
  95    return result;
  96}
  97
  98const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
  99
 100/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
 101pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
 102    return switch (bytes.len) {
 103        1 => bytes[0],
 104        2 => utf8Decode2(bytes[0..2].*),
 105        3 => utf8Decode3(bytes[0..3].*),
 106        4 => utf8Decode4(bytes[0..4].*),
 107        else => unreachable,
 108    };
 109}
 110
 111const Utf8Decode2Error = error{
 112    Utf8ExpectedContinuation,
 113    Utf8OverlongEncoding,
 114};
 115pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
 116    assert(bytes[0] & 0b11100000 == 0b11000000);
 117    var value: u21 = bytes[0] & 0b00011111;
 118
 119    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 120    value <<= 6;
 121    value |= bytes[1] & 0b00111111;
 122
 123    if (value < 0x80) return error.Utf8OverlongEncoding;
 124
 125    return value;
 126}
 127
 128const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
 129    Utf8EncodesSurrogateHalf,
 130};
 131pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
 132    const value = try utf8Decode3AllowSurrogateHalf(bytes);
 133
 134    if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
 135
 136    return value;
 137}
 138
 139const Utf8Decode3AllowSurrogateHalfError = error{
 140    Utf8ExpectedContinuation,
 141    Utf8OverlongEncoding,
 142};
 143pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
 144    assert(bytes[0] & 0b11110000 == 0b11100000);
 145    var value: u21 = bytes[0] & 0b00001111;
 146
 147    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 148    value <<= 6;
 149    value |= bytes[1] & 0b00111111;
 150
 151    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 152    value <<= 6;
 153    value |= bytes[2] & 0b00111111;
 154
 155    if (value < 0x800) return error.Utf8OverlongEncoding;
 156
 157    return value;
 158}
 159
 160const Utf8Decode4Error = error{
 161    Utf8ExpectedContinuation,
 162    Utf8OverlongEncoding,
 163    Utf8CodepointTooLarge,
 164};
 165pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
 166    assert(bytes[0] & 0b11111000 == 0b11110000);
 167    var value: u21 = bytes[0] & 0b00000111;
 168
 169    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 170    value <<= 6;
 171    value |= bytes[1] & 0b00111111;
 172
 173    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 174    value <<= 6;
 175    value |= bytes[2] & 0b00111111;
 176
 177    if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
 178    value <<= 6;
 179    value |= bytes[3] & 0b00111111;
 180
 181    if (value < 0x10000) return error.Utf8OverlongEncoding;
 182    if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;
 183
 184    return value;
 185}
 186
 187/// Returns true if the given unicode codepoint can be encoded in UTF-8.
 188pub fn utf8ValidCodepoint(value: u21) bool {
 189    return switch (value) {
 190        0xD800...0xDFFF => false, // Surrogates range
 191        0x110000...0x1FFFFF => false, // Above the maximum codepoint value
 192        else => true,
 193    };
 194}
 195
 196/// Returns the length of a supplied UTF-8 string literal in terms of unicode
 197/// codepoints.
 198pub fn utf8CountCodepoints(s: []const u8) !usize {
 199    var len: usize = 0;
 200
 201    const N = @sizeOf(usize);
 202    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
 203
 204    var i: usize = 0;
 205    while (i < s.len) {
 206        // Fast path for ASCII sequences
 207        while (i + N <= s.len) : (i += N) {
 208            const v = mem.readInt(usize, s[i..][0..N], native_endian);
 209            if (v & MASK != 0) break;
 210            len += N;
 211        }
 212
 213        if (i < s.len) {
 214            const n = try utf8ByteSequenceLength(s[i]);
 215            if (i + n > s.len) return error.TruncatedInput;
 216
 217            switch (n) {
 218                1 => {}, // ASCII, no validation needed
 219                else => _ = try utf8Decode(s[i..][0..n]),
 220            }
 221
 222            i += n;
 223            len += 1;
 224        }
 225    }
 226
 227    return len;
 228}
 229
 230/// Returns true if the input consists entirely of UTF-8 codepoints
 231pub fn utf8ValidateSlice(input: []const u8) bool {
 232    return utf8ValidateSliceImpl(input, .cannot_encode_surrogate_half);
 233}
 234
 235fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
 236    var remaining = input;
 237
 238    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
 239        const Chunk = @Vector(chunk_len, u8);
 240
 241        // Fast path. Check for and skip ASCII characters at the start of the input.
 242        while (remaining.len >= chunk_len) {
 243            const chunk: Chunk = remaining[0..chunk_len].*;
 244            const mask: Chunk = @splat(0x80);
 245            if (@reduce(.Or, chunk & mask == mask)) {
 246                // found a non ASCII byte
 247                break;
 248            }
 249            remaining = remaining[chunk_len..];
 250        }
 251    }
 252
 253    // default lowest and highest continuation byte
 254    const lo_cb = 0b10000000;
 255    const hi_cb = 0b10111111;
 256
 257    const min_non_ascii_codepoint = 0x80;
 258
 259    // The first nibble is used to identify the continuation byte range to
 260    // accept. The second nibble is the size.
 261    const xx = 0xF1; // invalid: size 1
 262    const as = 0xF0; // ASCII: size 1
 263    const s1 = 0x02; // accept 0, size 2
 264    const s2 = switch (surrogates) {
 265        .cannot_encode_surrogate_half => 0x13, // accept 1, size 3
 266        .can_encode_surrogate_half => 0x03, // accept 0, size 3
 267    };
 268    const s3 = 0x03; // accept 0, size 3
 269    const s4 = switch (surrogates) {
 270        .cannot_encode_surrogate_half => 0x23, // accept 2, size 3
 271        .can_encode_surrogate_half => 0x03, // accept 0, size 3
 272    };
 273    const s5 = 0x34; // accept 3, size 4
 274    const s6 = 0x04; // accept 0, size 4
 275    const s7 = 0x44; // accept 4, size 4
 276
 277    // Information about the first byte in a UTF-8 sequence.
 278    const first = comptime ([_]u8{as} ** 128) ++ ([_]u8{xx} ** 64) ++ [_]u8{
 279        xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
 280        s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1,
 281        s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3,
 282        s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx,
 283    };
 284
 285    const n = remaining.len;
 286    var i: usize = 0;
 287    while (i < n) {
 288        const first_byte = remaining[i];
 289        if (first_byte < min_non_ascii_codepoint) {
 290            i += 1;
 291            continue;
 292        }
 293
 294        const info = first[first_byte];
 295        if (info == xx) {
 296            return false; // Illegal starter byte.
 297        }
 298
 299        const size = info & 7;
 300        if (i + size > n) {
 301            return false; // Short or invalid.
 302        }
 303
 304        // Figure out the acceptable low and high continuation bytes, starting
 305        // with our defaults.
 306        var accept_lo: u8 = lo_cb;
 307        var accept_hi: u8 = hi_cb;
 308
 309        switch (info >> 4) {
 310            0 => {},
 311            1 => accept_lo = 0xA0,
 312            2 => accept_hi = 0x9F,
 313            3 => accept_lo = 0x90,
 314            4 => accept_hi = 0x8F,
 315            else => unreachable,
 316        }
 317
 318        const c1 = remaining[i + 1];
 319        if (c1 < accept_lo or accept_hi < c1) {
 320            return false;
 321        }
 322
 323        switch (size) {
 324            2 => i += 2,
 325            3 => {
 326                const c2 = remaining[i + 2];
 327                if (c2 < lo_cb or hi_cb < c2) {
 328                    return false;
 329                }
 330                i += 3;
 331            },
 332            4 => {
 333                const c2 = remaining[i + 2];
 334                if (c2 < lo_cb or hi_cb < c2) {
 335                    return false;
 336                }
 337                const c3 = remaining[i + 3];
 338                if (c3 < lo_cb or hi_cb < c3) {
 339                    return false;
 340                }
 341                i += 4;
 342            },
 343            else => unreachable,
 344        }
 345    }
 346
 347    return true;
 348}
 349
 350/// Utf8View iterates the code points of a utf-8 encoded string.
 351///
 352/// ```
 353/// var utf8 = (try std.unicode.Utf8View.init("hi there")).iterator();
 354/// while (utf8.nextCodepointSlice()) |codepoint| {
 355///   std.debug.print("got codepoint {s}\n", .{codepoint});
 356/// }
 357/// ```
 358pub const Utf8View = struct {
 359    bytes: []const u8,
 360
 361    pub fn init(s: []const u8) !Utf8View {
 362        if (!utf8ValidateSlice(s)) {
 363            return error.InvalidUtf8;
 364        }
 365
 366        return initUnchecked(s);
 367    }
 368
 369    pub fn initUnchecked(s: []const u8) Utf8View {
 370        return Utf8View{ .bytes = s };
 371    }
 372
 373    pub inline fn initComptime(comptime s: []const u8) Utf8View {
 374        return comptime if (init(s)) |r| r else |err| switch (err) {
 375            error.InvalidUtf8 => {
 376                @compileError("invalid utf8");
 377            },
 378        };
 379    }
 380
 381    pub fn iterator(s: Utf8View) Utf8Iterator {
 382        return Utf8Iterator{
 383            .bytes = s.bytes,
 384            .i = 0,
 385        };
 386    }
 387};
 388
 389pub const Utf8Iterator = struct {
 390    bytes: []const u8,
 391    i: usize,
 392
 393    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
 394        if (it.i >= it.bytes.len) {
 395            return null;
 396        }
 397
 398        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;
 399        it.i += cp_len;
 400        return it.bytes[it.i - cp_len .. it.i];
 401    }
 402
 403    pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
 404        const slice = it.nextCodepointSlice() orelse return null;
 405        return utf8Decode(slice) catch unreachable;
 406    }
 407
 408    /// Look ahead at the next n codepoints without advancing the iterator.
 409    /// If fewer than n codepoints are available, then return the remainder of the string.
 410    pub fn peek(it: *Utf8Iterator, n: usize) []const u8 {
 411        const original_i = it.i;
 412        defer it.i = original_i;
 413
 414        var end_ix = original_i;
 415        var found: usize = 0;
 416        while (found < n) : (found += 1) {
 417            const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
 418            end_ix += next_codepoint.len;
 419        }
 420
 421        return it.bytes[original_i..end_ix];
 422    }
 423};
 424
 425pub fn utf16IsHighSurrogate(c: u16) bool {
 426    return c & ~@as(u16, 0x03ff) == 0xd800;
 427}
 428
 429pub fn utf16IsLowSurrogate(c: u16) bool {
 430    return c & ~@as(u16, 0x03ff) == 0xdc00;
 431}
 432
 433/// Returns how many code units the UTF-16 representation would require
 434/// for the given codepoint.
 435pub fn utf16CodepointSequenceLength(c: u21) !u2 {
 436    if (c <= 0xFFFF) return 1;
 437    if (c <= 0x10FFFF) return 2;
 438    return error.CodepointTooLarge;
 439}
 440
 441test utf16CodepointSequenceLength {
 442    try testing.expectEqual(@as(u2, 1), try utf16CodepointSequenceLength('a'));
 443    try testing.expectEqual(@as(u2, 1), try utf16CodepointSequenceLength(0xFFFF));
 444    try testing.expectEqual(@as(u2, 2), try utf16CodepointSequenceLength(0x10000));
 445    try testing.expectEqual(@as(u2, 2), try utf16CodepointSequenceLength(0x10FFFF));
 446    try testing.expectError(error.CodepointTooLarge, utf16CodepointSequenceLength(0x110000));
 447}
 448
 449/// Given the first code unit of a UTF-16 codepoint, returns a number 1-2
 450/// indicating the total length of the codepoint in UTF-16 code units.
 451/// If this code unit does not match the form of a UTF-16 start code unit, returns Utf16InvalidStartCodeUnit.
 452pub fn utf16CodeUnitSequenceLength(first_code_unit: u16) !u2 {
 453    if (utf16IsHighSurrogate(first_code_unit)) return 2;
 454    if (utf16IsLowSurrogate(first_code_unit)) return error.Utf16InvalidStartCodeUnit;
 455    return 1;
 456}
 457
 458test utf16CodeUnitSequenceLength {
 459    try testing.expectEqual(@as(u2, 1), try utf16CodeUnitSequenceLength('a'));
 460    try testing.expectEqual(@as(u2, 1), try utf16CodeUnitSequenceLength(0xFFFF));
 461    try testing.expectEqual(@as(u2, 2), try utf16CodeUnitSequenceLength(0xDBFF));
 462    try testing.expectError(error.Utf16InvalidStartCodeUnit, utf16CodeUnitSequenceLength(0xDFFF));
 463}
 464
 465/// Decodes the codepoint encoded in the given pair of UTF-16 code units.
 466/// Asserts that `surrogate_pair.len >= 2` and that the first code unit is a high surrogate.
 467/// If the second code unit is not a low surrogate, error.ExpectedSecondSurrogateHalf is returned.
 468pub fn utf16DecodeSurrogatePair(surrogate_pair: []const u16) !u21 {
 469    assert(surrogate_pair.len >= 2);
 470    assert(utf16IsHighSurrogate(surrogate_pair[0]));
 471    const high_half: u21 = surrogate_pair[0];
 472    const low_half = surrogate_pair[1];
 473    if (!utf16IsLowSurrogate(low_half)) return error.ExpectedSecondSurrogateHalf;
 474    return 0x10000 + ((high_half & 0x03ff) << 10) | (low_half & 0x03ff);
 475}
 476
 477pub const Utf16LeIterator = struct {
 478    bytes: []const u8,
 479    i: usize,
 480
 481    pub fn init(s: []const u16) Utf16LeIterator {
 482        return Utf16LeIterator{
 483            .bytes = mem.sliceAsBytes(s),
 484            .i = 0,
 485        };
 486    }
 487
 488    pub const NextCodepointError = error{ DanglingSurrogateHalf, ExpectedSecondSurrogateHalf, UnexpectedSecondSurrogateHalf };
 489
 490    pub fn nextCodepoint(it: *Utf16LeIterator) NextCodepointError!?u21 {
 491        assert(it.i <= it.bytes.len);
 492        if (it.i == it.bytes.len) return null;
 493        var code_units: [2]u16 = undefined;
 494        code_units[0] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
 495        it.i += 2;
 496        if (utf16IsHighSurrogate(code_units[0])) {
 497            // surrogate pair
 498            if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
 499            code_units[1] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
 500            const codepoint = try utf16DecodeSurrogatePair(&code_units);
 501            it.i += 2;
 502            return codepoint;
 503        } else if (utf16IsLowSurrogate(code_units[0])) {
 504            return error.UnexpectedSecondSurrogateHalf;
 505        } else {
 506            return code_units[0];
 507        }
 508    }
 509};
 510
 511/// Returns the length of a supplied UTF-16 string literal in terms of unicode
 512/// codepoints.
 513pub fn utf16CountCodepoints(utf16le: []const u16) !usize {
 514    var len: usize = 0;
 515    var it = Utf16LeIterator.init(utf16le);
 516    while (try it.nextCodepoint()) |_| len += 1;
 517    return len;
 518}
 519
 520fn testUtf16CountCodepoints() !void {
 521    try testing.expectEqual(
 522        @as(usize, 1),
 523        try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("a")),
 524    );
 525    try testing.expectEqual(
 526        @as(usize, 10),
 527        try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("abcdefghij")),
 528    );
 529    try testing.expectEqual(
 530        @as(usize, 10),
 531        try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("äåéëþüúíóö")),
 532    );
 533    try testing.expectEqual(
 534        @as(usize, 5),
 535        try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("こんにちは")),
 536    );
 537}
 538
 539test "utf16 count codepoints" {
 540    @setEvalBranchQuota(2000);
 541    try testUtf16CountCodepoints();
 542    try comptime testUtf16CountCodepoints();
 543}
 544
 545test "utf8 encode" {
 546    try comptime testUtf8Encode();
 547    try testUtf8Encode();
 548}
 549fn testUtf8Encode() !void {
 550    // A few taken from wikipedia a few taken elsewhere
 551    var array: [4]u8 = undefined;
 552    try testing.expect((try utf8Encode(try utf8Decode(""), array[0..])) == 3);
 553    try testing.expect(array[0] == 0b11100010);
 554    try testing.expect(array[1] == 0b10000010);
 555    try testing.expect(array[2] == 0b10101100);
 556
 557    try testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1);
 558    try testing.expect(array[0] == 0b00100100);
 559
 560    try testing.expect((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2);
 561    try testing.expect(array[0] == 0b11000010);
 562    try testing.expect(array[1] == 0b10100010);
 563
 564    try testing.expect((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4);
 565    try testing.expect(array[0] == 0b11110000);
 566    try testing.expect(array[1] == 0b10010000);
 567    try testing.expect(array[2] == 0b10001101);
 568    try testing.expect(array[3] == 0b10001000);
 569}
 570
 571test "utf8 encode comptime" {
 572    try testing.expectEqualSlices(u8, "", &utf8EncodeComptime('€'));
 573    try testing.expectEqualSlices(u8, "$", &utf8EncodeComptime('$'));
 574    try testing.expectEqualSlices(u8, "¢", &utf8EncodeComptime('¢'));
 575    try testing.expectEqualSlices(u8, "𐍈", &utf8EncodeComptime('𐍈'));
 576}
 577
 578test "utf8 encode error" {
 579    try comptime testUtf8EncodeError();
 580    try testUtf8EncodeError();
 581}
 582fn testUtf8EncodeError() !void {
 583    var array: [4]u8 = undefined;
 584    try testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
 585    try testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
 586    try testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
 587    try testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge);
 588}
 589
 590fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) !void {
 591    try testing.expectError(expectedErr, utf8Encode(codePoint, array));
 592}
 593
 594test "utf8 iterator on ascii" {
 595    try comptime testUtf8IteratorOnAscii();
 596    try testUtf8IteratorOnAscii();
 597}
 598fn testUtf8IteratorOnAscii() !void {
 599    const s = Utf8View.initComptime("abc");
 600
 601    var it1 = s.iterator();
 602    try testing.expect(mem.eql(u8, "a", it1.nextCodepointSlice().?));
 603    try testing.expect(mem.eql(u8, "b", it1.nextCodepointSlice().?));
 604    try testing.expect(mem.eql(u8, "c", it1.nextCodepointSlice().?));
 605    try testing.expect(it1.nextCodepointSlice() == null);
 606
 607    var it2 = s.iterator();
 608    try testing.expect(it2.nextCodepoint().? == 'a');
 609    try testing.expect(it2.nextCodepoint().? == 'b');
 610    try testing.expect(it2.nextCodepoint().? == 'c');
 611    try testing.expect(it2.nextCodepoint() == null);
 612}
 613
 614test "utf8 view bad" {
 615    try comptime testUtf8ViewBad();
 616    try testUtf8ViewBad();
 617}
 618fn testUtf8ViewBad() !void {
 619    // Compile-time error.
 620    // const s3 = Utf8View.initComptime("\xfe\xf2");
 621    try testing.expectError(error.InvalidUtf8, Utf8View.init("hel\xadlo"));
 622}
 623
 624test "utf8 view ok" {
 625    try comptime testUtf8ViewOk();
 626    try testUtf8ViewOk();
 627}
 628fn testUtf8ViewOk() !void {
 629    const s = Utf8View.initComptime("東京市");
 630
 631    var it1 = s.iterator();
 632    try testing.expect(mem.eql(u8, "", it1.nextCodepointSlice().?));
 633    try testing.expect(mem.eql(u8, "", it1.nextCodepointSlice().?));
 634    try testing.expect(mem.eql(u8, "", it1.nextCodepointSlice().?));
 635    try testing.expect(it1.nextCodepointSlice() == null);
 636
 637    var it2 = s.iterator();
 638    try testing.expect(it2.nextCodepoint().? == 0x6771);
 639    try testing.expect(it2.nextCodepoint().? == 0x4eac);
 640    try testing.expect(it2.nextCodepoint().? == 0x5e02);
 641    try testing.expect(it2.nextCodepoint() == null);
 642}
 643
 644test "validate slice" {
 645    try comptime testValidateSlice();
 646    try testValidateSlice();
 647
 648    // We skip a variable (based on recommended vector size) chunks of
 649    // ASCII characters. Let's make sure we're chunking correctly.
 650    const str = [_]u8{'a'} ** 550 ++ "\xc0";
 651    for (0..str.len - 3) |i| {
 652        try testing.expect(!utf8ValidateSlice(str[i..]));
 653    }
 654}
 655fn testValidateSlice() !void {
 656    try testing.expect(utf8ValidateSlice("abc"));
 657    try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
 658    try testing.expect(utf8ValidateSlice(""));
 659    try testing.expect(utf8ValidateSlice("a"));
 660    try testing.expect(utf8ValidateSlice("abc"));
 661    try testing.expect(utf8ValidateSlice("Ж"));
 662    try testing.expect(utf8ValidateSlice("ЖЖ"));
 663    try testing.expect(utf8ValidateSlice("брэд-ЛГТМ"));
 664    try testing.expect(utf8ValidateSlice("☺☻☹"));
 665    try testing.expect(utf8ValidateSlice("a\u{fffdb}"));
 666    try testing.expect(utf8ValidateSlice("\xf4\x8f\xbf\xbf"));
 667    try testing.expect(utf8ValidateSlice("abc\xdf\xbf"));
 668
 669    try testing.expect(!utf8ValidateSlice("abc\xc0"));
 670    try testing.expect(!utf8ValidateSlice("abc\xc0abc"));
 671    try testing.expect(!utf8ValidateSlice("aa\xe2"));
 672    try testing.expect(!utf8ValidateSlice("\x42\xfa"));
 673    try testing.expect(!utf8ValidateSlice("\x42\xfa\x43"));
 674    try testing.expect(!utf8ValidateSlice("abc\xc0"));
 675    try testing.expect(!utf8ValidateSlice("abc\xc0abc"));
 676    try testing.expect(!utf8ValidateSlice("\xf4\x90\x80\x80"));
 677    try testing.expect(!utf8ValidateSlice("\xf7\xbf\xbf\xbf"));
 678    try testing.expect(!utf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf"));
 679    try testing.expect(!utf8ValidateSlice("\xc0\x80"));
 680    try testing.expect(!utf8ValidateSlice("\xed\xa0\x80"));
 681    try testing.expect(!utf8ValidateSlice("\xed\xbf\xbf"));
 682}
 683
 684test "valid utf8" {
 685    try comptime testValidUtf8();
 686    try testValidUtf8();
 687}
 688fn testValidUtf8() !void {
 689    try testValid("\x00", 0x0);
 690    try testValid("\x20", 0x20);
 691    try testValid("\x7f", 0x7f);
 692    try testValid("\xc2\x80", 0x80);
 693    try testValid("\xdf\xbf", 0x7ff);
 694    try testValid("\xe0\xa0\x80", 0x800);
 695    try testValid("\xe1\x80\x80", 0x1000);
 696    try testValid("\xef\xbf\xbf", 0xffff);
 697    try testValid("\xf0\x90\x80\x80", 0x10000);
 698    try testValid("\xf1\x80\x80\x80", 0x40000);
 699    try testValid("\xf3\xbf\xbf\xbf", 0xfffff);
 700    try testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
 701}
 702
 703test "invalid utf8 continuation bytes" {
 704    try comptime testInvalidUtf8ContinuationBytes();
 705    try testInvalidUtf8ContinuationBytes();
 706}
 707fn testInvalidUtf8ContinuationBytes() !void {
 708    // unexpected continuation
 709    try testError("\x80", error.Utf8InvalidStartByte);
 710    try testError("\xbf", error.Utf8InvalidStartByte);
 711    // too many leading 1's
 712    try testError("\xf8", error.Utf8InvalidStartByte);
 713    try testError("\xff", error.Utf8InvalidStartByte);
 714    // expected continuation for 2 byte sequences
 715    try testError("\xc2", error.UnexpectedEof);
 716    try testError("\xc2\x00", error.Utf8ExpectedContinuation);
 717    try testError("\xc2\xc0", error.Utf8ExpectedContinuation);
 718    // expected continuation for 3 byte sequences
 719    try testError("\xe0", error.UnexpectedEof);
 720    try testError("\xe0\x00", error.UnexpectedEof);
 721    try testError("\xe0\xc0", error.UnexpectedEof);
 722    try testError("\xe0\xa0", error.UnexpectedEof);
 723    try testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
 724    try testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
 725    // expected continuation for 4 byte sequences
 726    try testError("\xf0", error.UnexpectedEof);
 727    try testError("\xf0\x00", error.UnexpectedEof);
 728    try testError("\xf0\xc0", error.UnexpectedEof);
 729    try testError("\xf0\x90\x00", error.UnexpectedEof);
 730    try testError("\xf0\x90\xc0", error.UnexpectedEof);
 731    try testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
 732    try testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
 733}
 734
 735test "overlong utf8 codepoint" {
 736    try comptime testOverlongUtf8Codepoint();
 737    try testOverlongUtf8Codepoint();
 738}
 739fn testOverlongUtf8Codepoint() !void {
 740    try testError("\xc0\x80", error.Utf8OverlongEncoding);
 741    try testError("\xc1\xbf", error.Utf8OverlongEncoding);
 742    try testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
 743    try testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
 744    try testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
 745    try testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
 746}
 747
 748test "misc invalid utf8" {
 749    try comptime testMiscInvalidUtf8();
 750    try testMiscInvalidUtf8();
 751}
 752fn testMiscInvalidUtf8() !void {
 753    // codepoint out of bounds
 754    try testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
 755    try testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
 756    // surrogate halves
 757    try testValid("\xed\x9f\xbf", 0xd7ff);
 758    try testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
 759    try testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
 760    try testValid("\xee\x80\x80", 0xe000);
 761}
 762
 763test "utf8 iterator peeking" {
 764    try comptime testUtf8Peeking();
 765    try testUtf8Peeking();
 766}
 767
 768fn testUtf8Peeking() !void {
 769    const s = Utf8View.initComptime("noël");
 770    var it = s.iterator();
 771
 772    try testing.expect(mem.eql(u8, "n", it.nextCodepointSlice().?));
 773
 774    try testing.expect(mem.eql(u8, "o", it.peek(1)));
 775    try testing.expect(mem.eql(u8, "", it.peek(2)));
 776    try testing.expect(mem.eql(u8, "oël", it.peek(3)));
 777    try testing.expect(mem.eql(u8, "oël", it.peek(4)));
 778    try testing.expect(mem.eql(u8, "oël", it.peek(10)));
 779
 780    try testing.expect(mem.eql(u8, "o", it.nextCodepointSlice().?));
 781    try testing.expect(mem.eql(u8, "ë", it.nextCodepointSlice().?));
 782    try testing.expect(mem.eql(u8, "l", it.nextCodepointSlice().?));
 783    try testing.expect(it.nextCodepointSlice() == null);
 784
 785    try testing.expect(mem.eql(u8, &[_]u8{}, it.peek(1)));
 786}
 787
 788fn testError(bytes: []const u8, expected_err: anyerror) !void {
 789    try testing.expectError(expected_err, testDecode(bytes));
 790}
 791
 792fn testValid(bytes: []const u8, expected_codepoint: u21) !void {
 793    try testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint);
 794}
 795
 796fn testDecode(bytes: []const u8) !u21 {
 797    const length = try utf8ByteSequenceLength(bytes[0]);
 798    if (bytes.len < length) return error.UnexpectedEof;
 799    try testing.expect(bytes.len == length);
 800    return utf8Decode(bytes);
 801}
 802
/// Print the given `utf8` string, encoded as UTF-8 bytes.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// Output is staged in a fixed-size stack buffer and handed to `writer` in
/// chunks; the only errors returned are those of `writer.writeAll`.
fn formatUtf8(utf8: []const u8, writer: *std.Io.Writer) std.Io.Writer.Error!void {
    var buf: [300]u8 = undefined; // just an arbitrary size
    // Number of bytes of `buf` currently holding pending output.
    var u8len: usize = 0;

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    var codepoint: u21 = 0;
    // Continuation bytes consumed so far / expected in total for the sequence in progress.
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range accepted for the next continuation byte. Some start bytes
    // narrow it for the first continuation byte only; it is reset afterwards.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            // Not inside a sequence: `byte` must be ASCII or a start byte.
            switch (byte) {
                0x00...0x7F => {
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                    codepoint = byte & 0b00011111;
                },
                0xE0...0xEF => {
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                    codepoint = byte & 0b00001111;
                },
                0xF0...0xF4 => {
                    if (byte == 0xF0) lower_boundary = 0x90;
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                    codepoint = byte & 0b00000111;
                },
                else => {
                    // Not a valid start byte: emit one replacement character for it.
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else if (byte < lower_boundary or byte > upper_boundary) {
            // The sequence in progress is cut short: reset the decoder state and
            // emit a single replacement character for the bytes consumed so far.
            codepoint = 0;
            cont_bytes_needed = 0;
            cont_bytes_seen = 0;
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
            // do not consume the current byte, it should now be treated as a possible start byte
        } else {
            // Valid continuation byte; any narrowed range applied only to this one.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            codepoint <<= 6;
            codepoint |= byte & 0b00111111;
            cont_bytes_seen += 1;
            // consume the byte
            i += 1;

            if (cont_bytes_seen == cont_bytes_needed) {
                // Sequence complete: copy its original bytes (start byte plus
                // continuation bytes) from the input straight into the buffer.
                const codepoint_len = cont_bytes_seen + 1;
                const codepoint_start_i = i - codepoint_len;
                @memcpy(buf[u8len..][0..codepoint_len], utf8[codepoint_start_i..][0..codepoint_len]);
                u8len += codepoint_len;

                codepoint = 0;
                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // The input ended in the middle of a sequence: emit one replacement character.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}
 891
/// Return a Formatter for a (potentially ill-formed) UTF-8 string.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// The returned value only wraps `utf8`; the substitution happens in `formatUtf8`
/// when the value is printed with the `{f}` format specifier.
pub fn fmtUtf8(utf8: []const u8) std.fmt.Alt([]const u8, formatUtf8) {
    return .{ .data = utf8 };
}
 899
 900test fmtUtf8 {
 901    const expectFmt = testing.expectFmt;
 902    try expectFmt("", "{f}", .{fmtUtf8("")});
 903    try expectFmt("foo", "{f}", .{fmtUtf8("foo")});
 904    try expectFmt("𐐷", "{f}", .{fmtUtf8("𐐷")});
 905
 906    // Table 3-8. U+FFFD for Non-Shortest Form Sequences
 907    try expectFmt("��������A", "{f}", .{fmtUtf8("\xC0\xAF\xE0\x80\xBF\xF0\x81\x82A")});
 908
 909    // Table 3-9. U+FFFD for Ill-Formed Sequences for Surrogates
 910    try expectFmt("��������A", "{f}", .{fmtUtf8("\xED\xA0\x80\xED\xBF\xBF\xED\xAFA")});
 911
 912    // Table 3-10. U+FFFD for Other Ill-Formed Sequences
 913    try expectFmt("�����A��B", "{f}", .{fmtUtf8("\xF4\x91\x92\x93\xFFA\x80\xBFB")});
 914
 915    // Table 3-11. U+FFFD for Truncated Sequences
 916    try expectFmt("����A", "{f}", .{fmtUtf8("\xE1\x80\xE2\xF0\x91\x92\xF1\xBFA")});
 917}
 918
 919fn utf16LeToUtf8ArrayListImpl(
 920    result: *std.array_list.Managed(u8),
 921    utf16le: []const u16,
 922    comptime surrogates: Surrogates,
 923) (switch (surrogates) {
 924    .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
 925    .can_encode_surrogate_half => Allocator.Error,
 926})!void {
 927    assert(result.unusedCapacitySlice().len >= utf16le.len);
 928
 929    var remaining = utf16le;
 930    vectorized: {
 931        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
 932        const Chunk = @Vector(chunk_len, u16);
 933
 934        // Fast path. Check for and encode ASCII characters at the start of the input.
 935        while (remaining.len >= chunk_len) {
 936            const chunk: Chunk = remaining[0..chunk_len].*;
 937            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
 938            if (@reduce(.Or, chunk | mask != mask)) {
 939                // found a non ASCII code unit
 940                break;
 941            }
 942            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
 943            // We allocated enough space to encode every UTF-16 code unit
 944            // as ASCII, so if the entire string is ASCII then we are
 945            // guaranteed to have enough space allocated
 946            result.addManyAsArrayAssumeCapacity(chunk_len).* = ascii_chunk;
 947            remaining = remaining[chunk_len..];
 948        }
 949    }
 950
 951    switch (surrogates) {
 952        .cannot_encode_surrogate_half => {
 953            var it = Utf16LeIterator.init(remaining);
 954            while (try it.nextCodepoint()) |codepoint| {
 955                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
 956                assert((utf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
 957            }
 958        },
 959        .can_encode_surrogate_half => {
 960            var it = Wtf16LeIterator.init(remaining);
 961            while (it.nextCodepoint()) |codepoint| {
 962                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
 963                assert((wtf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
 964            }
 965        },
 966    }
 967}
 968
/// Error set of the allocating UTF-16LE to UTF-8 conversions:
/// allocation failure plus everything in `Utf16LeToUtf8Error`.
pub const Utf16LeToUtf8AllocError = Allocator.Error || Utf16LeToUtf8Error;
 970
/// Appends the UTF-8 encoding of `utf16le` to `result`.
/// Existing items in `result` are kept; `utf16le.len` bytes of capacity are
/// reserved up front and the list grows further if the input is not all ASCII.
pub fn utf16LeToUtf8ArrayList(result: *std.array_list.Managed(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
    // The implementation asserts this much unused capacity on entry.
    try result.ensureUnusedCapacity(utf16le.len);
    return utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
}
 975
 976/// Caller owns returned memory.
 977pub fn utf16LeToUtf8Alloc(allocator: Allocator, utf16le: []const u16) Utf16LeToUtf8AllocError![]u8 {
 978    // optimistically guess that it will all be ascii.
 979    var result = try std.array_list.Managed(u8).initCapacity(allocator, utf16le.len);
 980    errdefer result.deinit();
 981
 982    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
 983    return result.toOwnedSlice();
 984}
 985
 986/// Caller owns returned memory.
 987pub fn utf16LeToUtf8AllocZ(allocator: Allocator, utf16le: []const u16) Utf16LeToUtf8AllocError![:0]u8 {
 988    // optimistically guess that it will all be ascii (and allocate space for the null terminator)
 989    var result = try std.array_list.Managed(u8).initCapacity(allocator, utf16le.len + 1);
 990    errdefer result.deinit();
 991
 992    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
 993    return result.toOwnedSliceSentinel(0);
 994}
 995
/// Error set of the non-allocating UTF-16LE to UTF-8 conversion; identical to
/// the errors `Utf16LeIterator.nextCodepoint` can return.
pub const Utf16LeToUtf8Error = Utf16LeIterator.NextCodepointError;
 997
 998/// Asserts that the output buffer is big enough.
 999/// Returns end byte index into utf8.
1000fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surrogates) (switch (surrogates) {
1001    .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
1002    .can_encode_surrogate_half => error{},
1003})!usize {
1004    var dest_index: usize = 0;
1005
1006    var remaining = utf16le;
1007    vectorized: {
1008        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
1009        const Chunk = @Vector(chunk_len, u16);
1010
1011        // Fast path. Check for and encode ASCII characters at the start of the input.
1012        while (remaining.len >= chunk_len) {
1013            const chunk: Chunk = remaining[0..chunk_len].*;
1014            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
1015            if (@reduce(.Or, chunk | mask != mask)) {
1016                // found a non ASCII code unit
1017                break;
1018            }
1019            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
1020            utf8[dest_index..][0..chunk_len].* = ascii_chunk;
1021            dest_index += chunk_len;
1022            remaining = remaining[chunk_len..];
1023        }
1024    }
1025
1026    switch (surrogates) {
1027        .cannot_encode_surrogate_half => {
1028            var it = Utf16LeIterator.init(remaining);
1029            while (try it.nextCodepoint()) |codepoint| {
1030                dest_index += utf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
1031                    // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
1032                    // which is within the valid codepoint range.
1033                    error.CodepointTooLarge => unreachable,
1034                    // We know the codepoint was valid in UTF-16, meaning it is not
1035                    // an unpaired surrogate codepoint.
1036                    error.Utf8CannotEncodeSurrogateHalf => unreachable,
1037                };
1038            }
1039        },
1040        .can_encode_surrogate_half => {
1041            var it = Wtf16LeIterator.init(remaining);
1042            while (it.nextCodepoint()) |codepoint| {
1043                dest_index += wtf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
1044                    // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
1045                    // which is within the valid codepoint range.
1046                    error.CodepointTooLarge => unreachable,
1047                };
1048            }
1049        },
1050    }
1051    return dest_index;
1052}
1053
/// Encodes `utf16le` as UTF-8 into the caller-provided buffer `utf8`, which
/// must be big enough to hold the whole result.
/// Returns the end byte index into `utf8`, or an error from `Utf16LeToUtf8Error`
/// when the input is rejected by the UTF-16 iterator.
pub fn utf16LeToUtf8(utf8: []u8, utf16le: []const u16) Utf16LeToUtf8Error!usize {
    return utf16LeToUtf8Impl(utf8, utf16le, .cannot_encode_surrogate_half);
}
1057
test utf16LeToUtf8 {
    var utf16le: [2]u16 = undefined;
    const utf16le_as_bytes = mem.sliceAsBytes(utf16le[0..]);

    const Case = struct { units: [2]u16, expected: []const u8 };
    const valid_cases = [_]Case{
        // plain ASCII
        .{ .units = .{ 'A', 'a' }, .expected = "Aa" },
        // first two-byte codepoint and last three-byte codepoint
        .{ .units = .{ 0x80, 0xffff }, .expected = "\xc2\x80" ++ "\xef\xbf\xbf" },
        // the values just outside the surrogate half range
        .{ .units = .{ 0xd7ff, 0xe000 }, .expected = "\xed\x9f\xbf" ++ "\xee\x80\x80" },
        // smallest surrogate pair
        .{ .units = .{ 0xd800, 0xdc00 }, .expected = "\xf0\x90\x80\x80" },
        // largest surrogate pair
        .{ .units = .{ 0xdbff, 0xdfff }, .expected = "\xf4\x8f\xbf\xbf" },
        // largest high surrogate with smallest low surrogate
        .{ .units = .{ 0xdbff, 0xdc00 }, .expected = "\xf4\x8f\xb0\x80" },
    };

    for (valid_cases) |case| {
        mem.writeInt(u16, utf16le_as_bytes[0..2], case.units[0], .little);
        mem.writeInt(u16, utf16le_as_bytes[2..4], case.units[1], .little);
        const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);
        defer testing.allocator.free(utf8);
        try testing.expect(mem.eql(u8, utf8, case.expected));
    }

    // two low surrogates in a row are rejected
    {
        mem.writeInt(u16, utf16le_as_bytes[0..2], 0xdcdc, .little);
        mem.writeInt(u16, utf16le_as_bytes[2..4], 0xdcdc, .little);
        const outcome = utf16LeToUtf8Alloc(testing.allocator, &utf16le);
        try testing.expectError(error.UnexpectedSecondSurrogateHalf, outcome);
    }
}
1120
/// Appends the UTF-16LE (or, with `.can_encode_surrogate_half`, WTF-16LE)
/// encoding of `utf8` to `result`.
///
/// Asserts that `result` already has at least `utf8.len` unused slots; the ASCII
/// fast path depends on that to append without allocating. Fails when the part
/// of the input left after the fast path is rejected by `Utf8View.init` /
/// `Wtf8View.init`, or when growing the list fails.
fn utf8ToUtf16LeArrayListImpl(result: *std.array_list.Managed(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
    assert(result.unusedCapacitySlice().len >= utf8.len);

    var rest = utf8;
    vectorized: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Bytes = @Vector(lanes, u8);
        const Units = @Vector(lanes, u16);
        const high_bit: Bytes = @splat(0x80);

        // Fast path: widen leading all-ASCII chunks one vector at a time.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const bytes: Bytes = rest[0..lanes].*;
            // A set high bit marks a non-ASCII byte.
            if (@reduce(.Or, (bytes & high_bit) == high_bit)) break;

            const widened = mem.nativeToLittle(Units, bytes);
            result.addManyAsArrayAssumeCapacity(lanes).* = widened;
        }
    }

    // Slow path: validate what is left, then encode it codepoint by codepoint.
    const view = switch (surrogates) {
        .cannot_encode_surrogate_half => try Utf8View.init(rest),
        .can_encode_surrogate_half => try Wtf8View.init(rest),
    };
    var codepoints = view.iterator();
    while (codepoints.nextCodepoint()) |cp| {
        if (cp >= 0x10000) {
            // Outside the BMP: split into a high/low surrogate pair.
            const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
            const pair = [2]u16{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
            try result.appendSlice(&pair);
            continue;
        }
        try result.append(mem.nativeToLittle(u16, @intCast(cp)));
    }
}
1158
/// Appends the UTF-16LE encoding of `utf8` to `result`.
/// Existing items in `result` are kept; `utf8.len` code units of capacity are
/// reserved before converting.
pub fn utf8ToUtf16LeArrayList(result: *std.array_list.Managed(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
    // The implementation asserts this much unused capacity on entry.
    try result.ensureUnusedCapacity(utf8.len);
    return utf8ToUtf16LeArrayListImpl(result, utf8, .cannot_encode_surrogate_half);
}
1163
/// Converts `utf8` into a newly allocated UTF-16LE string.
/// Caller owns returned memory.
pub fn utf8ToUtf16LeAlloc(allocator: Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![]u16 {
    // Start with one code unit per input byte, optimistically guessing
    // that no surrogate pairs will be needed.
    var units = try std.array_list.Managed(u16).initCapacity(allocator, utf8.len);
    errdefer units.deinit();

    try utf8ToUtf16LeArrayListImpl(&units, utf8, .cannot_encode_surrogate_half);
    return units.toOwnedSlice();
}
1172
/// Converts `utf8` into a newly allocated, zero-terminated UTF-16LE string.
/// Caller owns returned memory.
pub fn utf8ToUtf16LeAllocZ(allocator: Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![:0]u16 {
    // Start with one code unit per input byte, optimistically guessing that no
    // surrogate pairs will be needed, plus one slot for the terminating zero.
    var units = try std.array_list.Managed(u16).initCapacity(allocator, utf8.len + 1);
    errdefer units.deinit();

    try utf8ToUtf16LeArrayListImpl(&units, utf8, .cannot_encode_surrogate_half);
    return units.toOwnedSliceSentinel(0);
}
1181
/// Encodes `utf8` as UTF-16LE into the caller-provided buffer `utf16le`.
/// Returns the index just past the last code unit written. If the output is an
/// exact fit, the returned index equals the output slice length.
/// Assumes there is enough space for the output.
pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) error{InvalidUtf8}!usize {
    return utf8ToUtf16LeImpl(utf16le, utf8, .cannot_encode_surrogate_half);
}
1187
/// Writes the UTF-16LE (or, with `.can_encode_surrogate_half`, WTF-16LE)
/// encoding of `utf8` into `utf16le`, which must be large enough for the
/// whole result. Returns the number of code units written.
///
/// Fails when the part of the input left after the ASCII fast path is rejected
/// by `Utf8View.init` / `Wtf8View.init`.
pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {
    var out_i: usize = 0;
    var rest = utf8;

    vectorized: {
        const lanes = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Bytes = @Vector(lanes, u8);
        const Units = @Vector(lanes, u16);
        const high_bit: Bytes = @splat(0x80);

        // Fast path: widen leading all-ASCII chunks one vector at a time.
        while (rest.len >= lanes) : (rest = rest[lanes..]) {
            const bytes: Bytes = rest[0..lanes].*;
            // A set high bit marks a non-ASCII byte.
            if (@reduce(.Or, (bytes & high_bit) == high_bit)) break;

            const widened = mem.nativeToLittle(Units, bytes);
            utf16le[out_i..][0..lanes].* = widened;
            out_i += lanes;
        }
    }

    // Slow path: validate what is left, then encode it codepoint by codepoint.
    const view = switch (surrogates) {
        .cannot_encode_surrogate_half => try Utf8View.init(rest),
        .can_encode_surrogate_half => try Wtf8View.init(rest),
    };
    var codepoints = view.iterator();
    while (codepoints.nextCodepoint()) |cp| {
        if (cp >= 0x10000) {
            // Outside the BMP: split into a high/low surrogate pair.
            const high = @as(u16, @intCast((cp - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(cp & 0x3FF)) + 0xDC00;
            utf16le[out_i..][0..2].* = .{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
            out_i += 2;
            continue;
        }
        utf16le[out_i] = mem.nativeToLittle(u16, @intCast(cp));
        out_i += 1;
    }
    return out_i;
}
1229
test utf8ToUtf16Le {
    var utf16le: [128]u16 = undefined;
    {
        // A codepoint outside the BMP becomes a surrogate pair.
        const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..length]));
    }
    {
        // The largest codepoint maps to the largest surrogate pair.
        const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}");
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..length]));
    }
    {
        // A four-byte sequence encoding a value above U+10FFFF is rejected.
        const result = utf8ToUtf16Le(utf16le[0..], "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, result);
    }
    {
        // A long ASCII prefix followed by a single non-ASCII character.
        const length = try utf8ToUtf16Le(utf16le[0..], "This string has been designed to test the vectorized implementat" ++
            "ion by beginning with one hundred twenty-seven ASCII characters¡");
        try testing.expectEqualSlices(u8, &.{
            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
        }, mem.sliceAsBytes(utf16le[0..length]));
    }
}
1259
test utf8ToUtf16LeArrayList {
    const Case = struct { utf8: []const u8, utf16le_bytes: []const u8 };
    const valid_cases = [_]Case{
        // a codepoint outside the BMP becomes a surrogate pair
        .{ .utf8 = "𐐷", .utf16le_bytes = "\x01\xd8\x37\xdc" },
        // the largest codepoint maps to the largest surrogate pair
        .{ .utf8 = "\u{10FFFF}", .utf16le_bytes = "\xff\xdb\xff\xdf" },
    };

    for (valid_cases) |case| {
        var units = std.array_list.Managed(u16).init(testing.allocator);
        defer units.deinit();
        try utf8ToUtf16LeArrayList(&units, case.utf8);
        try testing.expectEqualSlices(u8, case.utf16le_bytes, mem.sliceAsBytes(units.items));
    }

    // a four-byte sequence encoding a value above U+10FFFF is rejected
    {
        var rejected = std.array_list.Managed(u16).init(testing.allocator);
        defer rejected.deinit();
        const outcome = utf8ToUtf16LeArrayList(&rejected, "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, outcome);
    }
}
1280
test utf8ToUtf16LeAlloc {
    const Case = struct { utf8: []const u8, utf16le_bytes: []const u8 };
    const valid_cases = [_]Case{
        // a codepoint outside the BMP becomes a surrogate pair
        .{ .utf8 = "𐐷", .utf16le_bytes = "\x01\xd8\x37\xdc" },
        // the largest codepoint maps to the largest surrogate pair
        .{ .utf8 = "\u{10FFFF}", .utf16le_bytes = "\xff\xdb\xff\xdf" },
    };

    for (valid_cases) |case| {
        const converted = try utf8ToUtf16LeAlloc(testing.allocator, case.utf8);
        defer testing.allocator.free(converted);
        try testing.expectEqualSlices(u8, case.utf16le_bytes, mem.sliceAsBytes(converted[0..]));
    }

    // a four-byte sequence encoding a value above U+10FFFF is rejected
    {
        const outcome = utf8ToUtf16LeAlloc(testing.allocator, "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, outcome);
    }
}
1297
test utf8ToUtf16LeAllocZ {
    {
        // A codepoint outside the BMP becomes a surrogate pair, followed by the sentinel.
        const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "𐐷");
        defer testing.allocator.free(utf16);
        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16));
        try testing.expect(utf16[2] == 0);
    }
    {
        // The largest codepoint maps to the largest surrogate pair, followed by the sentinel.
        const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "\u{10FFFF}");
        defer testing.allocator.free(utf16);
        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16));
        try testing.expect(utf16[2] == 0);
    }
    {
        // A four-byte sequence encoding a value above U+10FFFF is rejected.
        const result = utf8ToUtf16LeAllocZ(testing.allocator, "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, result);
    }
    {
        // A long ASCII prefix followed by a single non-ASCII character.
        const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "This string has been designed to test the vectorized implementat" ++
            "ion by beginning with one hundred twenty-seven ASCII characters¡");
        defer testing.allocator.free(utf16);
        try testing.expectEqualSlices(u8, &.{
            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
        }, mem.sliceAsBytes(utf16));
    }
}
1331
test "ArrayList functions on a re-used list" {
    // Each section seeds a list with `head`, using a capacity of exactly
    // `head.len`, and then appends `tail` through one of the ArrayList
    // conversion functions. The seeded items must survive the conversion.
    const head = "abcdefg";
    const tail = "hijklmnopqrstuvwyxz";
    const joined = head ++ tail;
    const head16 = utf8ToUtf16LeStringLiteral(head);
    const tail16 = utf8ToUtf16LeStringLiteral(tail);
    const joined16 = utf8ToUtf16LeStringLiteral(joined);

    // utf8ToUtf16LeArrayList
    {
        var units = std.array_list.Managed(u16).init(testing.allocator);
        defer units.deinit();

        try units.ensureTotalCapacityPrecise(head16.len);
        units.appendSliceAssumeCapacity(head16);

        try utf8ToUtf16LeArrayList(&units, tail);

        try testing.expectEqualSlices(u16, joined16, units.items);
    }

    // utf16LeToUtf8ArrayList
    {
        var bytes = std.array_list.Managed(u8).init(testing.allocator);
        defer bytes.deinit();

        try bytes.ensureTotalCapacityPrecise(head.len);
        bytes.appendSliceAssumeCapacity(head);

        try utf16LeToUtf8ArrayList(&bytes, tail16);

        try testing.expectEqualStrings(joined, bytes.items);
    }

    // wtf8ToWtf16LeArrayList
    {
        var units = std.array_list.Managed(u16).init(testing.allocator);
        defer units.deinit();

        try units.ensureTotalCapacityPrecise(head16.len);
        units.appendSliceAssumeCapacity(head16);

        try wtf8ToWtf16LeArrayList(&units, tail);

        try testing.expectEqualSlices(u16, joined16, units.items);
    }

    // wtf16LeToWtf8ArrayList
    {
        var bytes = std.array_list.Managed(u8).init(testing.allocator);
        defer bytes.deinit();

        try bytes.ensureTotalCapacityPrecise(head.len);
        bytes.appendSliceAssumeCapacity(head);

        try wtf16LeToWtf8ArrayList(&bytes, tail16);

        try testing.expectEqualStrings(joined, bytes.items);
    }
}
1389
/// Converts the comptime-known string `utf8` into a pointer to a zero-terminated
/// UTF-16LE (or, with `.can_encode_surrogate_half`, WTF-16LE) array, entirely
/// at compile time.
///
/// Input that cannot be converted is reported as a compile error whose message
/// is the name of the error that was hit.
fn utf8ToUtf16LeStringLiteralImpl(comptime utf8: []const u8, comptime surrogates: Surrogates) *const [calcUtf16LeLenImpl(utf8, surrogates) catch |err| @compileError(@errorName(err)):0]u16 {
    return comptime blk: {
        // Cannot fail here: the same call already succeeded while the return type was computed.
        const len: usize = calcUtf16LeLenImpl(utf8, surrogates) catch unreachable;
        var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
        // `@compileError` takes a string message, so pass the error's name rather than the error value.
        const utf16le_len = utf8ToUtf16LeImpl(&utf16le, utf8[0..], surrogates) catch |err| @compileError(@errorName(err));
        assert(len == utf16le_len);
        // Copy into a constant so that the returned pointer refers to immutable comptime data.
        const final = utf16le;
        break :blk &final;
    };
}
1400
/// Converts a UTF-8 string literal into a UTF-16LE string literal.
/// The conversion happens at compile time; input whose UTF-16 length cannot be
/// calculated is reported as a compile error naming the error that was hit.
pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(utf8, .cannot_encode_surrogate_half);
}
1405
/// Converts a WTF-8 string literal into a WTF-16LE string literal.
/// The conversion happens at compile time; input whose WTF-16 length cannot be
/// calculated is reported as a compile error naming the error that was hit.
pub fn wtf8ToWtf16LeStringLiteral(comptime wtf8: []const u8) *const [calcWtf16LeLen(wtf8) catch |err| @compileError(@errorName(err)):0]u16 {
    return utf8ToUtf16LeStringLiteralImpl(wtf8, .can_encode_surrogate_half);
}
1410
/// Returns the number of UTF-16 code units needed to encode `utf8`.
///
/// `surrogates` selects the decoder: `utf8Decode` for
/// `.cannot_encode_surrogate_half`, `wtf8Decode` for `.can_encode_surrogate_half`.
///
/// Errors: `error.Utf8InvalidStartByte` for a byte that cannot start a sequence,
/// `error.Utf8ExpectedContinuation` when the input ends in the middle of a
/// sequence, and any error of the selected decoder.
pub fn calcUtf16LeLenImpl(utf8: []const u8, comptime surrogates: Surrogates) !usize {
    const utf8DecodeImpl = switch (surrogates) {
        .cannot_encode_surrogate_half => utf8Decode,
        .can_encode_surrogate_half => wtf8Decode,
    };
    var src_i: usize = 0;
    var dest_len: usize = 0;
    while (src_i < utf8.len) {
        const n = try utf8ByteSequenceLength(utf8[src_i]);
        const next_src_i = src_i + n;
        // A start byte near the end of the input may announce more bytes than
        // remain; report that as an error instead of slicing out of bounds.
        if (next_src_i > utf8.len) return error.Utf8ExpectedContinuation;
        const codepoint = try utf8DecodeImpl(utf8[src_i..next_src_i]);
        // Codepoints outside the BMP take a surrogate pair (two code units).
        if (codepoint < 0x10000) {
            dest_len += 1;
        } else {
            dest_len += 2;
        }
        src_i = next_src_i;
    }
    return dest_len;
}
1431
/// Error set of `calcUtf16LeLen`: the UTF-8 decoding errors plus the
/// invalid-start-byte error.
const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte};
1433
/// Returns the length of the UTF-8 slice `utf8` once encoded as UTF-16LE,
/// counted in `u16` code units (the length of the resulting `[]u16`).
/// The length in bytes (`[]u8`) is twice that.
pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize {
    return calcUtf16LeLenImpl(utf8, .cannot_encode_surrogate_half);
}
1439
/// Error set of `calcWtf16LeLen`: the WTF-8 decoding errors plus the
/// invalid-start-byte error.
const CalcWtf16LeLenError = Wtf8DecodeError || error{Utf8InvalidStartByte};
1441
/// Returns the length of the WTF-8 slice `wtf8` once encoded as WTF-16LE,
/// counted in `u16` code units (the length of the resulting `[]u16`).
/// The length in bytes (`[]u8`) is twice that.
pub fn calcWtf16LeLen(wtf8: []const u8) CalcWtf16LeLenError!usize {
    return calcUtf16LeLenImpl(wtf8, .can_encode_surrogate_half);
}
1447
/// Checks a length-calculating function (`calcUtf16LeLen` or `calcWtf16LeLen`)
/// against inputs made of one-, two- and three-byte UTF-8 sequences, each of
/// which occupies a single UTF-16 code unit.
fn testCalcUtf16LeLenImpl(calcUtf16LeLenImpl_: anytype) !void {
    const Case = struct { input: []const u8, code_units: usize };
    const cases = [_]Case{
        .{ .input = "a", .code_units = 1 },
        .{ .input = "abcdefghij", .code_units = 10 },
        .{ .input = "äåéëþüúíóö", .code_units = 10 },
        .{ .input = "こんにちは", .code_units = 5 },
    };
    for (cases) |case| {
        try testing.expectEqual(case.code_units, try calcUtf16LeLenImpl_(case.input));
    }
}
1454
test calcUtf16LeLen {
    // Run the shared checks at runtime and again at compile time.
    try testCalcUtf16LeLenImpl(calcUtf16LeLen);
    try comptime testCalcUtf16LeLenImpl(calcUtf16LeLen);
}
1459
test calcWtf16LeLen {
    // Run the shared checks at runtime and again at compile time.
    try testCalcUtf16LeLenImpl(calcWtf16LeLen);
    try comptime testCalcUtf16LeLenImpl(calcWtf16LeLen);
}
1464
/// Print the given `utf16le` string, encoded as UTF-8 bytes.
/// Unpaired surrogates are replaced by the replacement character (U+FFFD).
///
/// Output is staged in a fixed-size stack buffer and handed to `writer` in
/// chunks; the only errors returned are those of `writer.writeAll`.
fn formatUtf16Le(utf16le: []const u16, writer: *std.Io.Writer) std.Io.Writer.Error!void {
    var scratch: [300]u8 = undefined; // just an arbitrary size
    var used: usize = 0;
    var codepoints = Utf16LeIterator.init(utf16le);

    while (true) {
        // An iterator error is turned into a replacement character.
        const next: ?u21 = codepoints.nextCodepoint() catch replacement_character;
        const cp = next orelse break;

        // Fall back to the replacement character if `cp` cannot be encoded.
        const encoded_len = utf8Encode(cp, scratch[used..]) catch
            (utf8Encode(replacement_character, scratch[used..]) catch unreachable);
        used += encoded_len;

        // Flush as soon as a maximum length UTF-8 codepoint (4 bytes) might no longer fit.
        if (scratch.len < used + 4) {
            try writer.writeAll(scratch[0..used]);
            used = 0;
        }
    }
    try writer.writeAll(scratch[0..used]);
}
1482
/// Wraps a (potentially ill-formed) UTF-16 LE string in a Formatter that
/// converts it to UTF-8 while it is being formatted.
/// Unpaired surrogates come out as the replacement character (U+FFFD).
pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Alt([]const u16, formatUtf16Le) {
    const Formatter = std.fmt.Alt([]const u16, formatUtf16Le);
    return Formatter{ .data = utf16le };
}
1489
test fmtUtf16Le {
    const expectFmt = testing.expectFmt;
    try expectFmt("", "{f}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
    try expectFmt("", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral(""))});
    try expectFmt("foo", "{f}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
    try expectFmt("foo", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("foo"))});
    try expectFmt("𐐷", "{f}", .{fmtUtf16Le(wtf8ToWtf16LeStringLiteral("𐐷"))});

    // The expected strings below are spelled with `\u{...}` escapes so that
    // they cannot be lost or mangled by editors and text extraction.
    // The two input bytes are the little-endian encoding of the code unit.

    // U+D7FF: last code point before the surrogate range, passes through.
    try expectFmt("\u{D7FF}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
    // U+D800, U+DBFF, U+DC00, U+DFFF: unpaired surrogates, each replaced by U+FFFD.
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xdc", native_endian)})});
    try expectFmt("\u{FFFD}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdf", native_endian)})});
    // U+E000: first code point after the surrogate range, passes through.
    try expectFmt("\u{E000}", "{f}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
}
1504
/// Shared expectations for the comptime string-literal converters.
/// `utf8ToUtf16LeStringLiteral_` is the converter under test. Every case
/// checks the produced code units and that the element right after them is 0.
fn testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral_: anytype) !void {
    const le = mem.nativeToLittle;

    // ASCII: one code unit.
    {
        const converted = utf8ToUtf16LeStringLiteral_("A");
        const expected = [_:0]u16{le(u16, 0x41)};
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[1] == 0);
    }
    // Outside the BMP: a surrogate pair, two code units.
    {
        const converted = utf8ToUtf16LeStringLiteral_("𐐷");
        const expected = [_:0]u16{ le(u16, 0xD801), le(u16, 0xDC37) };
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[2] == 0);
    }
    // U+02FF: one code unit.
    {
        const converted = utf8ToUtf16LeStringLiteral_("\u{02FF}");
        const expected = [_:0]u16{le(u16, 0x02FF)};
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[1] == 0);
    }
    // U+07FF: one code unit.
    {
        const converted = utf8ToUtf16LeStringLiteral_("\u{7FF}");
        const expected = [_:0]u16{le(u16, 0x7FF)};
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[1] == 0);
    }
    // U+0801: one code unit.
    {
        const converted = utf8ToUtf16LeStringLiteral_("\u{801}");
        const expected = [_:0]u16{le(u16, 0x801)};
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[1] == 0);
    }
    // U+10FFFF: a surrogate pair, two code units.
    {
        const converted = utf8ToUtf16LeStringLiteral_("\u{10FFFF}");
        const expected = [_:0]u16{ le(u16, 0xDBFF), le(u16, 0xDFFF) };
        try testing.expectEqualSlices(u16, &expected, converted);
        try testing.expect(converted[2] == 0);
    }
}

test utf8ToUtf16LeStringLiteral {
    try testUtf8ToUtf16LeStringLiteral(utf8ToUtf16LeStringLiteral);
}

test wtf8ToWtf16LeStringLiteral {
    try testUtf8ToUtf16LeStringLiteral(wtf8ToWtf16LeStringLiteral);
}
1565
/// Checks `utf8CountCodepoints` against strings made of 1-, 2- and 3-byte
/// sequences.
fn testUtf8CountCodepoints() !void {
    const Case = struct { text: []const u8, count: usize };
    const cases = [_]Case{
        .{ .text = "abcdefghij", .count = 10 },
        .{ .text = "äåéëþüúíóö", .count = 10 },
        .{ .text = "こんにちは", .count = 5 },
    };
    for (cases) |case| {
        const actual = try utf8CountCodepoints(case.text);
        try testing.expectEqual(case.count, actual);
    }
    // Disabled expectation, kept for reference:
    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
}

test "utf8 count codepoints" {
    // Exercise the function both at compile time and at run time.
    try comptime testUtf8CountCodepoints();
    try testUtf8CountCodepoints();
}
1577
/// Checks `utf8ValidCodepoint` on accepted code points, on both ends of the
/// surrogate range, and on the first value past U+10FFFF.
fn testUtf8ValidCodepoint() !void {
    const accepted = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
    for (accepted) |codepoint| {
        try testing.expect(utf8ValidCodepoint(codepoint));
    }

    const rejected = [_]u21{ 0xd800, 0xdfff, 0x110000 };
    for (rejected) |codepoint| {
        try testing.expect(!utf8ValidCodepoint(codepoint));
    }
}

test "utf8 valid codepoint" {
    // Exercise the function both at compile time and at run time.
    try comptime testUtf8ValidCodepoint();
    try testUtf8ValidCodepoint();
}
1593
/// Returns true if the codepoint is a surrogate (U+D800 to U+DFFF).
/// Both halves of the range are covered: high surrogates (U+D800 to U+DBFF)
/// and low surrogates (U+DC00 to U+DFFF).
pub fn isSurrogateCodepoint(c: u21) bool {
    return switch (c) {
        0xD800...0xDFFF => true,
        else => false,
    };
}
1601
/// Writes the WTF-8 byte sequence for codepoint `c` into `out`.
/// c: the codepoint to encode.
/// out: destination buffer. Must have a len >= utf8CodepointSequenceLength(c).
/// Errors: `error.CodepointTooLarge` if c cannot be encoded in WTF-8.
/// Returns: the number of bytes written to out.
pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
    // Same encoder as UTF-8, run in the `.can_encode_surrogate_half` mode.
    const written = try utf8EncodeImpl(c, out, .can_encode_surrogate_half);
    return written;
}
1610
const Wtf8DecodeError = Utf8Decode4Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode2Error;

/// Deprecated: the API is awkward and too easy to use incorrectly.
///
/// Decodes the single WTF-8 sequence held in `bytes`. The slice length picks
/// the decoder and must be 1, 2, 3 or 4; any other length is treated as
/// unreachable.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
    switch (bytes.len) {
        1 => return bytes[0],
        2 => return utf8Decode2(bytes[0..2].*),
        // Three-byte sequences are the ones decoded with surrogate halves allowed.
        3 => return utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
        4 => return utf8Decode4(bytes[0..4].*),
        else => unreachable,
    }
}
1623
/// Reports whether `input` is made up entirely of WTF-8 codepoints: the same
/// restrictions as UTF-8 apply, except that surrogate codepoints
/// (U+D800 to U+DFFF) are allowed.
/// This is not a well-formedness check: it does not verify that every
/// surrogate half is unpaired.
pub fn wtf8ValidateSlice(input: []const u8) bool {
    const is_valid = utf8ValidateSliceImpl(input, .can_encode_surrogate_half);
    return is_valid;
}
1632
test "validate WTF-8 slice" {
    try comptime testValidateWtf8Slice();
    try testValidateWtf8Slice();

    // ASCII is skipped in chunks whose size varies (based on the recommended
    // vector size). Start from every offset to make sure the chunking still
    // lets the trailing invalid byte be seen.
    const str = [_]u8{'a'} ** 550 ++ "\xc0";
    var start: usize = 0;
    while (start < str.len - 3) : (start += 1) {
        try testing.expect(!wtf8ValidateSlice(str[start..]));
    }
}

/// Expectations for `wtf8ValidateSlice`, usable at compile time and run time.
fn testValidateWtf8Slice() !void {
    // Accepted under both UTF-8 and WTF-8 rules.
    const accepted = [_][]const u8{
        "abc",
        "abc\xdf\xbf",
        "",
        "a",
        "abc",
        "Ж",
        "ЖЖ",
        "брэд-ЛГТМ",
        "☺☻☹",
        "a\u{fffdb}",
        "\xf4\x8f\xbf\xbf",
        "abc\xdf\xbf",
    };
    for (accepted) |input| {
        try testing.expect(wtf8ValidateSlice(input));
    }

    // Rejected under both UTF-8 and WTF-8 rules.
    const rejected = [_][]const u8{
        "abc\xc0",
        "abc\xc0abc",
        "aa\xe2",
        "\x42\xfa",
        "\x42\xfa\x43",
        "abc\xc0",
        "abc\xc0abc",
        "\xf4\x90\x80\x80",
        "\xf7\xbf\xbf\xbf",
        "\xfb\xbf\xbf\xbf\xbf",
        "\xc0\x80",
    };
    for (rejected) |input| {
        try testing.expect(!wtf8ValidateSlice(input));
    }

    // Encoded surrogate codepoints: only valid in WTF-8.
    const surrogates = [_][]const u8{
        "\xed\xa0\x80",
        "\xed\xbf\xbf",
    };
    for (surrogates) |input| {
        try testing.expect(wtf8ValidateSlice(input));
    }
}
1675
/// A view over a WTF-8 encoded string whose iterator walks its code points,
/// surrogate halves included.
///
/// ```
/// var wtf8 = (try std.unicode.Wtf8View.init("hi there")).iterator();
/// while (wtf8.nextCodepointSlice()) |codepoint| {
///   // note: codepoint could be a surrogate half which is invalid
///   // UTF-8, avoid printing or otherwise sending/emitting this directly
/// }
/// ```
pub const Wtf8View = struct {
    bytes: []const u8,

    /// Creates a view after checking `s` with `wtf8ValidateSlice`.
    /// Returns `error.InvalidWtf8` when that check fails.
    pub fn init(s: []const u8) error{InvalidWtf8}!Wtf8View {
        return if (wtf8ValidateSlice(s)) initUnchecked(s) else error.InvalidWtf8;
    }

    /// Creates a view without validating `s`.
    pub fn initUnchecked(s: []const u8) Wtf8View {
        return .{ .bytes = s };
    }

    /// Creates a view at compile time; invalid input is a compile error.
    pub inline fn initComptime(comptime s: []const u8) Wtf8View {
        return comptime if (init(s)) |view| view else |err| switch (err) {
            error.InvalidWtf8 => @compileError("invalid wtf8"),
        };
    }

    /// Returns an iterator positioned at the start of the view's bytes.
    pub fn iterator(s: Wtf8View) Wtf8Iterator {
        return .{ .bytes = s.bytes, .i = 0 };
    }
};
1716
/// Iterates over the codepoints of `bytes`.
/// Asserts that `bytes` is valid WTF-8
pub const Wtf8Iterator = struct {
    bytes: []const u8,
    /// Byte offset of the next sequence to be returned.
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them, or
    /// null once the end of `bytes` has been reached.
    pub fn nextCodepointSlice(it: *Wtf8Iterator) ?[]const u8 {
        const start = it.i;
        if (start >= it.bytes.len) return null;

        // The first byte of the sequence determines its length.
        const cp_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        it.i = start + cp_len;
        return it.bytes[start..it.i];
    }

    /// Returns the next codepoint decoded with `wtf8Decode` and advances past
    /// it, or null once the end of `bytes` has been reached.
    pub fn nextCodepoint(it: *Wtf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |sequence| {
            return wtf8Decode(sequence) catch unreachable;
        }
        return null;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Wtf8Iterator, n: usize) []const u8 {
        const start = it.i;
        // Whatever path returns, the iterator ends up where it started.
        defer it.i = start;

        var remaining = n;
        while (remaining > 0) : (remaining -= 1) {
            _ = it.nextCodepointSlice() orelse return it.bytes[start..];
        }

        // The iterator now sits right after the n-th codepoint.
        const end = it.i;
        return it.bytes[start..end];
    }
};
1753
/// Runs the shared UTF-16 LE to UTF-8 array-list conversion on `utf16le`
/// in the `.can_encode_surrogate_half` mode, with `result` as the list that
/// receives the output.
pub fn wtf16LeToWtf8ArrayList(result: *std.array_list.Managed(u8), utf16le: []const u16) Allocator.Error!void {
    // Reserve one byte per input code unit up front.
    try result.ensureUnusedCapacity(utf16le.len);
    try utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
}
1758
/// Converts `wtf16le` to WTF-8 in memory obtained from `allocator`.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8Alloc(allocator: Allocator, wtf16le: []const u16) Allocator.Error![]u8 {
    const List = std.array_list.Managed(u8);

    // Initial capacity is an optimistic guess: all-ASCII input needs exactly
    // one byte per code unit.
    var list = try List.initCapacity(allocator, wtf16le.len);
    errdefer list.deinit();

    try utf16LeToUtf8ArrayListImpl(&list, wtf16le, .can_encode_surrogate_half);
    return list.toOwnedSlice();
}
1768
/// Converts `wtf16le` to WTF-8 in memory obtained from `allocator`; the
/// result carries a 0 sentinel.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8AllocZ(allocator: Allocator, wtf16le: []const u16) Allocator.Error![:0]u8 {
    const List = std.array_list.Managed(u8);

    // Initial capacity is an optimistic guess: all-ASCII input needs one byte
    // per code unit, plus one byte for the null terminator.
    var list = try List.initCapacity(allocator, wtf16le.len + 1);
    errdefer list.deinit();

    try utf16LeToUtf8ArrayListImpl(&list, wtf16le, .can_encode_surrogate_half);
    return list.toOwnedSliceSentinel(0);
}
1778
/// Converts `wtf16le` into the caller-provided buffer `wtf8` and returns the
/// `usize` produced by the shared implementation (the tests in this file use
/// it as the length of the written prefix of `wtf8`).
/// Cannot fail: in the `.can_encode_surrogate_half` mode the implementation
/// has no errors left to handle, hence the empty `switch`.
pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {
    const written = utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};
    return written;
}
1782
/// Runs the shared UTF-8 to UTF-16 LE array-list conversion on `wtf8` in the
/// `.can_encode_surrogate_half` mode, with `result` as the list that receives
/// the output.
pub fn wtf8ToWtf16LeArrayList(result: *std.array_list.Managed(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
    // Reserve one code unit per input byte up front.
    try result.ensureUnusedCapacity(wtf8.len);
    try utf8ToUtf16LeArrayListImpl(result, wtf8, .can_encode_surrogate_half);
}
1787
/// Converts `wtf8` to WTF-16 LE in memory obtained from `allocator`.
/// The returned slice is owned by the caller.
pub fn wtf8ToWtf16LeAlloc(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u16 {
    const List = std.array_list.Managed(u16);

    // Initial capacity is an optimistic guess that no surrogate pairs are needed.
    var list = try List.initCapacity(allocator, wtf8.len);
    errdefer list.deinit();

    try utf8ToUtf16LeArrayListImpl(&list, wtf8, .can_encode_surrogate_half);
    return list.toOwnedSlice();
}
1796
/// Converts `wtf8` to WTF-16 LE in memory obtained from `allocator`; the
/// result carries a 0 sentinel.
/// The returned slice is owned by the caller.
pub fn wtf8ToWtf16LeAllocZ(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![:0]u16 {
    const List = std.array_list.Managed(u16);

    // Initial capacity is an optimistic guess that no surrogate pairs are
    // needed; the extra element leaves room for the sentinel.
    var list = try List.initCapacity(allocator, wtf8.len + 1);
    errdefer list.deinit();

    try utf8ToUtf16LeArrayListImpl(&list, wtf8, .can_encode_surrogate_half);
    return list.toOwnedSliceSentinel(0);
}
1805
/// Converts `wtf8` into the caller-provided buffer `wtf16le`.
/// Returns the index of the next character in the output; on an exact fit
/// this equals the length of the output slice.
/// The output is assumed to be large enough.
pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) error{InvalidWtf8}!usize {
    const next_index = try utf8ToUtf16LeImpl(wtf16le, wtf8, .can_encode_surrogate_half);
    return next_index;
}
1811
/// Copies `wtf8` into `utf8`, writing the Unicode replacement character
/// (U+FFFD) in place of every surrogate codepoint (U+D800 to U+DFFF).
/// A surrogate codepoint and the replacement character both take three bytes,
/// so the output is always exactly as long as the input.
/// In-place conversion is supported when `utf8` and `wtf8` refer to the same slice.
/// Note: If `wtf8` is entirely composed of well-formed UTF-8, then no conversion is necessary.
///       `utf8ValidateSlice` can be used to check if lossy conversion is worthwhile.
/// Asserts that `utf8` is at least as long as `wtf8`.
/// If `wtf8` is not valid WTF-8, then `error.InvalidWtf8` is returned.
pub fn wtf8ToUtf8Lossy(utf8: []u8, wtf8: []const u8) error{InvalidWtf8}!void {
    assert(utf8.len >= wtf8.len);

    const view = try Wtf8View.init(wtf8);
    // When both slices start at the same address, bytes that are kept are
    // already where they belong and must not be copied onto themselves.
    const same_buffer = utf8.ptr == wtf8.ptr;

    var sequences = view.iterator();
    var out_pos: usize = 0;
    while (sequences.nextCodepointSlice()) |sequence| {
        const dest_start = out_pos;
        out_pos += sequence.len;

        // All surrogate codepoints are encoded as 3 bytes, so only sequences
        // of that length need to be decoded.
        const is_surrogate = sequence.len == 3 and
            isSurrogateCodepoint(wtf8Decode(sequence) catch unreachable);

        if (is_surrogate) {
            @memcpy(utf8[dest_start..][0..replacement_character_utf8.len], &replacement_character_utf8);
        } else if (!same_buffer) {
            @memcpy(utf8[dest_start..][0..sequence.len], sequence);
        }
    }
}
1849
/// Allocating variant of `wtf8ToUtf8Lossy`: the result has the same length
/// as `wtf8` and is owned by the caller.
/// Returns `error.InvalidWtf8` (after releasing the allocation) when the
/// conversion rejects the input.
pub fn wtf8ToUtf8LossyAlloc(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u8 {
    const converted = try allocator.alloc(u8, wtf8.len);
    wtf8ToUtf8Lossy(converted, wtf8) catch |err| {
        allocator.free(converted);
        return err;
    };
    return converted;
}
1858
/// Allocating variant of `wtf8ToUtf8Lossy` whose result carries a 0 sentinel.
/// The result has the same length as `wtf8` and is owned by the caller.
/// Returns `error.InvalidWtf8` (after releasing the allocation) when the
/// conversion rejects the input.
pub fn wtf8ToUtf8LossyAllocZ(allocator: Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![:0]u8 {
    const converted = try allocator.allocSentinel(u8, wtf8.len, 0);
    wtf8ToUtf8Lossy(converted, wtf8) catch |err| {
        allocator.free(converted);
        return err;
    };
    return converted;
}
1867
test wtf8ToUtf8Lossy {
    var out: [32]u8 = undefined;

    // Bytes that are not WTF-8 at all are rejected.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8Lossy(&out, "\xff"));

    // Each entry is { input, expected output }.
    const cases = [_][2][]const u8{
        // plain ASCII is copied unchanged
        .{ "abcd", "abcd" },
        // high surrogate half
        .{ "ab\xed\xa0\xbdcd", "ab\u{FFFD}cd" },
        // low surrogate half
        .{ "ab\xed\xb2\xa9cd", "ab\u{FFFD}cd" },
        // If the WTF-8 is not well-formed, each surrogate half is converted into a
        // separate replacement character instead of being interpreted as a surrogate pair.
        .{ "ab\xed\xa0\xbd\xed\xb2\xa9cd", "ab\u{FFFD}\u{FFFD}cd" },
    };
    for (cases) |case| {
        const input = case[0];
        const expected = case[1];
        try wtf8ToUtf8Lossy(&out, input);
        try testing.expectEqualStrings(expected, out[0..input.len]);
    }

    // in place: source and destination are the same slice
    const low_surrogate_half = "ab\xed\xb2\xa9cd";
    const shared = out[0..low_surrogate_half.len];
    @memcpy(shared, low_surrogate_half);
    try wtf8ToUtf8Lossy(shared, shared);
    try testing.expectEqualStrings("ab\u{FFFD}cd", shared);
}
1898
test wtf8ToUtf8LossyAlloc {
    const gpa = testing.allocator;

    // Bytes that are not WTF-8 at all are rejected.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAlloc(gpa, "\xff"));

    // Each entry is { input, expected output }.
    const cases = [_][2][]const u8{
        // plain ASCII is copied unchanged
        .{ "abcd", "abcd" },
        // a lone surrogate half
        .{ "ab\xed\xa0\xbdcd", "ab\u{FFFD}cd" },
        // If the WTF-8 is not well-formed, each surrogate half is converted into a
        // separate replacement character instead of being interpreted as a surrogate pair.
        .{ "ab\xed\xa0\xbd\xed\xb2\xa9cd", "ab\u{FFFD}\u{FFFD}cd" },
    };
    for (cases) |case| {
        const utf8 = try wtf8ToUtf8LossyAlloc(gpa, case[0]);
        defer gpa.free(utf8);
        try testing.expectEqualStrings(case[1], utf8);
    }
}
1926
test wtf8ToUtf8LossyAllocZ {
    const gpa = testing.allocator;

    // Bytes that are not WTF-8 at all are rejected.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAllocZ(gpa, "\xff"));

    // Each entry is { input, expected output }.
    const cases = [_][2][]const u8{
        // plain ASCII is copied unchanged
        .{ "abcd", "abcd" },
        // a lone surrogate half
        .{ "ab\xed\xa0\xbdcd", "ab\u{FFFD}cd" },
        // If the WTF-8 is not well-formed, each surrogate half is converted into a
        // separate replacement character instead of being interpreted as a surrogate pair.
        .{ "ab\xed\xa0\xbd\xed\xb2\xa9cd", "ab\u{FFFD}\u{FFFD}cd" },
    };
    for (cases) |case| {
        const utf8 = try wtf8ToUtf8LossyAllocZ(gpa, case[0]);
        defer gpa.free(utf8);
        try testing.expectEqualStrings(case[1], utf8);
    }
}
1954
/// Iterates over the codepoints of a WTF-16 LE string. The input is kept as
/// raw bytes and every code unit is read as a little-endian `u16`.
pub const Wtf16LeIterator = struct {
    bytes: []const u8,
    /// Byte offset of the next code unit to read.
    i: usize,

    /// Creates an iterator positioned at the start of `s`.
    pub fn init(s: []const u16) Wtf16LeIterator {
        return .{ .bytes = mem.sliceAsBytes(s), .i = 0 };
    }

    /// If the next codepoint is encoded by a surrogate pair, returns the
    /// codepoint that the surrogate pair represents.
    /// If the next codepoint is an unpaired surrogate, returns the codepoint
    /// of the unpaired surrogate.
    /// Returns null once all bytes have been consumed.
    pub fn nextCodepoint(it: *Wtf16LeIterator) ?u21 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;

        const first = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        it.i += 2;

        // Anything that is not a high surrogate, or a high surrogate at the
        // very end of the input, stands on its own.
        if (!utf16IsHighSurrogate(first) or it.i >= it.bytes.len) return first;

        const second = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        // If the two units do not decode as a pair, the second unit is left
        // unconsumed and the high surrogate is returned by itself.
        const codepoint = utf16DecodeSurrogatePair(&[2]u16{ first, second }) catch return first;
        it.i += 2;
        return codepoint;
    }
};
1988
test "non-well-formed WTF-8 does not roundtrip" {
    // Two separately encoded surrogates, U+D83D followed by U+DCA9.
    // Well-formed WTF-8 would encode them together as U+1F4A9, i.e. \xF0\x9F\x92\xA9.
    const non_well_formed_wtf8 = "\xed\xa0\xbd\xed\xb2\xa9";

    // WTF-8 -> WTF-16: both surrogates come through as individual code units.
    var units_buf: [2]u16 = undefined;
    const units_len = try wtf8ToWtf16Le(&units_buf, non_well_formed_wtf8);
    const units = units_buf[0..units_len];

    const expected_units = [_]u16{
        mem.nativeToLittle(u16, 0xD83D), // high surrogate
        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
    };
    try testing.expectEqualSlices(u16, &expected_units, units);

    // WTF-16 -> WTF-8: the result is well-formed WTF-8, which is why it
    // differs from the bytes the test started with.
    var bytes_buf: [4]u8 = undefined;
    const bytes_len = wtf16LeToWtf8(&bytes_buf, units);
    const bytes = bytes_buf[0..bytes_len];

    try testing.expectEqualSlices(u8, "\xf0\x9f\x92\xa9", bytes);
}
2011
/// Converts `wtf8` to WTF-16 LE and back through the buffer, Alloc and AllocZ
/// APIs and expects the original bytes to come out again each time.
fn testRoundtripWtf8(wtf8: []const u8) !void {
    const gpa = testing.allocator;

    // Caller-provided buffers; also cross-checks the length calculation.
    {
        var units_buf: [32]u16 = undefined;
        const units_len = try wtf8ToWtf16Le(&units_buf, wtf8);
        try testing.expectEqual(units_len, calcWtf16LeLen(wtf8));

        var back_buf: [32]u8 = undefined;
        const back_len = wtf16LeToWtf8(&back_buf, units_buf[0..units_len]);

        try testing.expectEqualSlices(u8, wtf8, back_buf[0..back_len]);
    }
    // Allocating variants.
    {
        const units = try wtf8ToWtf16LeAlloc(gpa, wtf8);
        defer gpa.free(units);

        const back = try wtf16LeToWtf8Alloc(gpa, units);
        defer gpa.free(back);

        try testing.expectEqualSlices(u8, wtf8, back);
    }
    // Allocating, sentinel-terminated variants.
    {
        const units = try wtf8ToWtf16LeAllocZ(gpa, wtf8);
        defer gpa.free(units);

        const back = try wtf16LeToWtf8AllocZ(gpa, units);
        defer gpa.free(back);

        try testing.expectEqualSlices(u8, wtf8, back);
    }
}

test "well-formed WTF-8 roundtrips" {
    const inputs = [_][]const u8{
        "\xed\x9f\xbf", // not a surrogate half
        "\xed\xa0\xbd", // high surrogate
        "\xed\xb2\xa9", // low surrogate
        "\xed\xa0\xbd \xed\xb2\xa9", // <high surrogate><space><low surrogate>
        "\xed\xa0\x80\xed\xaf\xbf", // <high surrogate><high surrogate>
        "\xed\xa0\x80\xee\x80\x80", // <high surrogate><not surrogate>
        "\xed\x9f\xbf\xed\xb0\x80", // <not surrogate><low surrogate>
        "a\xed\xb0\x80", // <not surrogate><low surrogate>
        "\xf0\x9f\x92\xa9", // U+1F4A9, encoded as a surrogate pair in WTF-16
    };
    for (inputs) |input| {
        try testRoundtripWtf8(input);
    }
}
2059
/// Converts `wtf16le` to WTF-8 and back through the buffer, Alloc and AllocZ
/// APIs and expects the original code units to come out again each time.
fn testRoundtripWtf16(wtf16le: []const u16) !void {
    const gpa = testing.allocator;

    // Caller-provided buffers.
    {
        var bytes_buf: [32]u8 = undefined;
        const bytes_len = wtf16LeToWtf8(&bytes_buf, wtf16le);

        var back_buf: [32]u16 = undefined;
        const back_len = try wtf8ToWtf16Le(&back_buf, bytes_buf[0..bytes_len]);

        try testing.expectEqualSlices(u16, wtf16le, back_buf[0..back_len]);
    }
    // Allocating variants.
    {
        const bytes = try wtf16LeToWtf8Alloc(gpa, wtf16le);
        defer gpa.free(bytes);

        const back = try wtf8ToWtf16LeAlloc(gpa, bytes);
        defer gpa.free(back);

        try testing.expectEqualSlices(u16, wtf16le, back);
    }
    // Allocating, sentinel-terminated variants.
    {
        const bytes = try wtf16LeToWtf8AllocZ(gpa, wtf16le);
        defer gpa.free(bytes);

        const back = try wtf8ToWtf16LeAllocZ(gpa, bytes);
        defer gpa.free(back);

        try testing.expectEqualSlices(u16, wtf16le, back);
    }
}

test "well-formed WTF-16 roundtrips" {
    const le = mem.nativeToLittle;

    // A proper surrogate pair.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xD83D), // high surrogate
        le(u16, 0xDCA9), // low surrogate
    });
    // The same surrogates kept apart by a space.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xD83D), // high surrogate
        le(u16, ' '), // not surrogate
        le(u16, 0xDCA9), // low surrogate
    });
    // Two high surrogates in a row.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xD800), // high surrogate
        le(u16, 0xDBFF), // high surrogate
    });
    // High surrogate followed by a non-surrogate.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xD800), // high surrogate
        le(u16, 0xE000), // not surrogate
    });
    // Non-surrogate followed by a low surrogate.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xD7FF), // not surrogate
        le(u16, 0xDC00), // low surrogate
    });
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0x61), // not surrogate
        le(u16, 0xDC00), // low surrogate
    });
    // A low surrogate on its own.
    try testRoundtripWtf16(&[_]u16{
        le(u16, 0xDC00), // low surrogate
    });
}
2125
/// Computes the number of bytes the WTF-8 encoding of the given WTF-16 LE
/// slice would take.
pub fn calcWtf8Len(wtf16le: []const u16) usize {
    var total: usize = 0;
    var codepoints = Wtf16LeIterator.init(wtf16le);
    while (true) {
        const codepoint = codepoints.nextCodepoint() orelse break;
        // Caveat: this relies on utf8CodepointSequenceLength accepting
        // surrogate codepoints. Should it ever start returning an error for
        // them, it could no longer be used here.
        const sequence_len = utf8CodepointSequenceLength(codepoint) catch |err| switch (err) {
            error.CodepointTooLarge => unreachable,
        };
        total += sequence_len;
    }
    return total;
}
2140
/// Expectations for `calcWtf8Len`, usable at compile time and run time.
fn testCalcWtf8Len() !void {
    const toUtf16 = utf8ToUtf16LeStringLiteral;

    // ASCII: one byte per code unit.
    try testing.expectEqual(@as(usize, 1), calcWtf8Len(toUtf16("a")));
    try testing.expectEqual(@as(usize, 10), calcWtf8Len(toUtf16("abcdefghij")));

    // Three bytes per codepoint.
    try testing.expectEqual(@as(usize, 15), calcWtf8Len(toUtf16("こんにちは")));

    // An unpaired surrogate takes three bytes.
    const lone_surrogate = [_]u16{mem.nativeToLittle(u16, 0xD800)};
    try testing.expectEqual(@as(usize, 3), calcWtf8Len(&lone_surrogate));

    // First codepoints that are encoded as 1, 2, 3, and 4 bytes
    try testing.expectEqual(@as(usize, 1 + 2 + 3 + 4), calcWtf8Len(toUtf16("\u{0}\u{80}\u{800}\u{10000}")));
}

test "calculate wtf8 string length of given wtf16 string" {
    // Exercise the function both at compile time and at run time.
    try comptime testCalcWtf8Len();
    try testCalcWtf8Len();
}