//! Windows code page identifiers and codepoint decoding for the supported code pages.
1const std = @import("std");
2const windows1252 = @import("windows1252.zig");
3
/// Code pages whose bytes can be decoded into codepoints with `codepointAt`.
/// Each tag's integer value is the Windows code page identifier listed at
/// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
pub const SupportedCodePage = enum(u16) {
    windows1252 = 1252, // windows-1252 ANSI Latin 1; Western European (Windows)
    utf8 = 65001, // utf-8 Unicode (UTF-8)

    /// Decodes the codepoint that starts at `bytes[index]`, interpreting `bytes`
    /// according to `code_page`.
    ///
    /// Returns `null` when `index` is at or past the end of `bytes`. Otherwise
    /// returns the decoded codepoint together with the number of bytes it
    /// occupies, so a caller can advance `index` by `byte_len`. For `.utf8`,
    /// decoding is delegated to `Utf8.WellFormedDecoder.decode`.
    pub fn codepointAt(code_page: SupportedCodePage, index: usize, bytes: []const u8) ?Codepoint {
        if (index >= bytes.len) return null;
        return switch (code_page) {
            // Every possible byte value has a Windows-1252 mapping, so a single
            // byte is always converted directly.
            .windows1252 => Codepoint{
                .value = windows1252.toCodepoint(bytes[index]),
                .byte_len = 1,
            },
            .utf8 => Utf8.WellFormedDecoder.decode(bytes[index..]),
        };
    }
};
25
/// Code page identifiers that are known (so `getByIdentifier` recognizes them)
/// but for which this file provides no decoding. Each tag's integer value is
/// the Windows code page identifier; the trailing comment on each line is the
/// corresponding name/description from
/// https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
///
/// Note: the tags `x_mac_ukranian` and `is8859_8_i` are spelled differently
/// from the names in their trailing comments (`x-mac-ukrainian`,
/// `iso-8859-8-i`). They are kept as-is so existing references still compile.
pub const UnsupportedCodePage = enum(u16) {
    ibm037 = 37, // IBM037 IBM EBCDIC US-Canada
    ibm437 = 437, // IBM437 OEM United States
    ibm500 = 500, // IBM500 IBM EBCDIC International
    asmo708 = 708, // ASMO-708 Arabic (ASMO 708)
    asmo449plus = 709, // Arabic (ASMO-449+, BCON V4)
    transparent_arabic = 710, // Arabic - Transparent Arabic
    dos720 = 720, // DOS-720 Arabic (Transparent ASMO); Arabic (DOS)
    ibm737 = 737, // ibm737 OEM Greek (formerly 437G); Greek (DOS)
    ibm775 = 775, // ibm775 OEM Baltic; Baltic (DOS)
    ibm850 = 850, // ibm850 OEM Multilingual Latin 1; Western European (DOS)
    ibm852 = 852, // ibm852 OEM Latin 2; Central European (DOS)
    ibm855 = 855, // IBM855 OEM Cyrillic (primarily Russian)
    ibm857 = 857, // ibm857 OEM Turkish; Turkish (DOS)
    ibm00858 = 858, // IBM00858 OEM Multilingual Latin 1 + Euro symbol
    ibm860 = 860, // IBM860 OEM Portuguese; Portuguese (DOS)
    ibm861 = 861, // ibm861 OEM Icelandic; Icelandic (DOS)
    dos862 = 862, // DOS-862 OEM Hebrew; Hebrew (DOS)
    ibm863 = 863, // IBM863 OEM French Canadian; French Canadian (DOS)
    ibm864 = 864, // IBM864 OEM Arabic; Arabic (864)
    ibm865 = 865, // IBM865 OEM Nordic; Nordic (DOS)
    cp866 = 866, // cp866 OEM Russian; Cyrillic (DOS)
    ibm869 = 869, // ibm869 OEM Modern Greek; Greek, Modern (DOS)
    ibm870 = 870, // IBM870 IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
    windows874 = 874, // windows-874 Thai (Windows)
    cp875 = 875, // cp875 IBM EBCDIC Greek Modern
    shift_jis = 932, // shift_jis ANSI/OEM Japanese; Japanese (Shift-JIS)
    gb2312 = 936, // gb2312 ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    ks_c_5601_1987 = 949, // ks_c_5601-1987 ANSI/OEM Korean (Unified Hangul Code)
    big5 = 950, // big5 ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    ibm1026 = 1026, // IBM1026 IBM EBCDIC Turkish (Latin 5)
    ibm01047 = 1047, // IBM01047 IBM EBCDIC Latin 1/Open System
    ibm01140 = 1140, // IBM01140 IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
    ibm01141 = 1141, // IBM01141 IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
    ibm01142 = 1142, // IBM01142 IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
    ibm01143 = 1143, // IBM01143 IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
    ibm01144 = 1144, // IBM01144 IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
    ibm01145 = 1145, // IBM01145 IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
    ibm01146 = 1146, // IBM01146 IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
    ibm01147 = 1147, // IBM01147 IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
    ibm01148 = 1148, // IBM01148 IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
    ibm01149 = 1149, // IBM01149 IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
    utf16 = 1200, // utf-16 Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
    utf16_fffe = 1201, // unicodeFFFE Unicode UTF-16, big endian byte order; available only to managed applications
    windows1250 = 1250, // windows-1250 ANSI Central European; Central European (Windows)
    windows1251 = 1251, // windows-1251 ANSI Cyrillic; Cyrillic (Windows)
    windows1253 = 1253, // windows-1253 ANSI Greek; Greek (Windows)
    windows1254 = 1254, // windows-1254 ANSI Turkish; Turkish (Windows)
    windows1255 = 1255, // windows-1255 ANSI Hebrew; Hebrew (Windows)
    windows1256 = 1256, // windows-1256 ANSI Arabic; Arabic (Windows)
    windows1257 = 1257, // windows-1257 ANSI Baltic; Baltic (Windows)
    windows1258 = 1258, // windows-1258 ANSI/OEM Vietnamese; Vietnamese (Windows)
    johab = 1361, // Johab Korean (Johab)
    macintosh = 10000, // macintosh MAC Roman; Western European (Mac)
    x_mac_japanese = 10001, // x-mac-japanese Japanese (Mac)
    x_mac_chinesetrad = 10002, // x-mac-chinesetrad MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
    x_mac_korean = 10003, // x-mac-korean Korean (Mac)
    x_mac_arabic = 10004, // x-mac-arabic Arabic (Mac)
    x_mac_hebrew = 10005, // x-mac-hebrew Hebrew (Mac)
    x_mac_greek = 10006, // x-mac-greek Greek (Mac)
    x_mac_cyrillic = 10007, // x-mac-cyrillic Cyrillic (Mac)
    x_mac_chinesesimp = 10008, // x-mac-chinesesimp MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
    x_mac_romanian = 10010, // x-mac-romanian Romanian (Mac)
    x_mac_ukranian = 10017, // x-mac-ukrainian Ukrainian (Mac)
    x_mac_thai = 10021, // x-mac-thai Thai (Mac)
    x_mac_ce = 10029, // x-mac-ce MAC Latin 2; Central European (Mac)
    x_mac_icelandic = 10079, // x-mac-icelandic Icelandic (Mac)
    x_mac_turkish = 10081, // x-mac-turkish Turkish (Mac)
    x_mac_croatian = 10082, // x-mac-croatian Croatian (Mac)
    utf32 = 12000, // utf-32 Unicode UTF-32, little endian byte order; available only to managed applications
    utf32_be = 12001, // utf-32BE Unicode UTF-32, big endian byte order; available only to managed applications
    x_chinese_cns = 20000, // x-Chinese_CNS CNS Taiwan; Chinese Traditional (CNS)
    x_cp20001 = 20001, // x-cp20001 TCA Taiwan
    x_chinese_eten = 20002, // x_Chinese-Eten Eten Taiwan; Chinese Traditional (Eten)
    x_cp20003 = 20003, // x-cp20003 IBM5550 Taiwan
    x_cp20004 = 20004, // x-cp20004 TeleText Taiwan
    x_cp20005 = 20005, // x-cp20005 Wang Taiwan
    x_ia5 = 20105, // x-IA5 IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    x_ia5_german = 20106, // x-IA5-German IA5 German (7-bit)
    x_ia5_swedish = 20107, // x-IA5-Swedish IA5 Swedish (7-bit)
    x_ia5_norwegian = 20108, // x-IA5-Norwegian IA5 Norwegian (7-bit)
    us_ascii = 20127, // us-ascii US-ASCII (7-bit)
    x_cp20261 = 20261, // x-cp20261 T.61
    x_cp20269 = 20269, // x-cp20269 ISO 6937 Non-Spacing Accent
    ibm273 = 20273, // IBM273 IBM EBCDIC Germany
    ibm277 = 20277, // IBM277 IBM EBCDIC Denmark-Norway
    ibm278 = 20278, // IBM278 IBM EBCDIC Finland-Sweden
    ibm280 = 20280, // IBM280 IBM EBCDIC Italy
    ibm284 = 20284, // IBM284 IBM EBCDIC Latin America-Spain
    ibm285 = 20285, // IBM285 IBM EBCDIC United Kingdom
    ibm290 = 20290, // IBM290 IBM EBCDIC Japanese Katakana Extended
    ibm297 = 20297, // IBM297 IBM EBCDIC France
    ibm420 = 20420, // IBM420 IBM EBCDIC Arabic
    ibm423 = 20423, // IBM423 IBM EBCDIC Greek
    ibm424 = 20424, // IBM424 IBM EBCDIC Hebrew
    x_ebcdic_korean_extended = 20833, // x-EBCDIC-KoreanExtended IBM EBCDIC Korean Extended
    ibm_thai = 20838, // IBM-Thai IBM EBCDIC Thai
    koi8_r = 20866, // koi8-r Russian (KOI8-R); Cyrillic (KOI8-R)
    ibm871 = 20871, // IBM871 IBM EBCDIC Icelandic
    ibm880 = 20880, // IBM880 IBM EBCDIC Cyrillic Russian
    ibm905 = 20905, // IBM905 IBM EBCDIC Turkish
    ibm00924 = 20924, // IBM00924 IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    euc_jp_jis = 20932, // EUC-JP Japanese (JIS 0208-1990 and 0212-1990)
    x_cp20936 = 20936, // x-cp20936 Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    x_cp20949 = 20949, // x-cp20949 Korean Wansung
    cp1025 = 21025, // cp1025 IBM EBCDIC Cyrillic Serbian-Bulgarian
    // = 21027, // (deprecated)
    koi8_u = 21866, // koi8-u Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    iso8859_1 = 28591, // iso-8859-1 ISO 8859-1 Latin 1; Western European (ISO)
    iso8859_2 = 28592, // iso-8859-2 ISO 8859-2 Central European; Central European (ISO)
    iso8859_3 = 28593, // iso-8859-3 ISO 8859-3 Latin 3
    iso8859_4 = 28594, // iso-8859-4 ISO 8859-4 Baltic
    iso8859_5 = 28595, // iso-8859-5 ISO 8859-5 Cyrillic
    iso8859_6 = 28596, // iso-8859-6 ISO 8859-6 Arabic
    iso8859_7 = 28597, // iso-8859-7 ISO 8859-7 Greek
    iso8859_8 = 28598, // iso-8859-8 ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    iso8859_9 = 28599, // iso-8859-9 ISO 8859-9 Turkish
    iso8859_13 = 28603, // iso-8859-13 ISO 8859-13 Estonian
    iso8859_15 = 28605, // iso-8859-15 ISO 8859-15 Latin 9
    x_europa = 29001, // x-Europa Europa 3
    is8859_8_i = 38598, // iso-8859-8-i ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    iso2022_jp = 50220, // iso-2022-jp ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    cs_iso2022_jp = 50221, // csISO2022JP ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    iso2022_jp_jis_x = 50222, // iso-2022-jp ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    iso2022_kr = 50225, // iso-2022-kr ISO 2022 Korean
    x_cp50227 = 50227, // x-cp50227 ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    iso2022_chinesetrad = 50229, // ISO 2022 Traditional Chinese
    ebcdic_jp_katakana_extended = 50930, // EBCDIC Japanese (Katakana) Extended
    ebcdic_us_ca_jp = 50931, // EBCDIC US-Canada and Japanese
    ebcdic_kr_extended = 50933, // EBCDIC Korean Extended and Korean
    ebcdic_chinesesimp_extended = 50935, // EBCDIC Simplified Chinese Extended and Simplified Chinese
    ebcdic_chinesesimp = 50936, // EBCDIC Simplified Chinese
    ebcdic_us_ca_chinesetrad = 50937, // EBCDIC US-Canada and Traditional Chinese
    ebcdic_jp_latin_extended = 50939, // EBCDIC Japanese (Latin) Extended and Japanese
    euc_jp = 51932, // euc-jp EUC Japanese
    euc_cn = 51936, // EUC-CN EUC Simplified Chinese; Chinese Simplified (EUC)
    euc_kr = 51949, // euc-kr EUC Korean
    euc_chinesetrad = 51950, // EUC Traditional Chinese
    hz_gb2312 = 52936, // hz-gb-2312 HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    gb18030 = 54936, // GB18030 Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    x_iscii_de = 57002, // x-iscii-de ISCII Devanagari
    x_iscii_be = 57003, // x-iscii-be ISCII Bangla
    x_iscii_ta = 57004, // x-iscii-ta ISCII Tamil
    x_iscii_te = 57005, // x-iscii-te ISCII Telugu
    x_iscii_as = 57006, // x-iscii-as ISCII Assamese
    x_iscii_or = 57007, // x-iscii-or ISCII Odia
    x_iscii_ka = 57008, // x-iscii-ka ISCII Kannada
    x_iscii_ma = 57009, // x-iscii-ma ISCII Malayalam
    x_iscii_gu = 57010, // x-iscii-gu ISCII Gujarati
    x_iscii_pa = 57011, // x-iscii-pa ISCII Punjabi
    utf7 = 65000, // utf-7 Unicode (UTF-7)
};
179
/// Enum containing every tag of `SupportedCodePage` followed by every tag of
/// `UnsupportedCodePage`, with the same names and `u16` values. It is built
/// at compile time so the two source enums remain the only place where code
/// pages are listed.
pub const CodePage = blk: {
    const supported = @typeInfo(SupportedCodePage).@"enum".fields;
    const unsupported = @typeInfo(UnsupportedCodePage).@"enum".fields;
    const all_fields = supported ++ unsupported;

    // Copy each (name, value) pair into the parallel arrays that `@Enum` takes.
    var names: [all_fields.len][]const u8 = undefined;
    var values: [all_fields.len]u16 = undefined;
    for (all_fields, 0..) |field, i| {
        names[i] = field.name;
        values[i] = field.value;
    }
    break :blk @Enum(u16, .exhaustive, &names, &values);
};
190
/// Returns `true` if `code_page` has the same identifier as one of the tags
/// of `SupportedCodePage`, and `false` otherwise.
pub fn isSupported(code_page: CodePage) bool {
    const identifier = @intFromEnum(code_page);
    // Unrolled at compile time: one comparison per `SupportedCodePage` tag.
    inline for (@typeInfo(SupportedCodePage).@"enum".fields) |field| {
        if (identifier == field.value) return true;
    }
    return false;
}
199
/// Looks up the `CodePage` tag whose integer value equals `identifier`.
///
/// Returns `error.InvalidCodePage` when no tag of `CodePage` has that value.
pub fn getByIdentifier(identifier: u16) !CodePage {
    // This is a linear comparison against every tag, unrolled at compile time.
    // A lookup table would probably be more efficient, but this function is
    // not expected to be called often, so the simple approach is kept.
    inline for (@typeInfo(CodePage).@"enum".fields) |field| {
        if (field.value == identifier) return @field(CodePage, field.name);
    }
    return error.InvalidCodePage;
}
210
/// Like `getByIdentifier`, but only succeeds for code pages that can be decoded.
///
/// Returns `error.InvalidCodePage` when `identifier` is not a known code page,
/// and `error.UnsupportedCodePage` when it is known but `isSupported` is false.
pub fn getByIdentifierEnsureSupported(identifier: u16) !SupportedCodePage {
    const code_page = try getByIdentifier(identifier);
    if (!isSupported(code_page)) return error.UnsupportedCodePage;
    // `isSupported` confirmed that this integer value is a `SupportedCodePage` tag.
    return @enumFromInt(@intFromEnum(code_page));
}
218
pub const Utf8 = struct {
    /// Decoder that rejects ill-formed UTF-8 sequences, following section D92
    /// of Chapter 3 of the Unicode standard (specifically Table 3-7).
    ///
    /// Note: the length reported for an invalid sequence does not follow
    /// "U+FFFD Substitution of Maximal Subparts"; it instead matches the
    /// behavior of the Windows RC compiler.
    pub const WellFormedDecoder = struct {
        /// Returns the total length in bytes of the sequence introduced by
        /// `first_byte`, or `null` if `first_byte` cannot start a well-formed
        /// sequence.
        ///
        /// Similar to std.unicode.utf8ByteSequenceLength, except that:
        /// - first bytes that are never well-formed (C0-C1, F5-FF) are rejected
        /// - the result is an optional instead of an error union
        pub fn sequenceLength(first_byte: u8) ?u3 {
            if (first_byte <= 0x7F) return 1;
            if (first_byte >= 0xC2 and first_byte <= 0xDF) return 2;
            if (first_byte >= 0xE0 and first_byte <= 0xEF) return 3;
            if (first_byte >= 0xF0 and first_byte <= 0xF4) return 4;
            return null;
        }

        /// Returns whether `byte` lies in the continuation byte range 80-BF.
        fn isContinuationByte(byte: u8) bool {
            return byte >= 0x80 and byte <= 0xBF;
        }

        /// Returns whether `byte` is allowed as the second byte of a sequence
        /// starting with `first_byte`. The first bytes E0, ED, F0 and F4 narrow
        /// the permitted range; all other first bytes accept any continuation
        /// byte. See Table 3-7 of D92 in Chapter 3 of the Unicode Standard.
        fn isValidSecondByte(first_byte: u8, byte: u8) bool {
            return switch (first_byte) {
                0xE0 => byte >= 0xA0 and byte <= 0xBF,
                0xED => byte >= 0x80 and byte <= 0x9F,
                0xF0 => byte >= 0x90 and byte <= 0xBF,
                0xF4 => byte >= 0x80 and byte <= 0x8F,
                else => isContinuationByte(byte),
            };
        }

        /// Decodes the single codepoint at the start of `bytes`.
        ///
        /// Asserts that `bytes` is not empty.
        ///
        /// For a well-formed sequence, returns its value and length. Otherwise
        /// `value` is `Codepoint.invalid` and `byte_len` is the length of the
        /// invalid sequence:
        /// - 1 if the first byte cannot start a sequence
        /// - if a later byte is not allowed at its position: the number of
        ///   bytes before it, plus one if that byte is in the range 80-BF
        /// - if `bytes` ends before the sequence is complete: `bytes.len`
        pub fn decode(bytes: []const u8) Codepoint {
            std.debug.assert(bytes.len > 0);
            const lead = bytes[0];
            const expected_len = sequenceLength(lead) orelse
                return .{ .value = Codepoint.invalid, .byte_len = 1 };
            if (expected_len == 1) return .{ .value = lead, .byte_len = 1 };

            // The 5-bit mask is exact for 2- and 3-byte leads. A 4-byte lead
            // (F0-F4) leaves bit 4 set, but by the time all three following
            // bytes have been shifted in, that bit has moved past the top of
            // the u21 and been discarded by `<<`.
            var value: u21 = lead & 0b00011111;
            var consumed: u8 = 1;
            while (consumed < @min(bytes.len, expected_len)) : (consumed += 1) {
                const byte = bytes[consumed];
                const acceptable = if (consumed == 1)
                    isValidSecondByte(lead, byte)
                else
                    isContinuationByte(byte);

                if (!acceptable) {
                    // The offending byte only counts as part of the invalid
                    // sequence when it is in the continuation byte range; any
                    // other value is left to be decoded on its own.
                    const invalid_len: u8 = if (isContinuationByte(byte)) consumed + 1 else consumed;
                    return .{ .value = Codepoint.invalid, .byte_len = invalid_len };
                }

                value = (value << 6) | (byte & 0b00111111);
            }

            // Input ended before the full sequence was available.
            if (consumed != expected_len) {
                return .{ .value = Codepoint.invalid, .byte_len = consumed };
            }
            return .{ .value = value, .byte_len = expected_len };
        }
    };
};
307
test "Utf8.WellFormedDecoder" {
    // F0 must be followed by a byte in 90-BF, so 80 is rejected. Because 80 is
    // still in the continuation byte range, it is counted as part of the
    // invalid sequence, giving a length of 2.
    const result = Utf8.WellFormedDecoder.decode("\xF0\x80");
    try std.testing.expectEqual(Codepoint.invalid, result.value);
    try std.testing.expectEqual(@as(usize, 2), result.byte_len);
}
314
test "codepointAt invalid utf8" {
    // Each case walks an ill-formed input the way a caller would: start at
    // index 0, then advance by the reported `byte_len` until `codepointAt`
    // returns null at the end of the input.
    const check = struct {
        /// Expects an invalid codepoint spanning `byte_len` bytes at `index`.
        fn expectInvalid(bytes: []const u8, index: usize, byte_len: usize) !void {
            try std.testing.expectEqual(Codepoint{
                .value = Codepoint.invalid,
                .byte_len = byte_len,
            }, SupportedCodePage.utf8.codepointAt(index, bytes).?);
        }

        /// Expects `index` to be past the end of `bytes`.
        fn expectEnd(bytes: []const u8, index: usize) !void {
            try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(index, bytes));
        }
    };

    {
        // A lead byte followed by another lead byte, then stray continuation bytes
        const input = "\xf0\xf0\x80\x80\x80";
        try check.expectInvalid(input, 0, 1);
        try check.expectInvalid(input, 1, 2);
        try check.expectInvalid(input, 3, 1);
        try check.expectInvalid(input, 4, 1);
        try check.expectEnd(input, 5);
    }

    {
        // Third byte is not a continuation byte, so it is not part of the invalid sequence
        const input = "\xE1\xA0\xC0";
        try check.expectInvalid(input, 0, 2);
        try check.expectInvalid(input, 2, 1);
        try check.expectEnd(input, 3);
    }

    {
        // Truncated 2-byte sequence
        const input = "\xD2";
        try check.expectInvalid(input, 0, 1);
        try check.expectEnd(input, 1);
    }

    {
        // Truncated 3-byte sequence
        const input = "\xE1\xA0";
        try check.expectInvalid(input, 0, 2);
        try check.expectEnd(input, 2);
    }

    {
        // Second byte is outside the continuation byte range
        const input = "\xC5\xFF";
        try check.expectInvalid(input, 0, 1);
        try check.expectInvalid(input, 1, 1);
        try check.expectEnd(input, 2);
    }

    {
        // encoded high surrogate
        const input = "\xED\xA0\xBD";
        try check.expectInvalid(input, 0, 2);
        try check.expectInvalid(input, 2, 1);
        // Like every other case, confirm the walk terminates at the end of the input.
        try check.expectEnd(input, 3);
    }
}
394
test "codepointAt utf8 encoded" {
    const expectEqual = std.testing.expectEqual;
    const end_of_input: ?Codepoint = null;

    // The same two bytes (C2 B2) are read once as UTF-8 and once as Windows-1252.
    const encoded = "²";

    // Read as UTF-8: a single 2-byte codepoint.
    const as_utf8 = SupportedCodePage.utf8;
    try expectEqual(Codepoint{ .value = '²', .byte_len = 2 }, as_utf8.codepointAt(0, encoded).?);
    try expectEqual(end_of_input, as_utf8.codepointAt(2, encoded));

    // Read as Windows-1252: each byte is its own codepoint.
    const as_windows1252 = SupportedCodePage.windows1252;
    try expectEqual(Codepoint{ .value = '\xC2', .byte_len = 1 }, as_windows1252.codepointAt(0, encoded).?);
    try expectEqual(Codepoint{ .value = '\xB2', .byte_len = 1 }, as_windows1252.codepointAt(1, encoded).?);
    try expectEqual(end_of_input, as_windows1252.codepointAt(2, encoded));
}
416
test "codepointAt windows1252 encoded" {
    // A single B2 byte: a complete character in Windows-1252, but a lone
    // continuation byte (ill-formed) when read as UTF-8.
    const windows1252_encoded = "\xB2";

    // with code page utf8
    try std.testing.expectEqual(Codepoint{
        .value = Codepoint.invalid,
        .byte_len = 1,
    }, SupportedCodePage.utf8.codepointAt(0, windows1252_encoded).?);
    // Index 1 is the first out-of-range index for this 1-byte input, so check
    // the exact boundary in addition to an index further past the end.
    try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(1, windows1252_encoded));
    try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.utf8.codepointAt(2, windows1252_encoded));

    // with code page windows1252
    try std.testing.expectEqual(Codepoint{
        .value = '\xB2',
        .byte_len = 1,
    }, SupportedCodePage.windows1252.codepointAt(0, windows1252_encoded).?);
    try std.testing.expectEqual(@as(?Codepoint, null), SupportedCodePage.windows1252.codepointAt(1, windows1252_encoded));
}
434
/// Result of decoding one codepoint from a byte sequence.
pub const Codepoint = struct {
    /// The decoded codepoint, or `invalid` if the bytes could not be decoded.
    value: u21,
    /// Number of input bytes this result covers.
    byte_len: usize,

    /// Sentinel stored in `value` when decoding fails. It is the largest value
    /// a u21 can hold (0x1FFFFF), which is above the Unicode maximum of
    /// U+10FFFF and so cannot collide with a real codepoint.
    pub const invalid: u21 = std.math.maxInt(u21);
};