zig/lib/compiler/resinator/lang.zig at master

  1const std = @import("std");
  2
  3/// This function is specific to how the Win32 RC command line interprets
  4/// language IDs specified as integers.
  5/// - Always interpreted as hexadecimal, but explicit 0x prefix is also allowed
  6/// - Wraps on overflow of u16
  7/// - Stops parsing on any invalid hexadecimal digits
  8/// - Errors if a digit is not the first char
  9/// - `-` (negative) prefix is allowed
 10pub fn parseInt(str: []const u8) error{InvalidLanguageId}!u16 {
 11    var result: u16 = 0;
 12    const radix: u8 = 16;
 13    var buf = str;
 14
 15    const Prefix = enum { none, minus };
 16    var prefix: Prefix = .none;
 17    switch (buf[0]) {
 18        '-' => {
 19            prefix = .minus;
 20            buf = buf[1..];
 21        },
 22        else => {},
 23    }
 24
 25    if (buf.len > 2 and buf[0] == '0' and buf[1] == 'x') {
 26        buf = buf[2..];
 27    }
 28
 29    for (buf, 0..) |c, i| {
 30        const digit = switch (c) {
 31            // On invalid digit for the radix, just stop parsing but don't fail
 32            'a'...'f', 'A'...'F', '0'...'9' => std.fmt.charToDigit(c, radix) catch break,
 33            else => {
 34                // First digit must be valid
 35                if (i == 0) {
 36                    return error.InvalidLanguageId;
 37                }
 38                break;
 39            },
 40        };
 41
 42        if (result != 0) {
 43            result *%= radix;
 44        }
 45        result +%= digit;
 46    }
 47
 48    switch (prefix) {
 49        .none => {},
 50        .minus => result = 0 -% result,
 51    }
 52
 53    return result;
 54}
 55
 56test parseInt {
 57    try std.testing.expectEqual(@as(u16, 0x16), try parseInt("16"));
 58    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1A"));
 59    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1Azzzz"));
 60    try std.testing.expectEqual(@as(u16, 0xffff), try parseInt("-1"));
 61    try std.testing.expectEqual(@as(u16, 0xffea), try parseInt("-0x16"));
 62    try std.testing.expectEqual(@as(u16, 0x0), try parseInt("0o100"));
 63    try std.testing.expectEqual(@as(u16, 0x1), try parseInt("10001"));
 64    try std.testing.expectError(error.InvalidLanguageId, parseInt("--1"));
 65    try std.testing.expectError(error.InvalidLanguageId, parseInt("0xha"));
 66    try std.testing.expectError(error.InvalidLanguageId, parseInt("¹"));
 67    try std.testing.expectError(error.InvalidLanguageId, parseInt("~1"));
 68}
 69
 70/// This function is specific to how the Win32 RC command line interprets
 71/// language tags: invalid tags are rejected, but tags that don't have
 72/// a specific assigned ID but are otherwise valid enough will get
 73/// converted to an ID of LOCALE_CUSTOM_UNSPECIFIED.
 74pub fn tagToInt(tag: []const u8) error{InvalidLanguageTag}!u16 {
 75    const maybe_id = try tagToId(tag);
 76    if (maybe_id) |id| {
 77        return @intFromEnum(id);
 78    } else {
 79        return LOCALE_CUSTOM_UNSPECIFIED;
 80    }
 81}
 82
 83pub fn tagToId(tag: []const u8) error{InvalidLanguageTag}!?LanguageId {
 84    const parsed = try parse(tag);
 85    // There are currently no language tags with assigned IDs that have
 86    // multiple suffixes, so we can skip the lookup.
 87    if (parsed.multiple_suffixes) return null;
 88    const longest_known_tag = comptime blk: {
 89        var len = 0;
 90        for (@typeInfo(LanguageId).@"enum".fields) |field| {
 91            if (field.name.len > len) len = field.name.len;
 92        }
 93        break :blk len;
 94    };
 95    // If the tag is longer than the longest tag that has an assigned ID,
 96    // then we can skip the lookup.
 97    if (tag.len > longest_known_tag) return null;
 98    var normalized_buf: [longest_known_tag]u8 = undefined;
 99    // To allow e.g. `de-de_phoneb` to get looked up as `de-de`, we need to
100    // omit the suffix, but only if the tag contains a valid alternate sort order.
101    const tag_to_normalize = if (parsed.isSuffixValidSortOrder()) tag[0 .. tag.len - (parsed.suffix.?.len + 1)] else tag;
102    const normalized_tag = normalizeTag(tag_to_normalize, &normalized_buf);
103    return std.meta.stringToEnum(LanguageId, normalized_tag) orelse {
104        // special case for a tag that has been mapped to the same ID
105        // twice.
106        if (std.mem.eql(u8, "ff_latn_ng", normalized_tag)) {
107            return LanguageId.ff_ng;
108        }
109        return null;
110    };
111}
112
test tagToId {
    const expectEqual = std.testing.expectEqual;

    // Both separators and any letter case resolve to the same ID.
    for ([_][]const u8{ "ar-ae", "AR_AE" }) |tag| {
        try expectEqual(LanguageId.ar_ae, (try tagToId(tag)).?);
    }

    // `ff-Latn-NG` is special-cased to share the ID of `ff-ng`.
    for ([_][]const u8{ "ff-ng", "ff-Latn-NG" }) |tag| {
        try expectEqual(LanguageId.ff_ng, (try tagToId(tag)).?);
    }
}
120
// Checks that every `LanguageId` field name resolves to itself via `tagToId`,
// and that every entry of `valid_alternate_sorts` resolves to the ID of its
// `<language>_<country>` tag once the sort-order suffix is stripped.
test "exhaustive tagToId" {
    // The `inline for` loops below are unrolled at comptime over every enum
    // field; presumably the default branch quota is too low for that.
    @setEvalBranchQuota(2000);
    // Part 1: each enum field name, used directly as a tag, must map to its own ID.
    inline for (@typeInfo(LanguageId).@"enum".fields) |field| {
        const id = tagToId(field.name) catch |err| {
            std.debug.print("tag: {s}\n", .{field.name});
            return err;
        };
        try std.testing.expectEqual(@field(LanguageId, field.name), id orelse {
            std.debug.print("tag: {s}, got null\n", .{field.name});
            return error.TestExpectedEqual;
        });
    }
    // Part 2: build `<language>-<country>-<suffix>` for each alternate sort
    // and expect it to resolve to the `<language>_<country>` ID.
    var buf: [32]u8 = undefined;
    inline for (valid_alternate_sorts) |parsed_sort| {
        var fbs: std.Io.Writer = .fixed(&buf);
        const writer = &fbs;
        writer.writeAll(parsed_sort.language_code) catch unreachable;
        writer.writeAll("-") catch unreachable;
        writer.writeAll(parsed_sort.country_code.?) catch unreachable;
        writer.writeAll("-") catch unreachable;
        writer.writeAll(parsed_sort.suffix.?) catch unreachable;
        // The 5-byte buffer and the fixed `_` at index 2 rely on every entry
        // of `valid_alternate_sorts` having a 2-letter language code and a
        // 2-letter country code.
        const expected_field_name = comptime field: {
            var name_buf: [5]u8 = undefined;
            @memcpy(name_buf[0..parsed_sort.language_code.len], parsed_sort.language_code);
            name_buf[2] = '_';
            @memcpy(name_buf[3..], parsed_sort.country_code.?);
            break :field name_buf;
        };
        const expected = @field(LanguageId, &expected_field_name);
        const id = tagToId(fbs.buffered()) catch |err| {
            std.debug.print("tag: {s}\n", .{fbs.buffered()});
            return err;
        };
        try std.testing.expectEqual(expected, id orelse {
            std.debug.print("tag: {s}, expected: {}, got null\n", .{ fbs.buffered(), expected });
            return error.TestExpectedEqual;
        });
    }
}
160
/// Writes a normalized copy of `tag` into `buf` and returns the written
/// portion: every `-` becomes `_` and ASCII letters are lowercased.
/// Asserts that `buf` is at least as long as `tag`.
fn normalizeTag(tag: []const u8, buf: []u8) []u8 {
    std.debug.assert(buf.len >= tag.len);
    const out = buf[0..tag.len];
    for (out, tag) |*dst, src| {
        dst.* = switch (src) {
            '-' => '_',
            else => std.ascii.toLower(src),
        };
    }
    return out;
}
171
/// Language ID used for valid language tags that have no specific assigned ID.
///
/// https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-LCID/%5bMS-LCID%5d.pdf#%5B%7B%22num%22%3A72%2C%22gen%22%3A0%7D%2C%7B%22name%22%3A%22XYZ%22%7D%2C69%2C574%2C0%5D
/// "When an LCID is requested for a locale without a
/// permanent LCID assignment, nor a temporary
/// assignment as above, the protocol will respond
/// with LOCALE_CUSTOM_UNSPECIFIED for all such
/// locales. Because this single value is used for
/// numerous possible locale names, it is impossible to
/// round trip this locale, even temporarily.
/// Applications should discard this value as soon as
/// possible and never persist it. If the system is
/// forced to respond to a request for
/// LCID_CUSTOM_UNSPECIFIED, it will fall back to
/// the current user locale. This is often incorrect but
/// may prevent an application or component from
/// failing. As the meaning of this temporary LCID is
/// unstable, it should never be used for interchange
/// or persisted data. This is a 1-to-many relationship
/// that is very unstable."
pub const LOCALE_CUSTOM_UNSPECIFIED = 0x1000;

/// Presumably the Win32 `LANG_ENGLISH` primary language ID (verify against winnt.h).
pub const LANG_ENGLISH = 0x09;
/// Presumably the Win32 `SUBLANG_ENGLISH_US` sublanguage ID (verify against winnt.h).
pub const SUBLANG_ENGLISH_US = 0x01;
194
/// Combines a primary language ID and a sublanguage ID into a 16-bit language
/// ID, mirroring the Win32 `MAKELANGID` macro: the sublanguage occupies the
/// upper 6 bits and the primary language occupies the lower 10 bits.
/// For example, `MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US)` is `0x0409`.
///
/// https://learn.microsoft.com/en-us/windows/win32/intl/language-identifiers
pub fn MAKELANGID(primary: u10, sublang: u6) u16 {
    return (@as(u16, sublang) << 10) | primary;
}
199
/// Splits a language tag into its language code, script tag, country code and
/// suffix components, validating each part along the way.
///
/// Language tag format expressed as a regular expression (rough approximation):
///
/// [a-zA-Z]{1,3}([-_][a-zA-Z]{4})?([-_][a-zA-Z]{2})?([-_][a-zA-Z0-9]{1,8})?
///     lang    |     script      |      country    |       suffix
///
/// Notes:
/// - Parts may be separated by either `-` or `_`.
/// - If lang code is 1 char, it seems to mean that everything afterwards uses suffix
///   parsing rules (e.g. `a-0` and `a-00000000` are allowed).
/// - There can also be any number of trailing suffix parts as long as they each
///   would be a valid suffix part, e.g. `en-us-blah-blah1-blah2-blah3` is allowed.
/// - When doing lookups, trailing suffix parts are taken into account, e.g.
///   `ca-es-valencia` is not considered equivalent to `ca-es-valencia-blah`.
/// - A suffix is only allowed if:
///   + Lang code is 1 char long, or
///   + A country code is present, or
///   + A script tag is not present and:
///      - the suffix is numeric-only and has a length of 3, or
///      - the lang is `qps` and the suffix is `ploca` or `plocm`
///
/// Returns `error.InvalidLanguageTag` if any part violates the rules above.
/// The slices stored in the returned `Parsed` are sub-slices of `lang_tag`.
pub fn parse(lang_tag: []const u8) error{InvalidLanguageTag}!Parsed {
    var it = std.mem.splitAny(u8, lang_tag, "-_");
    const lang_code = it.first();
    const is_valid_lang_code = lang_code.len >= 1 and lang_code.len <= 3 and isAllAlphabetic(lang_code);
    if (!is_valid_lang_code) return error.InvalidLanguageTag;
    var parsed = Parsed{
        .language_code = lang_code,
    };
    // The second part could be a script tag, a country code, or a suffix
    if (it.next()) |part_str| {
        // The lang code being length 1 behaves strangely, so fully special case it.
        if (lang_code.len == 1) {
            // This is almost certainly not the 'right' way to do this, but I don't have a method
            // to determine how exactly these language tags are parsed, and it seems like
            // suffix parsing rules apply generally (digits allowed, length of 1 to 8).
            //
            // However, because we want to be able to lookup `x-iv-mathan` normally without
            // `multiple_suffixes` being set to true, we need to make sure to treat two-length
            // alphabetic parts as a country code.
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            }
            // Everything else, though, we can just throw into the suffix as long as the normal
            // rules apply.
            else if (part_str.len > 0 and part_str.len <= 8 and isAllAlphanumeric(part_str)) {
                parsed.suffix = part_str;
            } else {
                return error.InvalidLanguageTag;
            }
        } else if (part_str.len == 4 and isAllAlphabetic(part_str)) {
            parsed.script_tag = part_str;
        } else if (part_str.len == 2 and isAllAlphabetic(part_str)) {
            parsed.country_code = part_str;
        }
        // Only a 3-len numeric suffix is allowed as the second part of a tag
        else if (part_str.len == 3 and isAllNumeric(part_str)) {
            parsed.suffix = part_str;
        }
        // Special case for qps-ploca and qps-plocm
        else if (std.ascii.eqlIgnoreCase(lang_code, "qps") and
            (std.ascii.eqlIgnoreCase(part_str, "ploca") or
                std.ascii.eqlIgnoreCase(part_str, "plocm")))
        {
            parsed.suffix = part_str;
        } else {
            return error.InvalidLanguageTag;
        }
    } else {
        // If there's no part besides a 1-len lang code, then it is malformed
        if (lang_code.len == 1) return error.InvalidLanguageTag;
        return parsed;
    }
    // A script tag may only be followed by a country code (or by nothing).
    if (parsed.script_tag != null) {
        if (it.next()) |part_str| {
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            } else {
                // Suffix is not allowed when a country code is not present.
                return error.InvalidLanguageTag;
            }
        } else {
            return parsed;
        }
    }
    // We've now parsed any potential script tag/country codes, so anything remaining
    // is a suffix
    while (it.next()) |part_str| {
        if (part_str.len == 0 or part_str.len > 8 or !isAllAlphanumeric(part_str)) {
            return error.InvalidLanguageTag;
        }
        // Only the first suffix is stored; any further suffix just sets the flag.
        if (parsed.suffix == null) {
            parsed.suffix = part_str;
        } else {
            // In theory we could return early here but we still want to validate
            // that each part is a valid suffix all the way to the end, e.g.
            // we should reject `en-us-suffix-a-b-c-!!!` because of the invalid `!!!`
            // suffix part.
            parsed.multiple_suffixes = true;
        }
    }
    return parsed;
}
300
/// The components of a language tag, as produced by `parse`.
pub const Parsed = struct {
    language_code: []const u8,
    script_tag: ?[]const u8 = null,
    country_code: ?[]const u8 = null,
    /// Can be a sort order (e.g. phoneb) or something like valencia, 001, etc
    suffix: ?[]const u8 = null,
    /// There can be any number of suffixes, but we don't need to care what their
    /// values are, we just need to know if any exist so that e.g. `ca-es-valencia-blah`
    /// can be seen as different from `ca-es-valencia`. Storing this as a bool
    /// allows us to avoid needing either (a) dynamic allocation or (b) a limit to
    /// the number of suffixes allowed when parsing.
    multiple_suffixes: bool = false,

    /// Returns true if this tag consists of exactly a language code, a country
    /// code and a single suffix (no script tag) that together match an entry
    /// of `valid_alternate_sorts`, compared ASCII case-insensitively.
    pub fn isSuffixValidSortOrder(self: Parsed) bool {
        if (self.script_tag != null or self.multiple_suffixes) return false;
        const country = self.country_code orelse return false;
        const sort_order = self.suffix orelse return false;

        for (valid_alternate_sorts) |candidate| {
            const is_match = std.ascii.eqlIgnoreCase(candidate.language_code, self.language_code) and
                std.ascii.eqlIgnoreCase(candidate.country_code.?, country) and
                std.ascii.eqlIgnoreCase(candidate.suffix.?, sort_order);
            if (is_match) return true;
        }
        return false;
    }
};
330
/// Language/country/suffix combinations whose suffix names an alternate sort order.
///
/// https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// See the table following this text: "Alternate sorts can be selected by using one of the identifiers from the following table."
///
/// Every entry must set both `country_code` and `suffix`: they are unwrapped
/// with `.?` by `Parsed.isSuffixValidSortOrder`.
const valid_alternate_sorts = [_]Parsed{
    // Note: x-IV-mathan is omitted due to how lookups are implemented.
    //       This table is used to make e.g. `de-de_phoneb` get looked up
    //       as `de-de` (the suffix is omitted for the lookup), but x-iv-mathan
    //       instead needs to be looked up with the suffix included because
    //       `x-iv` is not a tag with an assigned ID.
    .{ .language_code = "de", .country_code = "de", .suffix = "phoneb" },
    .{ .language_code = "hu", .country_code = "hu", .suffix = "tchncl" },
    .{ .language_code = "ka", .country_code = "ge", .suffix = "modern" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "pronun" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "radstr" },
    .{ .language_code = "ja", .country_code = "jp", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "hk", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "phoneb" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "phoneb" },
};
353
test "parse" {
    const Case = struct { tag: []const u8, expected: Parsed };

    // Tags that must parse successfully, with their expected components.
    const valid_cases = [_]Case{
        .{ .tag = "en", .expected = .{ .language_code = "en" } },
        .{ .tag = "en-us", .expected = .{ .language_code = "en", .country_code = "us" } },
        .{ .tag = "en-123", .expected = .{ .language_code = "en", .suffix = "123" } },
        .{ .tag = "en-123-blah", .expected = .{
            .language_code = "en",
            .suffix = "123",
            .multiple_suffixes = true,
        } },
        .{ .tag = "en-us_123-blah", .expected = .{
            .language_code = "en",
            .country_code = "us",
            .suffix = "123",
            .multiple_suffixes = true,
        } },
        .{ .tag = "eng-Latn", .expected = .{ .language_code = "eng", .script_tag = "Latn" } },
        .{ .tag = "ff-Latn-NG", .expected = .{
            .language_code = "ff",
            .script_tag = "Latn",
            .country_code = "NG",
        } },
        .{ .tag = "qps-Plocm", .expected = .{ .language_code = "qps", .suffix = "Plocm" } },
        .{ .tag = "qps-ploca", .expected = .{ .language_code = "qps", .suffix = "ploca" } },
        .{ .tag = "x-IV-mathan", .expected = .{
            .language_code = "x",
            .country_code = "IV",
            .suffix = "mathan",
        } },
        .{ .tag = "a-a", .expected = .{ .language_code = "a", .suffix = "a" } },
        .{ .tag = "a-000", .expected = .{ .language_code = "a", .suffix = "000" } },
        .{ .tag = "a-00000000", .expected = .{ .language_code = "a", .suffix = "00000000" } },
    };
    for (valid_cases) |case| {
        // Print the offending tag on failure, since the loop hides which case failed.
        const actual = parse(case.tag) catch |err| {
            std.debug.print("tag: {s}\n", .{case.tag});
            return err;
        };
        std.testing.expectEqualDeep(case.expected, actual) catch |err| {
            std.debug.print("tag: {s}\n", .{case.tag});
            return err;
        };
    }

    // Tags that must be rejected.
    const invalid_tags = [_][]const u8{
        // suffix not allowed if script tag is present without country code
        "eng-Latn-suffix",
        // suffix must be 3 numeric digits if neither script tag nor country code is present
        "eng-suffix",
        "en-plocm",
        // 1-len lang code is not allowed if it's the only part
        "e",
    };
    for (invalid_tags) |tag| {
        std.testing.expectError(error.InvalidLanguageTag, parse(tag)) catch |err| {
            std.debug.print("tag: {s}\n", .{tag});
            return err;
        };
    }
}
423
/// Returns true if every byte of `str` is an ASCII letter.
/// An empty string yields true.
fn isAllAlphabetic(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isAlphabetic(byte)) break false;
    } else true;
}
430
/// Returns true if every byte of `str` is an ASCII letter or digit.
/// An empty string yields true.
fn isAllAlphanumeric(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isAlphanumeric(byte)) break false;
    } else true;
}
437
/// Returns true if every byte of `str` is an ASCII decimal digit.
/// An empty string yields true.
fn isAllNumeric(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isDigit(byte)) break false;
    } else true;
}
444
445/// Derived from https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
446/// - Protocol Revision: 15.0
447/// - Language / Language ID / Language Tag table in Appendix A
448/// - Removed all rows that have Language ID 0x1000 (LOCALE_CUSTOM_UNSPECIFIED)
449/// - Normalized each language tag (lowercased, replaced all `-` with `_`)
450/// - There is one special case where two tags are mapped to the same ID, the following
451///   has been omitted and must be special cased during lookup to map to the ID ff_ng / 0x0467.
452///     ff_latn_ng = 0x0467, // Fulah (Latin), Nigeria
453/// - x_iv_mathan has been added which is not in the table but does appear in the Alternate sorts
454///   table as 0x007F (LANG_INVARIANT).
455pub const LanguageId = enum(u16) {
456    // Language tag = Language ID, // Language, Location (or type)
457    af = 0x0036, // Afrikaans
458    af_za = 0x0436, // Afrikaans, South Africa
459    sq = 0x001C, // Albanian
460    sq_al = 0x041C, // Albanian, Albania
461    gsw = 0x0084, // Alsatian
462    gsw_fr = 0x0484, // Alsatian, France
463    am = 0x005E, // Amharic
464    am_et = 0x045E, // Amharic, Ethiopia
465    ar = 0x0001, // Arabic
466    ar_dz = 0x1401, // Arabic, Algeria
467    ar_bh = 0x3C01, // Arabic, Bahrain
468    ar_eg = 0x0c01, // Arabic, Egypt
469    ar_iq = 0x0801, // Arabic, Iraq
470    ar_jo = 0x2C01, // Arabic, Jordan
471    ar_kw = 0x3401, // Arabic, Kuwait
472    ar_lb = 0x3001, // Arabic, Lebanon
473    ar_ly = 0x1001, // Arabic, Libya
474    ar_ma = 0x1801, // Arabic, Morocco
475    ar_om = 0x2001, // Arabic, Oman
476    ar_qa = 0x4001, // Arabic, Qatar
477    ar_sa = 0x0401, // Arabic, Saudi Arabia
478    ar_sy = 0x2801, // Arabic, Syria
479    ar_tn = 0x1C01, // Arabic, Tunisia
480    ar_ae = 0x3801, // Arabic, U.A.E.
481    ar_ye = 0x2401, // Arabic, Yemen
482    hy = 0x002B, // Armenian
483    hy_am = 0x042B, // Armenian, Armenia
484    as = 0x004D, // Assamese
485    as_in = 0x044D, // Assamese, India
486    az_cyrl = 0x742C, // Azerbaijani (Cyrillic)
487    az_cyrl_az = 0x082C, // Azerbaijani (Cyrillic), Azerbaijan
488    az = 0x002C, // Azerbaijani (Latin)
489    az_latn = 0x782C, // Azerbaijani (Latin)
490    az_latn_az = 0x042C, // Azerbaijani (Latin), Azerbaijan
491    bn = 0x0045, // Bangla
492    bn_bd = 0x0845, // Bangla, Bangladesh
493    bn_in = 0x0445, // Bangla, India
494    ba = 0x006D, // Bashkir
495    ba_ru = 0x046D, // Bashkir, Russia
496    eu = 0x002D, // Basque
497    eu_es = 0x042D, // Basque, Spain
498    be = 0x0023, // Belarusian
499    be_by = 0x0423, // Belarusian, Belarus
500    bs_cyrl = 0x641A, // Bosnian (Cyrillic)
501    bs_cyrl_ba = 0x201A, // Bosnian (Cyrillic), Bosnia and Herzegovina
502    bs_latn = 0x681A, // Bosnian (Latin)
503    bs = 0x781A, // Bosnian (Latin)
504    bs_latn_ba = 0x141A, // Bosnian (Latin), Bosnia and Herzegovina
505    br = 0x007E, // Breton
506    br_fr = 0x047E, // Breton, France
507    bg = 0x0002, // Bulgarian
508    bg_bg = 0x0402, // Bulgarian, Bulgaria
509    my = 0x0055, // Burmese
510    my_mm = 0x0455, // Burmese, Myanmar
511    ca = 0x0003, // Catalan
512    ca_es = 0x0403, // Catalan, Spain
513    tzm_arab_ma = 0x045F, // Central Atlas Tamazight (Arabic), Morocco
514    ku = 0x0092, // Central Kurdish
515    ku_arab = 0x7c92, // Central Kurdish
516    ku_arab_iq = 0x0492, // Central Kurdish, Iraq
517    chr = 0x005C, // Cherokee
518    chr_cher = 0x7c5C, // Cherokee
519    chr_cher_us = 0x045C, // Cherokee, United States
520    zh_hans = 0x0004, // Chinese (Simplified)
521    zh = 0x7804, // Chinese (Simplified)
522    zh_cn = 0x0804, // Chinese (Simplified), People's Republic of China
523    zh_sg = 0x1004, // Chinese (Simplified), Singapore
524    zh_hant = 0x7C04, // Chinese (Traditional)
525    zh_hk = 0x0C04, // Chinese (Traditional), Hong Kong S.A.R.
526    zh_mo = 0x1404, // Chinese (Traditional), Macao S.A.R.
527    zh_tw = 0x0404, // Chinese (Traditional), Taiwan
528    co = 0x0083, // Corsican
529    co_fr = 0x0483, // Corsican, France
530    hr = 0x001A, // Croatian
531    hr_hr = 0x041A, // Croatian, Croatia
532    hr_ba = 0x101A, // Croatian (Latin), Bosnia and Herzegovina
533    cs = 0x0005, // Czech
534    cs_cz = 0x0405, // Czech, Czech Republic
535    da = 0x0006, // Danish
536    da_dk = 0x0406, // Danish, Denmark
537    prs = 0x008C, // Dari
538    prs_af = 0x048C, // Dari, Afghanistan
539    dv = 0x0065, // Divehi
540    dv_mv = 0x0465, // Divehi, Maldives
541    nl = 0x0013, // Dutch
542    nl_be = 0x0813, // Dutch, Belgium
543    nl_nl = 0x0413, // Dutch, Netherlands
544    dz_bt = 0x0C51, // Dzongkha, Bhutan
545    en = 0x0009, // English
546    en_au = 0x0C09, // English, Australia
547    en_bz = 0x2809, // English, Belize
548    en_ca = 0x1009, // English, Canada
549    en_029 = 0x2409, // English, Caribbean
550    en_hk = 0x3C09, // English, Hong Kong
551    en_in = 0x4009, // English, India
552    en_ie = 0x1809, // English, Ireland
553    en_jm = 0x2009, // English, Jamaica
554    en_my = 0x4409, // English, Malaysia
555    en_nz = 0x1409, // English, New Zealand
556    en_ph = 0x3409, // English, Republic of the Philippines
557    en_sg = 0x4809, // English, Singapore
558    en_za = 0x1C09, // English, South Africa
559    en_tt = 0x2c09, // English, Trinidad and Tobago
560    en_ae = 0x4C09, // English, United Arab Emirates
561    en_gb = 0x0809, // English, United Kingdom
562    en_us = 0x0409, // English, United States
563    en_zw = 0x3009, // English, Zimbabwe
564    et = 0x0025, // Estonian
565    et_ee = 0x0425, // Estonian, Estonia
566    fo = 0x0038, // Faroese
567    fo_fo = 0x0438, // Faroese, Faroe Islands
568    fil = 0x0064, // Filipino
569    fil_ph = 0x0464, // Filipino, Philippines
570    fi = 0x000B, // Finnish
571    fi_fi = 0x040B, // Finnish, Finland
572    fr = 0x000C, // French
573    fr_be = 0x080C, // French, Belgium
574    fr_cm = 0x2c0C, // French, Cameroon
575    fr_ca = 0x0c0C, // French, Canada
576    fr_029 = 0x1C0C, // French, Caribbean
577    fr_cd = 0x240C, // French, Congo, DRC
578    fr_ci = 0x300C, // French, Côte d'Ivoire
579    fr_fr = 0x040C, // French, France
580    fr_ht = 0x3c0C, // French, Haiti
581    fr_lu = 0x140C, // French, Luxembourg
582    fr_ml = 0x340C, // French, Mali
583    fr_ma = 0x380C, // French, Morocco
584    fr_mc = 0x180C, // French, Principality of Monaco
585    fr_re = 0x200C, // French, Reunion
586    fr_sn = 0x280C, // French, Senegal
587    fr_ch = 0x100C, // French, Switzerland
588    fy = 0x0062, // Frisian
589    fy_nl = 0x0462, // Frisian, Netherlands
590    ff = 0x0067, // Fulah
591    ff_latn = 0x7C67, // Fulah (Latin)
592    ff_ng = 0x0467, // Fulah, Nigeria
593    ff_latn_sn = 0x0867, // Fulah, Senegal
594    gl = 0x0056, // Galician
595    gl_es = 0x0456, // Galician, Spain
596    ka = 0x0037, // Georgian
597    ka_ge = 0x0437, // Georgian, Georgia
598    de = 0x0007, // German
599    de_at = 0x0C07, // German, Austria
600    de_de = 0x0407, // German, Germany
601    de_li = 0x1407, // German, Liechtenstein
602    de_lu = 0x1007, // German, Luxembourg
603    de_ch = 0x0807, // German, Switzerland
604    el = 0x0008, // Greek
605    el_gr = 0x0408, // Greek, Greece
606    kl = 0x006F, // Greenlandic
607    kl_gl = 0x046F, // Greenlandic, Greenland
608    gn = 0x0074, // Guarani
609    gn_py = 0x0474, // Guarani, Paraguay
610    gu = 0x0047, // Gujarati
611    gu_in = 0x0447, // Gujarati, India
612    ha = 0x0068, // Hausa (Latin)
613    ha_latn = 0x7C68, // Hausa (Latin)
614    ha_latn_ng = 0x0468, // Hausa (Latin), Nigeria
615    haw = 0x0075, // Hawaiian
616    haw_us = 0x0475, // Hawaiian, United States
617    he = 0x000D, // Hebrew
618    he_il = 0x040D, // Hebrew, Israel
619    hi = 0x0039, // Hindi
620    hi_in = 0x0439, // Hindi, India
621    hu = 0x000E, // Hungarian
622    hu_hu = 0x040E, // Hungarian, Hungary
623    is = 0x000F, // Icelandic
624    is_is = 0x040F, // Icelandic, Iceland
625    ig = 0x0070, // Igbo
626    ig_ng = 0x0470, // Igbo, Nigeria
627    id = 0x0021, // Indonesian
628    id_id = 0x0421, // Indonesian, Indonesia
629    iu = 0x005D, // Inuktitut (Latin)
630    iu_latn = 0x7C5D, // Inuktitut (Latin)
631    iu_latn_ca = 0x085D, // Inuktitut (Latin), Canada
632    iu_cans = 0x785D, // Inuktitut (Syllabics)
633    iu_cans_ca = 0x045d, // Inuktitut (Syllabics), Canada
634    ga = 0x003C, // Irish
635    ga_ie = 0x083C, // Irish, Ireland
636    it = 0x0010, // Italian
637    it_it = 0x0410, // Italian, Italy
638    it_ch = 0x0810, // Italian, Switzerland
639    ja = 0x0011, // Japanese
640    ja_jp = 0x0411, // Japanese, Japan
641    kn = 0x004B, // Kannada
642    kn_in = 0x044B, // Kannada, India
643    kr_latn_ng = 0x0471, // Kanuri (Latin), Nigeria
644    ks = 0x0060, // Kashmiri
645    ks_arab = 0x0460, // Kashmiri, Perso-Arabic
646    ks_deva_in = 0x0860, // Kashmiri (Devanagari), India
647    kk = 0x003F, // Kazakh
648    kk_kz = 0x043F, // Kazakh, Kazakhstan
649    km = 0x0053, // Khmer
650    km_kh = 0x0453, // Khmer, Cambodia
651    quc = 0x0086, // K'iche
652    quc_latn_gt = 0x0486, // K'iche, Guatemala
653    rw = 0x0087, // Kinyarwanda
654    rw_rw = 0x0487, // Kinyarwanda, Rwanda
655    sw = 0x0041, // Kiswahili
656    sw_ke = 0x0441, // Kiswahili, Kenya
657    kok = 0x0057, // Konkani
658    kok_in = 0x0457, // Konkani, India
659    ko = 0x0012, // Korean
660    ko_kr = 0x0412, // Korean, Korea
661    ky = 0x0040, // Kyrgyz
662    ky_kg = 0x0440, // Kyrgyz, Kyrgyzstan
663    lo = 0x0054, // Lao
664    lo_la = 0x0454, // Lao, Lao P.D.R.
665    la_va = 0x0476, // Latin, Vatican City
666    lv = 0x0026, // Latvian
667    lv_lv = 0x0426, // Latvian, Latvia
668    lt = 0x0027, // Lithuanian
669    lt_lt = 0x0427, // Lithuanian, Lithuania
670    dsb = 0x7C2E, // Lower Sorbian
671    dsb_de = 0x082E, // Lower Sorbian, Germany
672    lb = 0x006E, // Luxembourgish
673    lb_lu = 0x046E, // Luxembourgish, Luxembourg
674    mk = 0x002F, // Macedonian
675    mk_mk = 0x042F, // Macedonian, North Macedonia
676    ms = 0x003E, // Malay
677    ms_bn = 0x083E, // Malay, Brunei Darussalam
678    ms_my = 0x043E, // Malay, Malaysia
679    ml = 0x004C, // Malayalam
680    ml_in = 0x044C, // Malayalam, India
681    mt = 0x003A, // Maltese
682    mt_mt = 0x043A, // Maltese, Malta
683    mi = 0x0081, // Maori
684    mi_nz = 0x0481, // Maori, New Zealand
685    arn = 0x007A, // Mapudungun
686    arn_cl = 0x047A, // Mapudungun, Chile
687    mr = 0x004E, // Marathi
688    mr_in = 0x044E, // Marathi, India
689    moh = 0x007C, // Mohawk
690    moh_ca = 0x047C, // Mohawk, Canada
691    mn = 0x0050, // Mongolian (Cyrillic)
692    mn_cyrl = 0x7850, // Mongolian (Cyrillic)
693    mn_mn = 0x0450, // Mongolian (Cyrillic), Mongolia
694    mn_mong = 0x7C50, // Mongolian (Traditional Mongolian)
695    mn_mong_cn = 0x0850, // Mongolian (Traditional Mongolian), People's Republic of China
696    mn_mong_mn = 0x0C50, // Mongolian (Traditional Mongolian), Mongolia
697    ne = 0x0061, // Nepali
698    ne_in = 0x0861, // Nepali, India
699    ne_np = 0x0461, // Nepali, Nepal
700    no = 0x0014, // Norwegian (Bokmal)
701    nb = 0x7C14, // Norwegian (Bokmal)
702    nb_no = 0x0414, // Norwegian (Bokmal), Norway
703    nn = 0x7814, // Norwegian (Nynorsk)
704    nn_no = 0x0814, // Norwegian (Nynorsk), Norway
705    oc = 0x0082, // Occitan
706    oc_fr = 0x0482, // Occitan, France
707    @"or" = 0x0048, // Odia
708    or_in = 0x0448, // Odia, India
709    om = 0x0072, // Oromo
710    om_et = 0x0472, // Oromo, Ethiopia
711    ps = 0x0063, // Pashto
712    ps_af = 0x0463, // Pashto, Afghanistan
713    fa = 0x0029, // Persian
714    fa_ir = 0x0429, // Persian, Iran
715    pl = 0x0015, // Polish
716    pl_pl = 0x0415, // Polish, Poland
717    pt = 0x0016, // Portuguese
718    pt_br = 0x0416, // Portuguese, Brazil
719    pt_pt = 0x0816, // Portuguese, Portugal
720    qps_ploca = 0x05FE, // Pseudo Language, Pseudo locale for east Asian/complex script localization testing
721    qps_ploc = 0x0501, // Pseudo Language, Pseudo locale used for localization testing
722    qps_plocm = 0x09FF, // Pseudo Language, Pseudo locale used for localization testing of mirrored locales
723    pa = 0x0046, // Punjabi
724    pa_arab = 0x7C46, // Punjabi
725    pa_in = 0x0446, // Punjabi, India
726    pa_arab_pk = 0x0846, // Punjabi, Islamic Republic of Pakistan
727    quz = 0x006B, // Quechua
728    quz_bo = 0x046B, // Quechua, Bolivia
729    quz_ec = 0x086B, // Quechua, Ecuador
730    quz_pe = 0x0C6B, // Quechua, Peru
731    ro = 0x0018, // Romanian
732    ro_md = 0x0818, // Romanian, Moldova
733    ro_ro = 0x0418, // Romanian, Romania
734    rm = 0x0017, // Romansh
735    rm_ch = 0x0417, // Romansh, Switzerland
736    ru = 0x0019, // Russian
737    ru_md = 0x0819, // Russian, Moldova
738    ru_ru = 0x0419, // Russian, Russia
739    sah = 0x0085, // Sakha
740    sah_ru = 0x0485, // Sakha, Russia
741    smn = 0x703B, // Sami (Inari)
742    smn_fi = 0x243B, // Sami (Inari), Finland
743    smj = 0x7C3B, // Sami (Lule)
744    smj_no = 0x103B, // Sami (Lule), Norway
745    smj_se = 0x143B, // Sami (Lule), Sweden
746    se = 0x003B, // Sami (Northern)
747    se_fi = 0x0C3B, // Sami (Northern), Finland
748    se_no = 0x043B, // Sami (Northern), Norway
749    se_se = 0x083B, // Sami (Northern), Sweden
750    sms = 0x743B, // Sami (Skolt)
751    sms_fi = 0x203B, // Sami (Skolt), Finland
752    sma = 0x783B, // Sami (Southern)
753    sma_no = 0x183B, // Sami (Southern), Norway
754    sma_se = 0x1C3B, // Sami (Southern), Sweden
755    sa = 0x004F, // Sanskrit
756    sa_in = 0x044F, // Sanskrit, India
757    gd = 0x0091, // Scottish Gaelic
758    gd_gb = 0x0491, // Scottish Gaelic, United Kingdom
759    sr_cyrl = 0x6C1A, // Serbian (Cyrillic)
760    sr_cyrl_ba = 0x1C1A, // Serbian (Cyrillic), Bosnia and Herzegovina
761    sr_cyrl_me = 0x301A, // Serbian (Cyrillic), Montenegro
762    sr_cyrl_rs = 0x281A, // Serbian (Cyrillic), Serbia
763    sr_cyrl_cs = 0x0C1A, // Serbian (Cyrillic), Serbia and Montenegro (Former)
764    sr_latn = 0x701A, // Serbian (Latin)
765    sr = 0x7C1A, // Serbian (Latin)
766    sr_latn_ba = 0x181A, // Serbian (Latin), Bosnia and Herzegovina
767    sr_latn_me = 0x2c1A, // Serbian (Latin), Montenegro
768    sr_latn_rs = 0x241A, // Serbian (Latin), Serbia
769    sr_latn_cs = 0x081A, // Serbian (Latin), Serbia and Montenegro (Former)
770    nso = 0x006C, // Sesotho sa Leboa
771    nso_za = 0x046C, // Sesotho sa Leboa, South Africa
772    tn = 0x0032, // Setswana
773    tn_bw = 0x0832, // Setswana, Botswana
774    tn_za = 0x0432, // Setswana, South Africa
775    sd = 0x0059, // Sindhi
776    sd_arab = 0x7C59, // Sindhi
777    sd_arab_pk = 0x0859, // Sindhi, Islamic Republic of Pakistan
778    si = 0x005B, // Sinhala
779    si_lk = 0x045B, // Sinhala, Sri Lanka
780    sk = 0x001B, // Slovak
781    sk_sk = 0x041B, // Slovak, Slovakia
782    sl = 0x0024, // Slovenian
783    sl_si = 0x0424, // Slovenian, Slovenia
784    so = 0x0077, // Somali
785    so_so = 0x0477, // Somali, Somalia
786    st = 0x0030, // Sotho
787    st_za = 0x0430, // Sotho, South Africa
788    es = 0x000A, // Spanish
789    es_ar = 0x2C0A, // Spanish, Argentina
790    es_ve = 0x200A, // Spanish, Bolivarian Republic of Venezuela
791    es_bo = 0x400A, // Spanish, Bolivia
792    es_cl = 0x340A, // Spanish, Chile
793    es_co = 0x240A, // Spanish, Colombia
794    es_cr = 0x140A, // Spanish, Costa Rica
795    es_cu = 0x5c0A, // Spanish, Cuba
796    es_do = 0x1c0A, // Spanish, Dominican Republic
797    es_ec = 0x300A, // Spanish, Ecuador
798    es_sv = 0x440A, // Spanish, El Salvador
799    es_gt = 0x100A, // Spanish, Guatemala
800    es_hn = 0x480A, // Spanish, Honduras
801    es_419 = 0x580A, // Spanish, Latin America
802    es_mx = 0x080A, // Spanish, Mexico
803    es_ni = 0x4C0A, // Spanish, Nicaragua
804    es_pa = 0x180A, // Spanish, Panama
805    es_py = 0x3C0A, // Spanish, Paraguay
806    es_pe = 0x280A, // Spanish, Peru
807    es_pr = 0x500A, // Spanish, Puerto Rico
808    es_es_tradnl = 0x040A, // Spanish, Spain
809    es_es = 0x0c0A, // Spanish, Spain
810    es_us = 0x540A, // Spanish, United States
811    es_uy = 0x380A, // Spanish, Uruguay
812    sv = 0x001D, // Swedish
813    sv_fi = 0x081D, // Swedish, Finland
814    sv_se = 0x041D, // Swedish, Sweden
815    syr = 0x005A, // Syriac
816    syr_sy = 0x045A, // Syriac, Syria
817    tg = 0x0028, // Tajik (Cyrillic)
818    tg_cyrl = 0x7C28, // Tajik (Cyrillic)
819    tg_cyrl_tj = 0x0428, // Tajik (Cyrillic), Tajikistan
820    tzm = 0x005F, // Tamazight (Latin)
821    tzm_latn = 0x7C5F, // Tamazight (Latin)
822    tzm_latn_dz = 0x085F, // Tamazight (Latin), Algeria
823    ta = 0x0049, // Tamil
824    ta_in = 0x0449, // Tamil, India
825    ta_lk = 0x0849, // Tamil, Sri Lanka
826    tt = 0x0044, // Tatar
827    tt_ru = 0x0444, // Tatar, Russia
828    te = 0x004A, // Telugu
829    te_in = 0x044A, // Telugu, India
830    th = 0x001E, // Thai
831    th_th = 0x041E, // Thai, Thailand
832    bo = 0x0051, // Tibetan
833    bo_cn = 0x0451, // Tibetan, People's Republic of China
834    ti = 0x0073, // Tigrinya
835    ti_er = 0x0873, // Tigrinya, Eritrea
836    ti_et = 0x0473, // Tigrinya, Ethiopia
837    ts = 0x0031, // Tsonga
838    ts_za = 0x0431, // Tsonga, South Africa
839    tr = 0x001F, // Turkish
840    tr_tr = 0x041F, // Turkish, Turkey
841    tk = 0x0042, // Turkmen
842    tk_tm = 0x0442, // Turkmen, Turkmenistan
843    uk = 0x0022, // Ukrainian
844    uk_ua = 0x0422, // Ukrainian, Ukraine
845    hsb = 0x002E, // Upper Sorbian
846    hsb_de = 0x042E, // Upper Sorbian, Germany
847    ur = 0x0020, // Urdu
848    ur_in = 0x0820, // Urdu, India
849    ur_pk = 0x0420, // Urdu, Islamic Republic of Pakistan
850    ug = 0x0080, // Uyghur
851    ug_cn = 0x0480, // Uyghur, People's Republic of China
852    uz_cyrl = 0x7843, // Uzbek (Cyrillic)
853    uz_cyrl_uz = 0x0843, // Uzbek (Cyrillic), Uzbekistan
854    uz = 0x0043, // Uzbek (Latin)
855    uz_latn = 0x7C43, // Uzbek (Latin)
856    uz_latn_uz = 0x0443, // Uzbek (Latin), Uzbekistan
857    ca_es_valencia = 0x0803, // Valencian, Spain
858    ve = 0x0033, // Venda
859    ve_za = 0x0433, // Venda, South Africa
860    vi = 0x002A, // Vietnamese
861    vi_vn = 0x042A, // Vietnamese, Vietnam
862    cy = 0x0052, // Welsh
863    cy_gb = 0x0452, // Welsh, United Kingdom
864    wo = 0x0088, // Wolof
865    wo_sn = 0x0488, // Wolof, Senegal
866    xh = 0x0034, // Xhosa
867    xh_za = 0x0434, // Xhosa, South Africa
868    ii = 0x0078, // Yi
869    ii_cn = 0x0478, // Yi, People's Republic of China
870    yi_001 = 0x043D, // Yiddish, World
871    yo = 0x006A, // Yoruba
872    yo_ng = 0x046A, // Yoruba, Nigeria
873    zu = 0x0035, // Zulu
874    zu_za = 0x0435, // Zulu, South Africa
875
876    /// Special case
877    x_iv_mathan = 0x007F, // LANG_INVARIANT, "math alphanumeric sorting"
878};