master
  1#include <iconv.h>
  2#include <errno.h>
  3#include <wchar.h>
  4#include <string.h>
  5#include <stdlib.h>
  6#include <limits.h>
  7#include <stdint.h>
  8#include "locale_impl.h"
  9
/* Type codes for the character sets that iconv() handles with dedicated
 * code instead of an 8-bit mapping table. Each value is stored in
 * charmaps as the byte following the corresponding alias list. All of
 * them are above 0200, which is what find_charmap() tests to tell a
 * type code apart from a table header byte.
 *
 * For the UTF-16/UCS-2 codes, bit 0 selects the byte order consumed by
 * get_16()/put_16() (clear = big endian, set = little endian); for
 * UTF_32BE/UTF_32LE the low two bits play the same role for
 * get_32()/put_32().
 *
 * iconv_open() rejects codes 0330 and above as conversion targets. */
#define UTF_32BE    0300
#define UTF_16LE    0301
#define UTF_16BE    0302
#define UTF_32LE    0303
#define UCS2BE      0304
#define UCS2LE      0305
#define WCHAR_T     0306
#define US_ASCII    0307
#define UTF_8       0310
#define UTF_16      0312
#define UTF_32      0313
#define UCS2        0314
#define EUC_JP      0320
#define SHIFT_JIS   0321
#define ISO2022_JP  0322
#define GB18030     0330
#define GBK         0331
#define GB2312      0332
#define BIG5        0340
#define EUC_KR      0350
 30
/* Definitions of charmaps. Each charmap consists of:
 * 1. Empty-string-terminated list of null-terminated aliases.
 * 2. Special type code or number of elided quads of entries.
 * 3. Character table (size determined by field 2), consisting
 *    of 5 bytes for every 4 characters, interpreted as 10-bit
 *    indices into the legacy_chars table.
 *
 * The aliases listed here are lowercase with punctuation removed,
 * which is the form fuzzycmp() compares caller-supplied names
 * against. Entries whose field 2 is a special type code (a value
 * above 0200) carry no character table. The first alias ("utf8") is
 * also what find_charmap() substitutes for an empty name. The whole
 * list ends at the first alias position holding a zero byte. */

static const unsigned char charmaps[] =
"utf8\0char\0\0\310"
"wchart\0\0\306"
"ucs2be\0\0\304"
"ucs2le\0\0\305"
"utf16be\0\0\302"
"utf16le\0\0\301"
"ucs4be\0utf32be\0\0\300"
"ucs4le\0utf32le\0\0\303"
"ascii\0usascii\0iso646\0iso646us\0\0\307"
"utf16\0\0\312"
"ucs4\0utf32\0\0\313"
"ucs2\0\0\314"
"eucjp\0\0\320"
"shiftjis\0sjis\0cp932\0\0\321"
"iso2022jp\0\0\322"
"gb18030\0\0\330"
"gbk\0\0\331"
"gb2312\0\0\332"
"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
#include "codepages.h"
;
 61
/* Table of characters that appear in legacy 8-bit codepages,
 * limited to 1024 slots (10 bit indices). The first 256 entries
 * are elided since those characters are obviously all included.
 * Accordingly legacy_map() returns a 10-bit index below 256 as is
 * and reads legacy_chars[index-256] for anything larger. */
static const unsigned short legacy_chars[] = {
#include "legacychars.h"
};
 68
/* JIS X 0208 decoding table. iconv() indexes it as [row][column], where
 * both indices are the two code bytes minus 0xa1 (EUC-JP) or minus 0x21
 * (ISO-2022-JP), or are derived arithmetically from a Shift_JIS byte
 * pair. The stored value is used as the decoded character; a zero entry
 * is treated as an invalid sequence. */
static const unsigned short jis0208[84][94] = {
#include "jis0208.h"
};
 72
/* Two-byte GB18030/GBK/GB2312 decoding table. The row is the lead byte
 * minus 0x81; the column is the trail byte minus 0x40, reduced by one
 * more for trail bytes above 0x7f so that the unused 0x7f slot takes no
 * space. iconv() also scans the whole table when decoding four-byte
 * GB18030 sequences, to count how many values at or below the candidate
 * are already covered by two-byte codes. */
static const unsigned short gb18030[126][190] = {
#include "gb18030.h"
};
 76
/* Big5 decoding table for lead bytes 0xa1..0xf9. The row is the lead
 * byte minus 0xa1; the column is the trail byte minus 0x40, with a
 * further 0x22 subtracted for trail bytes of 0xa1 and above so the gap
 * between 0x7e and 0xa1 is squeezed out. A zero entry is treated as an
 * invalid sequence. */
static const unsigned short big5[89][157] = {
#include "big5.h"
};
 80
/* HKSCS extension data for Big5 lead bytes outside 0xa1..0xf9. iconv()
 * reads it in two parts: entries [0, 4867) are indexed by row*157+column
 * and give the low 16 bits of the character; from index 4867 onward the
 * entries form a bit array (16 bits per entry, same index) whose bit is
 * placed at bit 17 of the result. Values of the form 0xdcNN are not
 * characters; iconv() replaces them with hard-coded two-character
 * sequences. */
static const unsigned short hkscs[] = {
#include "hkscs.h"
};
 84
/* EUC-KR decoding table, indexed by lead byte minus 0xa1 and trail byte
 * minus 0xa1. A zero entry is treated as an invalid sequence. iconv()
 * also scans the whole table when decoding byte pairs outside this grid,
 * to skip over syllables that already have a slot here. */
static const unsigned short ksc[93][94] = {
#include "ksc.h"
};
 88
/* Reverse index for jis0208, searched by uni_to_jis(). Each entry j
 * names the cell jis0208[j/256][j%256]. The binary search in
 * uni_to_jis() presumes the entries are ordered by the character stored
 * in that cell (ascending) -- TODO confirm against the generator of
 * revjis.h. */
static const unsigned short rev_jis[] = {
#include "revjis.h"
};
 92
 93static int fuzzycmp(const unsigned char *a, const unsigned char *b)
 94{
 95	for (; *a && *b; a++, b++) {
 96		while (*a && (*a|32U)-'a'>26 && *a-'0'>10U) a++;
 97		if ((*a|32U) != *b) return 1;
 98	}
 99	return *a != *b;
100}
101
102static size_t find_charmap(const void *name)
103{
104	const unsigned char *s;
105	if (!*(char *)name) name=charmaps; /* "utf8" */
106	for (s=charmaps; *s; ) {
107		if (!fuzzycmp(name, s)) {
108			for (; *s; s+=strlen((void *)s)+1);
109			return s+1-charmaps;
110		}
111		s += strlen((void *)s)+1;
112		if (!*s) {
113			if (s[1] > 0200) s+=2;
114			else s+=2+(64U-s[1])*5;
115		}
116	}
117	return -1;
118}
119
/* Heap-allocated conversion descriptor, used when the source encoding
 * needs state carried from one iconv() call to the next. iconv_open()
 * hands out a pointer to one of these in place of the packed integer
 * descriptor; iconv() tells the two apart by the low bit, which is
 * always set in a packed descriptor and is expected to be clear in the
 * address of this struct. */
struct stateful_cd {
	/* Packed to/from descriptor as built by combine_to_from(). */
	iconv_t base_cd;
	/* Decoder state; 0 is the initial state set by iconv_open(). */
	unsigned state;
};
124
/* Pack two charmaps offsets into a pointer-sized descriptor: bit 0 is
 * set as a tag, the destination offset t starts at bit 1 and the source
 * offset f starts at bit 16. */
static iconv_t combine_to_from(size_t t, size_t f)
{
	size_t packed = 1;

	packed |= t << 1;
	packed |= f << 16;
	return (void *)packed;
}
129
/* Recover the source charmaps offset (bits 16 and up) from a packed
 * descriptor. */
static size_t extract_from(iconv_t cd)
{
	size_t packed = (size_t)cd;

	return packed >> 16;
}
134
/* Recover the destination charmaps offset (the 15 bits above the tag
 * bit) from a packed descriptor. */
static size_t extract_to(iconv_t cd)
{
	size_t packed = (size_t)cd;

	return (packed & 0xfffe) >> 1;
}
139
140iconv_t iconv_open(const char *to, const char *from)
141{
142	size_t f, t;
143	struct stateful_cd *scd;
144
145	if ((t = find_charmap(to))==-1
146	 || (f = find_charmap(from))==-1
147	 || (charmaps[t] >= 0330)) {
148		errno = EINVAL;
149		return (iconv_t)-1;
150	}
151	iconv_t cd = combine_to_from(t, f);
152
153	switch (charmaps[f]) {
154	case UTF_16:
155	case UTF_32:
156	case UCS2:
157	case ISO2022_JP:
158		scd = malloc(sizeof *scd);
159		if (!scd) return (iconv_t)-1;
160		scd->base_cd = cd;
161		scd->state = 0;
162		cd = (iconv_t)scd;
163	}
164
165	return cd;
166}
167
/* Read a 16-bit value from s. Only bit 0 of e matters: clear reads
 * big endian, set reads little endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	int hi = e & 1;
	unsigned value = s[hi];

	value = value << 8 | s[hi ^ 1];
	return value;
}
173
/* Store the low 16 bits of c at s. Only bit 0 of e matters: clear
 * writes big endian, set writes little endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	int hi = e & 1;

	s[hi ^ 1] = c & 0xff;
	s[hi] = (c >> 8) & 0xff;
}
180
/* Read a 32-bit value from s. Only the low two bits of e matter; they
 * are XORed into each byte index, so 0 reads big endian and 3 reads
 * little endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	unsigned value = 0;

	e &= 3;
	for (int i = 0; i < 4; i++)
		value = value << 8 | s[e ^ i];
	return value;
}
186
/* Store c as four bytes at s. Only the low two bits of e matter; they
 * are XORed into each byte index, so 0 writes big endian and 3 writes
 * little endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	for (int i = 0; i < 4; i++)
		s[e ^ i] = (c >> (24 - 8*i)) & 0xff;
}
195
/* Adapt as needed */
/* UTF-8 decode/encode primitives used by iconv(). Here they are simply
 * the standard multibyte functions; iconv() installs UTF8_LOCALE as the
 * current locale around its conversion loop, presumably so that these
 * act as UTF-8 codecs whatever locale the caller has selected. */
#define mbrtowc_utf8 mbrtowc
#define wctomb_utf8 wctomb
199
200static unsigned legacy_map(const unsigned char *map, unsigned c)
201{
202	if (c < 4*map[-1]) return c;
203	unsigned x = c - 4*map[-1];
204	x = map[x*5/4]>>2*x%8 | map[x*5/4+1]<<8-2*x%8 & 1023;
205	return x < 256 ? x : legacy_chars[x-256];
206}
207
208static unsigned uni_to_jis(unsigned c)
209{
210	unsigned nel = sizeof rev_jis / sizeof *rev_jis;
211	unsigned d, j, i, b = 0;
212	for (;;) {
213		i = nel/2;
214		j = rev_jis[b+i];
215		d = jis0208[j/256][j%256];
216		if (d==c) return j + 0x2121;
217		else if (nel == 1) return 0;
218		else if (c < d)
219			nel /= 2;
220		else {
221			b += i;
222			nel -= nel/2;
223		}
224	}
225}
226
227size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
228{
229	size_t x=0;
230	struct stateful_cd *scd=0;
231	if (!((size_t)cd & 1)) {
232		scd = (void *)cd;
233		cd = scd->base_cd;
234	}
235	unsigned to = extract_to(cd);
236	unsigned from = extract_from(cd);
237	const unsigned char *map = charmaps+from+1;
238	const unsigned char *tomap = charmaps+to+1;
239	mbstate_t st = {0};
240	wchar_t wc;
241	unsigned c, d;
242	size_t k, l;
243	int err;
244	unsigned char type = map[-1];
245	unsigned char totype = tomap[-1];
246	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
247
248	if (!in || !*in || !*inb) return 0;
249
250	*ploc = UTF8_LOCALE;
251
252	for (; *inb; *in+=l, *inb-=l) {
253		c = *(unsigned char *)*in;
254		l = 1;
255
256		switch (type) {
257		case UTF_8:
258			if (c < 128) break;
259			l = mbrtowc_utf8(&wc, *in, *inb, &st);
260			if (l == (size_t)-1) goto ilseq;
261			if (l == (size_t)-2) goto starved;
262			c = wc;
263			break;
264		case US_ASCII:
265			if (c >= 128) goto ilseq;
266			break;
267		case WCHAR_T:
268			l = sizeof(wchar_t);
269			if (*inb < l) goto starved;
270			c = *(wchar_t *)*in;
271			if (0) {
272		case UTF_32BE:
273		case UTF_32LE:
274			l = 4;
275			if (*inb < 4) goto starved;
276			c = get_32((void *)*in, type);
277			}
278			if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
279			break;
280		case UCS2BE:
281		case UCS2LE:
282		case UTF_16BE:
283		case UTF_16LE:
284			l = 2;
285			if (*inb < 2) goto starved;
286			c = get_16((void *)*in, type);
287			if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
288			if ((unsigned)(c-0xd800) < 0x400) {
289				if (type-UCS2BE < 2U) goto ilseq;
290				l = 4;
291				if (*inb < 4) goto starved;
292				d = get_16((void *)(*in + 2), type);
293				if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
294				c = ((c-0xd7c0)<<10) + (d-0xdc00);
295			}
296			break;
297		case UCS2:
298		case UTF_16:
299			l = 0;
300			if (!scd->state) {
301				if (*inb < 2) goto starved;
302				c = get_16((void *)*in, 0);
303				scd->state = type==UCS2
304					? c==0xfffe ? UCS2LE : UCS2BE
305					: c==0xfffe ? UTF_16LE : UTF_16BE;
306				if (c == 0xfffe || c == 0xfeff)
307					l = 2;
308			}
309			type = scd->state;
310			continue;
311		case UTF_32:
312			l = 0;
313			if (!scd->state) {
314				if (*inb < 4) goto starved;
315				c = get_32((void *)*in, 0);
316				scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
317				if (c == 0xfffe0000 || c == 0xfeff)
318					l = 4;
319			}
320			type = scd->state;
321			continue;
322		case SHIFT_JIS:
323			if (c < 128) break;
324			if (c-0xa1 <= 0xdf-0xa1) {
325				c += 0xff61-0xa1;
326				break;
327			}
328			l = 2;
329			if (*inb < 2) goto starved;
330			d = *((unsigned char *)*in + 1);
331			if (c-129 <= 159-129) c -= 129;
332			else if (c-224 <= 239-224) c -= 193;
333			else goto ilseq;
334			c *= 2;
335			if (d-64 <= 158-64) {
336				if (d==127) goto ilseq;
337				if (d>127) d--;
338				d -= 64;
339			} else if (d-159 <= 252-159) {
340				c++;
341				d -= 159;
342			}
343			c = jis0208[c][d];
344			if (!c) goto ilseq;
345			break;
346		case EUC_JP:
347			if (c < 128) break;
348			l = 2;
349			if (*inb < 2) goto starved;
350			d = *((unsigned char *)*in + 1);
351			if (c==0x8e) {
352				c = d;
353				if (c-0xa1 > 0xdf-0xa1) goto ilseq;
354				c += 0xff61 - 0xa1;
355				break;
356			}
357			c -= 0xa1;
358			d -= 0xa1;
359			if (c >= 84 || d >= 94) goto ilseq;
360			c = jis0208[c][d];
361			if (!c) goto ilseq;
362			break;
363		case ISO2022_JP:
364			if (c >= 128) goto ilseq;
365			if (c == '\033') {
366				l = 3;
367				if (*inb < 3) goto starved;
368				c = *((unsigned char *)*in + 1);
369				d = *((unsigned char *)*in + 2);
370				if (c != '(' && c != '$') goto ilseq;
371				switch (128*(c=='$') + d) {
372				case 'B': scd->state=0; continue;
373				case 'J': scd->state=1; continue;
374				case 'I': scd->state=4; continue;
375				case 128+'@': scd->state=2; continue;
376				case 128+'B': scd->state=3; continue;
377				}
378				goto ilseq;
379			}
380			switch (scd->state) {
381			case 1:
382				if (c=='\\') c = 0xa5;
383				if (c=='~') c = 0x203e;
384				break;
385			case 2:
386			case 3:
387				l = 2;
388				if (*inb < 2) goto starved;
389				d = *((unsigned char *)*in + 1);
390				c -= 0x21;
391				d -= 0x21;
392				if (c >= 84 || d >= 94) goto ilseq;
393				c = jis0208[c][d];
394				if (!c) goto ilseq;
395				break;
396			case 4:
397				if (c-0x60 < 0x1f) goto ilseq;
398				if (c-0x21 < 0x5e) c += 0xff61-0x21;
399				break;
400			}
401			break;
402		case GB2312:
403			if (c < 128) break;
404			if (c < 0xa1) goto ilseq;
405		case GBK:
406		case GB18030:
407			if (c < 128) break;
408			c -= 0x81;
409			if (c >= 126) goto ilseq;
410			l = 2;
411			if (*inb < 2) goto starved;
412			d = *((unsigned char *)*in + 1);
413			if (d < 0xa1 && type == GB2312) goto ilseq;
414			if (d-0x40>=191 || d==127) {
415				if (d-'0'>9 || type != GB18030)
416					goto ilseq;
417				l = 4;
418				if (*inb < 4) goto starved;
419				c = (10*c + d-'0') * 1260;
420				d = *((unsigned char *)*in + 2);
421				if (d-0x81>126) goto ilseq;
422				c += 10*(d-0x81);
423				d = *((unsigned char *)*in + 3);
424				if (d-'0'>9) goto ilseq;
425				c += d-'0';
426				c += 128;
427				for (d=0; d<=c; ) {
428					k = 0;
429					for (int i=0; i<126; i++)
430						for (int j=0; j<190; j++)
431							if (gb18030[i][j]-d <= c-d)
432								k++;
433					d = c+1;
434					c += k;
435				}
436				break;
437			}
438			d -= 0x40;
439			if (d>63) d--;
440			c = gb18030[c][d];
441			break;
442		case BIG5:
443			if (c < 128) break;
444			l = 2;
445			if (*inb < 2) goto starved;
446			d = *((unsigned char *)*in + 1);
447			if (d-0x40>=0xff-0x40 || d-0x7f<0xa1-0x7f) goto ilseq;
448			d -= 0x40;
449			if (d > 0x3e) d -= 0x22;
450			if (c-0xa1>=0xfa-0xa1) {
451				if (c-0x87>=0xff-0x87) goto ilseq;
452				if (c < 0xa1) c -= 0x87;
453				else c -= 0x87 + (0xfa-0xa1);
454				c = (hkscs[4867+(c*157+d)/16]>>(c*157+d)%16)%2<<17
455					| hkscs[c*157+d];
456				/* A few HKSCS characters map to pairs of UCS
457				 * characters. These are mapped to surrogate
458				 * range in the hkscs table then hard-coded
459				 * here. Ugly, yes. */
460				if (c/256 == 0xdc) {
461					union {
462						char c[8];
463						wchar_t wc[2];
464					} tmp;
465					char *ptmp = tmp.c;
466					size_t tmpx = iconv(combine_to_from(to, find_charmap("utf8")),
467						&(char *){"\303\212\314\204"
468						"\303\212\314\214"
469						"\303\252\314\204"
470						"\303\252\314\214"
471						+c%256}, &(size_t){4},
472						&ptmp, &(size_t){sizeof tmp});
473					size_t tmplen = ptmp - tmp.c;
474					if (tmplen > *outb) goto toobig;
475					if (tmpx) x++;
476					memcpy(*out, &tmp, tmplen);
477					*out += tmplen;
478					*outb -= tmplen;
479					continue;
480				}
481				if (!c) goto ilseq;
482				break;
483			}
484			c -= 0xa1;
485			c = big5[c][d]|(c==0x27&&(d==0x3a||d==0x3c||d==0x42))<<17;
486			if (!c) goto ilseq;
487			break;
488		case EUC_KR:
489			if (c < 128) break;
490			l = 2;
491			if (*inb < 2) goto starved;
492			d = *((unsigned char *)*in + 1);
493			c -= 0xa1;
494			d -= 0xa1;
495			if (c >= 93 || d >= 94) {
496				c += (0xa1-0x81);
497				d += 0xa1;
498				if (c > 0xc6-0x81 || c==0xc6-0x81 && d>0x52)
499					goto ilseq;
500				if (d-'A'<26) d = d-'A';
501				else if (d-'a'<26) d = d-'a'+26;
502				else if (d-0x81<0xff-0x81) d = d-0x81+52;
503				else goto ilseq;
504				if (c < 0x20) c = 178*c + d;
505				else c = 178*0x20 + 84*(c-0x20) + d;
506				c += 0xac00;
507				for (d=0xac00; d<=c; ) {
508					k = 0;
509					for (int i=0; i<93; i++)
510						for (int j=0; j<94; j++)
511							if (ksc[i][j]-d <= c-d)
512								k++;
513					d = c+1;
514					c += k;
515				}
516				break;
517			}
518			c = ksc[c][d];
519			if (!c) goto ilseq;
520			break;
521		default:
522			if (!c) break;
523			c = legacy_map(map, c);
524			if (!c) goto ilseq;
525		}
526
527		switch (totype) {
528		case WCHAR_T:
529			if (*outb < sizeof(wchar_t)) goto toobig;
530			*(wchar_t *)*out = c;
531			*out += sizeof(wchar_t);
532			*outb -= sizeof(wchar_t);
533			break;
534		case UTF_8:
535			if (*outb < 4) {
536				char tmp[4];
537				k = wctomb_utf8(tmp, c);
538				if (*outb < k) goto toobig;
539				memcpy(*out, tmp, k);
540			} else k = wctomb_utf8(*out, c);
541			/* This failure condition should be unreachable, but
542			 * is included to prevent decoder bugs from translating
543			 * into advancement outside the output buffer range. */
544			if (k>4) goto ilseq;
545			*out += k;
546			*outb -= k;
547			break;
548		case US_ASCII:
549			if (c > 0x7f) subst: x++, c='*';
550		default:
551			if (*outb < 1) goto toobig;
552			if (c<256 && c==legacy_map(tomap, c)) {
553			revout:
554				if (*outb < 1) goto toobig;
555				*(*out)++ = c;
556				*outb -= 1;
557				break;
558			}
559			d = c;
560			for (c=4*totype; c<256; c++) {
561				if (d == legacy_map(tomap, c)) {
562					goto revout;
563				}
564			}
565			goto subst;
566		case SHIFT_JIS:
567			if (c < 128) goto revout;
568			if (c == 0xa5) {
569				x++;
570				c = '\\';
571				goto revout;
572			}
573			if (c == 0x203e) {
574				x++;
575				c = '~';
576				goto revout;
577			}
578			if (c-0xff61 <= 0xdf-0xa1) {
579				c += 0xa1 - 0xff61;
580				goto revout;
581			}
582			c = uni_to_jis(c);
583			if (!c) goto subst;
584			if (*outb < 2) goto toobig;
585			d = c%256;
586			c = c/256;
587			*(*out)++ = (c+1)/2 + (c<95 ? 112 : 176);
588			*(*out)++ = c%2 ? d + 31 + d/96 : d + 126;
589			*outb -= 2;
590			break;
591		case EUC_JP:
592			if (c < 128) goto revout;
593			if (c-0xff61 <= 0xdf-0xa1) {
594				c += 0x0e00 + 0x21 - 0xff61;
595			} else {
596				c = uni_to_jis(c);
597			}
598			if (!c) goto subst;
599			if (*outb < 2) goto toobig;
600			*(*out)++ = c/256 + 0x80;
601			*(*out)++ = c%256 + 0x80;
602			*outb -= 2;
603			break;
604		case ISO2022_JP:
605			if (c < 128) goto revout;
606			if (c-0xff61 <= 0xdf-0xa1 || c==0xa5 || c==0x203e) {
607				if (*outb < 7) goto toobig;
608				*(*out)++ = '\033';
609				*(*out)++ = '(';
610				if (c==0xa5) {
611					*(*out)++ = 'J';
612					*(*out)++ = '\\';
613				} else if (c==0x203e) {
614					*(*out)++ = 'J';
615					*(*out)++ = '~';
616				} else {
617					*(*out)++ = 'I';
618					*(*out)++ = c-0xff61+0x21;
619				}
620				*(*out)++ = '\033';
621				*(*out)++ = '(';
622				*(*out)++ = 'B';
623				*outb -= 7;
624				break;
625			}
626			c = uni_to_jis(c);
627			if (!c) goto subst;
628			if (*outb < 8) goto toobig;
629			*(*out)++ = '\033';
630			*(*out)++ = '$';
631			*(*out)++ = 'B';
632			*(*out)++ = c/256;
633			*(*out)++ = c%256;
634			*(*out)++ = '\033';
635			*(*out)++ = '(';
636			*(*out)++ = 'B';
637			*outb -= 8;
638			break;
639		case UCS2:
640			totype = UCS2BE;
641		case UCS2BE:
642		case UCS2LE:
643		case UTF_16:
644		case UTF_16BE:
645		case UTF_16LE:
646			if (c < 0x10000 || totype-UCS2BE < 2U) {
647				if (c >= 0x10000) c = 0xFFFD;
648				if (*outb < 2) goto toobig;
649				put_16((void *)*out, c, totype);
650				*out += 2;
651				*outb -= 2;
652				break;
653			}
654			if (*outb < 4) goto toobig;
655			c -= 0x10000;
656			put_16((void *)*out, (c>>10)|0xd800, totype);
657			put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
658			*out += 4;
659			*outb -= 4;
660			break;
661		case UTF_32:
662			totype = UTF_32BE;
663		case UTF_32BE:
664		case UTF_32LE:
665			if (*outb < 4) goto toobig;
666			put_32((void *)*out, c, totype);
667			*out += 4;
668			*outb -= 4;
669			break;
670		}
671	}
672	*ploc = loc;
673	return x;
674ilseq:
675	err = EILSEQ;
676	x = -1;
677	goto end;
678toobig:
679	err = E2BIG;
680	x = -1;
681	goto end;
682starved:
683	err = EINVAL;
684	x = -1;
685end:
686	errno = err;
687	*ploc = loc;
688	return x;
689}