1: /* Determine a canonical name for the current locale's character encoding.
2:
3: Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc.
4:
5: This program is free software; you can redistribute it and/or modify it
6: under the terms of the GNU Library General Public License as published
7: by the Free Software Foundation; either version 2, or (at your option)
8: any later version.
9:
10: This program is distributed in the hope that it will be useful,
11: but WITHOUT ANY WARRANTY; without even the implied warranty of
12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: Library General Public License for more details.
14:
15: You should have received a copy of the GNU Library General Public License
16: along with this program; if not, see <https://www.gnu.org/licenses/>. */
17:
18: /* Written by Bruno Haible <bruno@clisp.org>. */
19:
20: #include <config.h>
21:
22: /* Specification. */
23: #include "localcharset.h"
24:
25: #include <stddef.h>
26: #include <stdio.h>
27: #include <string.h>
28: #include <stdlib.h>
29:
30: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
31: # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
32: #endif
33:
34: #if defined _WIN32 && !defined __CYGWIN__
35: # define WINDOWS_NATIVE
36: # include <locale.h>
37: #endif
38:
39: #if defined __EMX__
40: /* Assume EMX program runs on OS/2, even if compiled under DOS. */
41: # ifndef OS2
42: # define OS2
43: # endif
44: #endif
45:
46: #if !defined WINDOWS_NATIVE
47: # if HAVE_LANGINFO_CODESET
48: # include <langinfo.h>
49: # else
50: # if 0 /* see comment regarding use of setlocale(), below */
51: # include <locale.h>
52: # endif
53: # endif
54: # ifdef __CYGWIN__
55: # define WIN32_LEAN_AND_MEAN
56: # include <windows.h>
57: # endif
58: #elif defined WINDOWS_NATIVE
59: # define WIN32_LEAN_AND_MEAN
60: # include <windows.h>
61: #endif
62: #if defined OS2
63: # define INCL_DOS
64: # include <os2.h>
65: #endif
66:
67: /* For MB_CUR_MAX_L */
68: #if defined DARWIN7
69: # include <xlocale.h>
70: #endif
71:
72:
73: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
74:
75: /* On these platforms, we use a mapping from non-canonical encoding name
76: to GNU canonical encoding name. */
77:
78: /* With glibc-2.1 or newer, we don't need any canonicalization,
79: because glibc has iconv and both glibc and libiconv support all
80: GNU canonical names directly. */
81: # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
82:
/* One row of the alias table.  Both names are stored inline in the row
   (no pointers), so each one is limited to 11 characters plus the
   terminating NUL byte.  */
struct table_entry
{
  /* Encoding name as reported by the platform.  */
  char const alias[12];
  /* GNU canonical encoding name that the alias stands for.  */
  char const canonical[12];
};
88:
/* Table of platform-dependent mappings, sorted in ascending order.

   Each row maps an encoding name as reported by the platform ('alias')
   to the corresponding GNU canonical name ('canonical').

   locale_charset() looks names up with a binary search that compares
   the 'alias' field using strcmp(), so the rows of every platform
   section must stay sorted in strcmp() order (digits before upper-case
   letters before lower-case letters in ASCII).

   The sections below are written for the case that at most one of the
   platform conditions holds: the last row of each section carries no
   trailing comma.  A section that contributes rows also defines
   'alias_table_defined'; locale_charset() only performs the lookup when
   that macro is defined.

   Rows that are inside comments are disabled.  */
static const struct table_entry alias_table[] =
{
# if defined __FreeBSD__ /* FreeBSD */
  /*{ "ARMSCII-8", "ARMSCII-8" },*/
  { "Big5", "BIG5" },
  { "Big5HKSCS", "BIG5-HKSCS" },
  { "C", "ASCII" },
  /*{ "CP1131", "CP1131" },*/
  /*{ "CP1251", "CP1251" },*/
  /*{ "CP866", "CP866" },*/
  /*{ "GB18030", "GB18030" },*/
  /*{ "GB2312", "GB2312" },*/
  /*{ "GBK", "GBK" },*/
  /*{ "ISCII-DEV", "?" },*/
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-13", "ISO-8859-13" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-9", "ISO-8859-9" },
  /*{ "KOI8-R", "KOI8-R" },*/
  /*{ "KOI8-U", "KOI8-U" },*/
  { "SJIS", "SHIFT_JIS" },
  { "US-ASCII", "ASCII" },
  { "eucCN", "GB2312" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" }
# define alias_table_defined
# endif
# if defined __NetBSD__ /* NetBSD */
  { "646", "ASCII" },
  /*{ "ARMSCII-8", "ARMSCII-8" },*/
  /*{ "BIG5", "BIG5" },*/
  { "Big5-HKSCS", "BIG5-HKSCS" },
  /*{ "CP1251", "CP1251" },*/
  /*{ "CP866", "CP866" },*/
  /*{ "GB18030", "GB18030" },*/
  /*{ "GB2312", "GB2312" },*/
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-13", "ISO-8859-13" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  /*{ "KOI8-R", "KOI8-R" },*/
  /*{ "KOI8-U", "KOI8-U" },*/
  /*{ "PT154", "PT154" },*/
  { "SJIS", "SHIFT_JIS" },
  { "eucCN", "GB2312" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" },
  { "eucTW", "EUC-TW" }
# define alias_table_defined
# endif
# if defined __OpenBSD__ /* OpenBSD */
  { "646", "ASCII" },
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-13", "ISO-8859-13" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" }
# define alias_table_defined
# endif
# if defined __APPLE__ && defined __MACH__ /* Mac OS X */
  /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
     useless:
     - It returns the empty string when LANG is set to a locale of the
       form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
       LC_CTYPE file.
     - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
       the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
     - The documentation says:
         "... all code that calls BSD system routines should ensure
          that the const *char parameters of these routines are in UTF-8
          encoding. All BSD system functions expect their string
          parameters to be in UTF-8 encoding and nothing else."
       It also says
         "An additional caveat is that string parameters for files,
          paths, and other file-system entities must be in canonical
          UTF-8. In a canonical UTF-8 Unicode string, all decomposable
          characters are decomposed ..."
       but this is not true: You can pass non-decomposed UTF-8 strings
       to file system functions, and it is the OS which will convert
       them to decomposed UTF-8 before accessing the file system.
     - The Apple Terminal application displays UTF-8 by default.
     - However, other applications are free to use different encodings:
       - xterm uses ISO-8859-1 by default.
       - TextEdit uses MacRoman by default.
     We prefer UTF-8 over decomposed UTF-8-MAC because one should
     minimize the use of decomposed Unicode. Unfortunately, through the
     Darwin file system, decomposed UTF-8 strings are leaked into user
     space nevertheless.
     Then there are also the locales with encodings other than US-ASCII
     and UTF-8. These locales can be occasionally useful to users (e.g.
     when grepping through ISO-8859-1 encoded text files), when all their
     file names are in US-ASCII.
   */
  /* NOTE(review): unlike the BSD sections, the rows that map a name to
     itself are active here.  Presumably this is because locale_charset()
     replaces every name that is not found in the table by "UTF-8" on
     this platform - confirm before removing any of them.  */
  { "ARMSCII-8", "ARMSCII-8" },
  { "Big5", "BIG5" },
  { "Big5HKSCS", "BIG5-HKSCS" },
  { "CP1131", "CP1131" },
  { "CP1251", "CP1251" },
  { "CP866", "CP866" },
  { "CP949", "CP949" },
  { "GB18030", "GB18030" },
  { "GB2312", "GB2312" },
  { "GBK", "GBK" },
  /*{ "ISCII-DEV", "?" },*/
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-13", "ISO-8859-13" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-9", "ISO-8859-9" },
  { "KOI8-R", "KOI8-R" },
  { "KOI8-U", "KOI8-U" },
  { "PT154", "PT154" },
  { "SJIS", "SHIFT_JIS" },
  { "eucCN", "GB2312" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" }
# define alias_table_defined
# endif
# if defined _AIX /* AIX */
  /*{ "GBK", "GBK" },*/
  { "IBM-1046", "CP1046" },
  { "IBM-1124", "CP1124" },
  { "IBM-1129", "CP1129" },
  { "IBM-1252", "CP1252" },
  { "IBM-850", "CP850" },
  { "IBM-856", "CP856" },
  { "IBM-921", "ISO-8859-13" },
  { "IBM-922", "CP922" },
  { "IBM-932", "CP932" },
  { "IBM-943", "CP943" },
  { "IBM-eucCN", "GB2312" },
  { "IBM-eucJP", "EUC-JP" },
  { "IBM-eucKR", "EUC-KR" },
  { "IBM-eucTW", "EUC-TW" },
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-6", "ISO-8859-6" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-8", "ISO-8859-8" },
  { "ISO8859-9", "ISO-8859-9" },
  { "TIS-620", "TIS-620" },
  /*{ "UTF-8", "UTF-8" },*/
  { "big5", "BIG5" }
# define alias_table_defined
# endif
# if defined __hpux /* HP-UX */
  { "SJIS", "SHIFT_JIS" },
  { "arabic8", "HP-ARABIC8" },
  { "big5", "BIG5" },
  { "cp1251", "CP1251" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" },
  { "eucTW", "EUC-TW" },
  { "gb18030", "GB18030" },
  { "greek8", "HP-GREEK8" },
  { "hebrew8", "HP-HEBREW8" },
  { "hkbig5", "BIG5-HKSCS" },
  { "hp15CN", "GB2312" },
  { "iso88591", "ISO-8859-1" },
  { "iso885913", "ISO-8859-13" },
  { "iso885915", "ISO-8859-15" },
  { "iso88592", "ISO-8859-2" },
  { "iso88594", "ISO-8859-4" },
  { "iso88595", "ISO-8859-5" },
  { "iso88596", "ISO-8859-6" },
  { "iso88597", "ISO-8859-7" },
  { "iso88598", "ISO-8859-8" },
  { "iso88599", "ISO-8859-9" },
  { "kana8", "HP-KANA8" },
  { "koi8r", "KOI8-R" },
  { "roman8", "HP-ROMAN8" },
  { "tis620", "TIS-620" },
  { "turkish8", "HP-TURKISH8" },
  { "utf8", "UTF-8" }
# define alias_table_defined
# endif
# if defined __sgi /* IRIX */
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-9", "ISO-8859-9" },
  { "eucCN", "GB2312" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" },
  { "eucTW", "EUC-TW" }
# define alias_table_defined
# endif
# if defined __osf__ /* OSF/1 */
  /*{ "GBK", "GBK" },*/
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-8", "ISO-8859-8" },
  { "ISO8859-9", "ISO-8859-9" },
  { "KSC5601", "CP949" },
  { "SJIS", "SHIFT_JIS" },
  { "TACTIS", "TIS-620" },
  /*{ "UTF-8", "UTF-8" },*/
  { "big5", "BIG5" },
  { "cp850", "CP850" },
  { "dechanyu", "DEC-HANYU" },
  { "dechanzi", "GB2312" },
  { "deckanji", "DEC-KANJI" },
  { "deckorean", "EUC-KR" },
  { "eucJP", "EUC-JP" },
  { "eucKR", "EUC-KR" },
  { "eucTW", "EUC-TW" },
  { "sdeckanji", "EUC-JP" }
# define alias_table_defined
# endif
# if defined __sun /* Solaris */
  { "5601", "EUC-KR" },
  { "646", "ASCII" },
  /*{ "BIG5", "BIG5" },*/
  { "Big5-HKSCS", "BIG5-HKSCS" },
  { "GB18030", "GB18030" },
  /*{ "GBK", "GBK" },*/
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-11", "TIS-620" },
  { "ISO8859-13", "ISO-8859-13" },
  { "ISO8859-15", "ISO-8859-15" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-3", "ISO-8859-3" },
  { "ISO8859-4", "ISO-8859-4" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-6", "ISO-8859-6" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-8", "ISO-8859-8" },
  { "ISO8859-9", "ISO-8859-9" },
  { "PCK", "SHIFT_JIS" },
  { "TIS620.2533", "TIS-620" },
  /*{ "UTF-8", "UTF-8" },*/
  { "ansi-1251", "CP1251" },
  { "cns11643", "EUC-TW" },
  { "eucJP", "EUC-JP" },
  { "gb2312", "GB2312" },
  { "koi8-r", "KOI8-R" }
# define alias_table_defined
# endif
# if defined __minix /* Minix */
  { "646", "ASCII" }
# define alias_table_defined
# endif
# if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */
  /* The aliases here have the form "CP<number>": locale_charset() builds
     such names with sprintf ("CP%u", GetACP ()) or from the locale
     name's codepage suffix.  */
  { "CP1361", "JOHAB" },
  { "CP20127", "ASCII" },
  { "CP20866", "KOI8-R" },
  { "CP20936", "GB2312" },
  { "CP21866", "KOI8-RU" },
  { "CP28591", "ISO-8859-1" },
  { "CP28592", "ISO-8859-2" },
  { "CP28593", "ISO-8859-3" },
  { "CP28594", "ISO-8859-4" },
  { "CP28595", "ISO-8859-5" },
  { "CP28596", "ISO-8859-6" },
  { "CP28597", "ISO-8859-7" },
  { "CP28598", "ISO-8859-8" },
  { "CP28599", "ISO-8859-9" },
  { "CP28605", "ISO-8859-15" },
  { "CP38598", "ISO-8859-8" },
  { "CP51932", "EUC-JP" },
  { "CP51936", "GB2312" },
  { "CP51949", "EUC-KR" },
  { "CP51950", "EUC-TW" },
  { "CP54936", "GB18030" },
  { "CP65001", "UTF-8" },
  { "CP936", "GBK" }
# define alias_table_defined
# endif
# if defined OS2 /* OS/2 */
  /* The list of encodings is taken from "List of OS/2 Codepages"
     by Alex Taylor:
     <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
     See also "IBM Globalization - Code page identifiers":
     <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
  { "CP1089", "ISO-8859-6" },
  { "CP1208", "UTF-8" },
  { "CP1381", "GB2312" },
  { "CP1386", "GBK" },
  { "CP3372", "EUC-JP" },
  { "CP813", "ISO-8859-7" },
  { "CP819", "ISO-8859-1" },
  { "CP878", "KOI8-R" },
  { "CP912", "ISO-8859-2" },
  { "CP913", "ISO-8859-3" },
  { "CP914", "ISO-8859-4" },
  { "CP915", "ISO-8859-5" },
  { "CP916", "ISO-8859-8" },
  { "CP920", "ISO-8859-9" },
  { "CP921", "ISO-8859-13" },
  { "CP923", "ISO-8859-15" },
  { "CP954", "EUC-JP" },
  { "CP964", "EUC-TW" },
  { "CP970", "EUC-KR" }
# define alias_table_defined
# endif
# if defined VMS /* OpenVMS */
  /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
     "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
     section 10.7 "Handling Different Character Sets". */
  { "DECHANYU", "DEC-HANYU" },
  { "DECHANZI", "GB2312" },
  { "DECKANJI", "DEC-KANJI" },
  { "DECKOREAN", "EUC-KR" },
  { "ISO8859-1", "ISO-8859-1" },
  { "ISO8859-2", "ISO-8859-2" },
  { "ISO8859-5", "ISO-8859-5" },
  { "ISO8859-7", "ISO-8859-7" },
  { "ISO8859-8", "ISO-8859-8" },
  { "ISO8859-9", "ISO-8859-9" },
  { "SDECKANJI", "EUC-JP" },
  { "SJIS", "SHIFT_JIS" },
  { "eucJP", "EUC-JP" },
  { "eucTW", "EUC-TW" }
# define alias_table_defined
# endif
# ifndef alias_table_defined
  /* Just a dummy entry, to avoid a C syntax error.
     It is never looked up: the lookup code in locale_charset() is only
     compiled when alias_table_defined is defined.  */
  { "", "" }
# endif
};
430:
431: # endif
432:
433: #else
434:
435: /* On these platforms, we use a mapping from locale name to GNU canonical
436: encoding name. */
437:
/* One row of the locale table.  Both names are stored inline in the row
   (no pointers): the locale name may have up to 17 characters and the
   encoding name up to 11 characters, each followed by a NUL byte.  */
struct table_entry
{
  /* Locale name, as found in LC_ALL / LC_CTYPE / LANG.  */
  char const locale[18];
  /* GNU canonical name of the encoding used by that locale.  */
  char const canonical[12];
};
443:
/* Table of platform-dependent mappings, sorted in ascending order.

   Each row maps a complete locale name ('locale') to the GNU canonical
   name of the encoding used by that locale ('canonical').

   locale_charset() looks the locale name up with a binary search that
   compares the 'locale' field using strcmp(), so the rows of every
   platform section must stay sorted in strcmp() order.

   The last row of each section carries no trailing comma, i.e. the
   sections are written for the case that at most one of them is active.
   A section that contributes rows also defines 'locale_table_defined';
   locale_charset() only performs the lookup when that macro is
   defined.  */
static const struct table_entry locale_table[] =
{
# if defined __FreeBSD__ /* FreeBSD 4.2 */
  { "cs_CZ.ISO_8859-2", "ISO-8859-2" },
  { "da_DK.DIS_8859-15", "ISO-8859-15" },
  { "da_DK.ISO_8859-1", "ISO-8859-1" },
  { "de_AT.DIS_8859-15", "ISO-8859-15" },
  { "de_AT.ISO_8859-1", "ISO-8859-1" },
  { "de_CH.DIS_8859-15", "ISO-8859-15" },
  { "de_CH.ISO_8859-1", "ISO-8859-1" },
  { "de_DE.DIS_8859-15", "ISO-8859-15" },
  { "de_DE.ISO_8859-1", "ISO-8859-1" },
  { "en_AU.DIS_8859-15", "ISO-8859-15" },
  { "en_AU.ISO_8859-1", "ISO-8859-1" },
  { "en_CA.DIS_8859-15", "ISO-8859-15" },
  { "en_CA.ISO_8859-1", "ISO-8859-1" },
  { "en_GB.DIS_8859-15", "ISO-8859-15" },
  { "en_GB.ISO_8859-1", "ISO-8859-1" },
  { "en_US.DIS_8859-15", "ISO-8859-15" },
  { "en_US.ISO_8859-1", "ISO-8859-1" },
  { "es_ES.DIS_8859-15", "ISO-8859-15" },
  { "es_ES.ISO_8859-1", "ISO-8859-1" },
  { "fi_FI.DIS_8859-15", "ISO-8859-15" },
  { "fi_FI.ISO_8859-1", "ISO-8859-1" },
  { "fr_BE.DIS_8859-15", "ISO-8859-15" },
  { "fr_BE.ISO_8859-1", "ISO-8859-1" },
  { "fr_CA.DIS_8859-15", "ISO-8859-15" },
  { "fr_CA.ISO_8859-1", "ISO-8859-1" },
  { "fr_CH.DIS_8859-15", "ISO-8859-15" },
  { "fr_CH.ISO_8859-1", "ISO-8859-1" },
  { "fr_FR.DIS_8859-15", "ISO-8859-15" },
  { "fr_FR.ISO_8859-1", "ISO-8859-1" },
  { "hr_HR.ISO_8859-2", "ISO-8859-2" },
  { "hu_HU.ISO_8859-2", "ISO-8859-2" },
  { "is_IS.DIS_8859-15", "ISO-8859-15" },
  { "is_IS.ISO_8859-1", "ISO-8859-1" },
  { "it_CH.DIS_8859-15", "ISO-8859-15" },
  { "it_CH.ISO_8859-1", "ISO-8859-1" },
  { "it_IT.DIS_8859-15", "ISO-8859-15" },
  { "it_IT.ISO_8859-1", "ISO-8859-1" },
  { "ja_JP.EUC", "EUC-JP" },
  { "ja_JP.SJIS", "SHIFT_JIS" },
  { "ja_JP.Shift_JIS", "SHIFT_JIS" },
  { "ko_KR.EUC", "EUC-KR" },
  { "la_LN.ASCII", "ASCII" },
  { "la_LN.DIS_8859-15", "ISO-8859-15" },
  { "la_LN.ISO_8859-1", "ISO-8859-1" },
  { "la_LN.ISO_8859-2", "ISO-8859-2" },
  { "la_LN.ISO_8859-4", "ISO-8859-4" },
  { "lt_LN.ASCII", "ASCII" },
  { "lt_LN.DIS_8859-15", "ISO-8859-15" },
  { "lt_LN.ISO_8859-1", "ISO-8859-1" },
  { "lt_LN.ISO_8859-2", "ISO-8859-2" },
  { "lt_LT.ISO_8859-4", "ISO-8859-4" },
  { "nl_BE.DIS_8859-15", "ISO-8859-15" },
  { "nl_BE.ISO_8859-1", "ISO-8859-1" },
  { "nl_NL.DIS_8859-15", "ISO-8859-15" },
  { "nl_NL.ISO_8859-1", "ISO-8859-1" },
  { "no_NO.DIS_8859-15", "ISO-8859-15" },
  { "no_NO.ISO_8859-1", "ISO-8859-1" },
  { "pl_PL.ISO_8859-2", "ISO-8859-2" },
  { "pt_PT.DIS_8859-15", "ISO-8859-15" },
  { "pt_PT.ISO_8859-1", "ISO-8859-1" },
  { "ru_RU.CP866", "CP866" },
  { "ru_RU.ISO_8859-5", "ISO-8859-5" },
  { "ru_RU.KOI8-R", "KOI8-R" },
  { "ru_SU.CP866", "CP866" },
  { "ru_SU.ISO_8859-5", "ISO-8859-5" },
  { "ru_SU.KOI8-R", "KOI8-R" },
  { "sl_SI.ISO_8859-2", "ISO-8859-2" },
  { "sv_SE.DIS_8859-15", "ISO-8859-15" },
  { "sv_SE.ISO_8859-1", "ISO-8859-1" },
  { "uk_UA.KOI8-U", "KOI8-U" },
  { "zh_CN.EUC", "GB2312" },
  { "zh_TW.BIG5", "BIG5" },
  { "zh_TW.Big5", "BIG5" }
# define locale_table_defined
# endif
# if defined __DJGPP__ /* DOS / DJGPP 2.03 */
  /* The encodings given here may not all be correct.
     If you find that the encoding given for your language and
     country is not the one your DOS machine actually uses, just
     correct it in this file, and send a mail to
     Juan Manuel Guerrero <juan.guerrero@gmx.de>
     and <bug-gnulib@gnu.org>. */
  { "C", "ASCII" },
  { "ar", "CP864" },
  { "ar_AE", "CP864" },
  { "ar_DZ", "CP864" },
  { "ar_EG", "CP864" },
  { "ar_IQ", "CP864" },
  { "ar_IR", "CP864" },
  { "ar_JO", "CP864" },
  { "ar_KW", "CP864" },
  { "ar_MA", "CP864" },
  { "ar_OM", "CP864" },
  { "ar_QA", "CP864" },
  { "ar_SA", "CP864" },
  { "ar_SY", "CP864" },
  { "be", "CP866" },
  { "be_BE", "CP866" },
  { "bg", "CP866" }, /* not CP855 ?? */
  { "bg_BG", "CP866" }, /* not CP855 ?? */
  { "ca", "CP850" },
  { "ca_ES", "CP850" },
  { "cs", "CP852" },
  { "cs_CZ", "CP852" },
  { "da", "CP865" }, /* not CP850 ?? */
  { "da_DK", "CP865" }, /* not CP850 ?? */
  { "de", "CP850" },
  { "de_AT", "CP850" },
  { "de_CH", "CP850" },
  { "de_DE", "CP850" },
  { "el", "CP869" },
  { "el_GR", "CP869" },
  { "en", "CP850" },
  { "en_AU", "CP850" }, /* not CP437 ?? */
  { "en_CA", "CP850" },
  { "en_GB", "CP850" },
  { "en_NZ", "CP437" },
  { "en_US", "CP437" },
  { "en_ZA", "CP850" }, /* not CP437 ?? */
  { "eo", "CP850" },
  { "eo_EO", "CP850" },
  { "es", "CP850" },
  { "es_AR", "CP850" },
  { "es_BO", "CP850" },
  { "es_CL", "CP850" },
  { "es_CO", "CP850" },
  { "es_CR", "CP850" },
  { "es_CU", "CP850" },
  { "es_DO", "CP850" },
  { "es_EC", "CP850" },
  { "es_ES", "CP850" },
  { "es_GT", "CP850" },
  { "es_HN", "CP850" },
  { "es_MX", "CP850" },
  { "es_NI", "CP850" },
  { "es_PA", "CP850" },
  { "es_PE", "CP850" },
  { "es_PY", "CP850" },
  { "es_SV", "CP850" },
  { "es_UY", "CP850" },
  { "es_VE", "CP850" },
  { "et", "CP850" },
  { "et_EE", "CP850" },
  { "eu", "CP850" },
  { "eu_ES", "CP850" },
  { "fi", "CP850" },
  { "fi_FI", "CP850" },
  { "fr", "CP850" },
  { "fr_BE", "CP850" },
  { "fr_CA", "CP850" },
  { "fr_CH", "CP850" },
  { "fr_FR", "CP850" },
  { "ga", "CP850" },
  { "ga_IE", "CP850" },
  { "gd", "CP850" },
  { "gd_GB", "CP850" },
  { "gl", "CP850" },
  { "gl_ES", "CP850" },
  { "he", "CP862" },
  { "he_IL", "CP862" },
  { "hr", "CP852" },
  { "hr_HR", "CP852" },
  { "hu", "CP852" },
  { "hu_HU", "CP852" },
  { "id", "CP850" }, /* not CP437 ?? */
  { "id_ID", "CP850" }, /* not CP437 ?? */
  { "is", "CP861" }, /* not CP850 ?? */
  { "is_IS", "CP861" }, /* not CP850 ?? */
  { "it", "CP850" },
  { "it_CH", "CP850" },
  { "it_IT", "CP850" },
  { "ja", "CP932" },
  { "ja_JP", "CP932" },
  { "kr", "CP949" }, /* not CP934 ?? */
  { "kr_KR", "CP949" }, /* not CP934 ?? */
  { "lt", "CP775" },
  { "lt_LT", "CP775" },
  { "lv", "CP775" },
  { "lv_LV", "CP775" },
  { "mk", "CP866" }, /* not CP855 ?? */
  { "mk_MK", "CP866" }, /* not CP855 ?? */
  { "mt", "CP850" },
  { "mt_MT", "CP850" },
  { "nb", "CP865" }, /* not CP850 ?? */
  { "nb_NO", "CP865" }, /* not CP850 ?? */
  { "nl", "CP850" },
  { "nl_BE", "CP850" },
  { "nl_NL", "CP850" },
  { "nn", "CP865" }, /* not CP850 ?? */
  { "nn_NO", "CP865" }, /* not CP850 ?? */
  { "no", "CP865" }, /* not CP850 ?? */
  { "no_NO", "CP865" }, /* not CP850 ?? */
  { "pl", "CP852" },
  { "pl_PL", "CP852" },
  { "pt", "CP850" },
  { "pt_BR", "CP850" },
  { "pt_PT", "CP850" },
  { "ro", "CP852" },
  { "ro_RO", "CP852" },
  { "ru", "CP866" },
  { "ru_RU", "CP866" },
  { "sk", "CP852" },
  { "sk_SK", "CP852" },
  { "sl", "CP852" },
  { "sl_SI", "CP852" },
  { "sq", "CP852" },
  { "sq_AL", "CP852" },
  { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */
  { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
  { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
  { "sv", "CP850" },
  { "sv_SE", "CP850" },
  { "th", "CP874" },
  { "th_TH", "CP874" },
  { "tr", "CP857" },
  { "tr_TR", "CP857" },
  { "uk", "CP1125" },
  { "uk_UA", "CP1125" },
  { "zh_CN", "GBK" },
  { "zh_TW", "CP950" } /* not CP938 ?? */
# define locale_table_defined
# endif
# ifndef locale_table_defined
  /* Just a dummy entry, to avoid a C syntax error.
     It is never looked up: the lookup code in locale_charset() is only
     compiled when locale_table_defined is defined.  */
  { "", "" }
# endif
};
675:
676: #endif
677:
678:
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in localcharset.h.

   Return value: a NUL-terminated encoding name, never NULL and never the
   empty string.  The result must not be freed.  Depending on the platform
   branch it points to a string literal, to a row of the mapping table, to
   the string returned by nl_langinfo, to a function-local static buffer
   (which a later call may overwrite), or - in the Cygwin and OS/2
   branches - directly into the value of an environment variable.

   If the canonical name cannot be determined, the result is a
   non-canonical name.

   Overview:
     1. Obtain a platform-specific encoding name (nl_langinfo, the
        Windows locale name / GetACP, the OS/2 codepage) or, on systems
        without any of these, the locale name from the environment.
     2. Translate it through the sorted mapping table of the platform,
        if there is one.
     3. Apply the platform-specific fallbacks for unknown or empty
        names.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative;
                 the cast makes the unsigned comparison explicit.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (current_locale != NULL && strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  /* Be defensive: never hand a null pointer to strrchr.  Without a locale
     name we simply fall through to GetACP() below.  */
  pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);
  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }

  /* The suffix of the locale name is not always a codepage number: for
     UTF-8 locales the runtime can report a name such as
     "French_France.utf8" or "fr_FR.UTF-8".  "CPutf8" / "CPUTF-8" are not
     in the alias table, so map these spellings to the canonical name
     here.  (A numeric "CP65001" is handled by the alias table.)  */
  if (strcmp (buf + 2, "utf8") == 0 || strcmp (buf + 2, "UTF-8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

# elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here, so the difference is non-negative;
             the cast makes the unsigned comparison explicit.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it so that the argument type matches
             the %u conversion (and the 10 digits reserved in buf).  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      goto done_table_lookup;
    else
#  endif
      {
        const struct table_entry * const table = alias_table;
        size_t const table_size =
          sizeof (alias_table) / sizeof (struct table_entry);
        /* The table is sorted.  Perform a binary search.  */
        size_t hi = table_size;
        size_t lo = 0;
        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (table[i].alias, codeset) < 0,
               for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
            size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
            int cmp = strcmp (table[mid].alias, codeset);
            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found an i with
                   strcmp (table[i].alias, codeset) == 0.  */
                codeset = table[mid].canonical;
                goto done_table_lookup;
              }
          }
      }
    /* The label sits in the never-taken branch of 'if (0)': jumping to it
       skips the "not found" block below, falling through executes it.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like DJGPP) have only the C locale.  Therefore we don't use setlocale
     here; it would return "C" when it doesn't support the locale name the
     user has set.  */
# if 0
  locale = setlocale (LC_CTYPE, NULL);
# endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
          if (locale == NULL)
            locale = "";
        }
    }

  /* Map locale name to canonical encoding name.  */
  {
# ifdef locale_table_defined
    const struct table_entry * const table = locale_table;
    size_t const table_size =
      sizeof (locale_table) / sizeof (struct table_entry);
    /* The table is sorted.  Perform a binary search.  */
    size_t hi = table_size;
    size_t lo = 0;
    while (lo < hi)
      {
        /* Invariant:
           for i < lo, strcmp (table[i].locale, locale) < 0,
           for i >= hi, strcmp (table[i].locale, locale) > 0.  */
        size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
        int cmp = strcmp (table[mid].locale, locale);
        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            /* Found an i with
               strcmp (table[i].locale, locale) == 0.  */
            codeset = table[mid].canonical;
            goto done_table_lookup;
          }
      }
    /* The label sits in the never-taken branch of 'if (0)': jumping to it
       skips the "not found" block below, falling through executes it.  */
    if (0)
      done_table_lookup: ;
    else
# endif
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
/* FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org> */