Annotation of embedaddon/libiconv/libcharset/lib/localcharset.c, revision 1.1.1.3
1.1 misho 1: /* Determine a canonical name for the current locale's character encoding.
2:
1.1.1.3 ! misho 3: Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc.
1.1 misho 4:
5: This program is free software; you can redistribute it and/or modify it
6: under the terms of the GNU Library General Public License as published
7: by the Free Software Foundation; either version 2, or (at your option)
8: any later version.
9:
10: This program is distributed in the hope that it will be useful,
11: but WITHOUT ANY WARRANTY; without even the implied warranty of
12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: Library General Public License for more details.
14:
1.1.1.3 ! misho 15: You should have received a copy of the GNU Library General Public License
! 16: along with this program; if not, see <https://www.gnu.org/licenses/>. */
1.1 misho 17:
18: /* Written by Bruno Haible <bruno@clisp.org>. */
19:
20: #include <config.h>
21:
22: /* Specification. */
23: #include "localcharset.h"
24:
25: #include <stddef.h>
26: #include <stdio.h>
27: #include <string.h>
28: #include <stdlib.h>
29:
30: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
1.1.1.3 ! misho 31: # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
1.1 misho 32: #endif
33:
1.1.1.3 ! misho 34: #if defined _WIN32 && !defined __CYGWIN__
! 35: # define WINDOWS_NATIVE
! 36: # include <locale.h>
1.1 misho 37: #endif
38:
39: #if defined __EMX__
40: /* Assume EMX program runs on OS/2, even if compiled under DOS. */
41: # ifndef OS2
42: # define OS2
43: # endif
44: #endif
45:
1.1.1.3 ! misho 46: #if !defined WINDOWS_NATIVE
1.1 misho 47: # if HAVE_LANGINFO_CODESET
48: # include <langinfo.h>
49: # else
1.1.1.3 ! misho 50: # if 0 /* see comment regarding use of setlocale(), below */
1.1 misho 51: # include <locale.h>
52: # endif
53: # endif
54: # ifdef __CYGWIN__
55: # define WIN32_LEAN_AND_MEAN
56: # include <windows.h>
57: # endif
1.1.1.3 ! misho 58: #elif defined WINDOWS_NATIVE
1.1 misho 59: # define WIN32_LEAN_AND_MEAN
60: # include <windows.h>
61: #endif
62: #if defined OS2
63: # define INCL_DOS
64: # include <os2.h>
65: #endif
66:
1.1.1.3 ! misho 67: /* For MB_CUR_MAX_L */
! 68: #if defined DARWIN7
! 69: # include <xlocale.h>
1.1 misho 70: #endif
71:
72:
1.1.1.3 ! misho 73: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
1.1.1.2 misho 74:
1.1.1.3 ! misho 75: /* On these platforms, we use a mapping from non-canonical encoding name
! 76: to GNU canonical encoding name. */
1.1 misho 77:
1.1.1.3 ! misho 78: /* With glibc-2.1 or newer, we don't need any canonicalization,
! 79: because glibc has iconv and both glibc and libiconv support all
! 80: GNU canonical names directly. */
! 81: # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
1.1 misho 82:
/* One row of the alias table: an encoding name as reported by the system,
   paired with the GNU canonical name it stands for.  Both fields are
   fixed-size so that the whole table is a single constant array; each one
   holds at most 11 characters plus the terminating NUL.  */
struct table_entry
{
  const char alias[11 + 1];      /* system-specific encoding name */
  const char canonical[11 + 1];  /* GNU canonical encoding name */
};
! 88:
! 89: /* Table of platform-dependent mappings, sorted in ascending order. */
! 90: static const struct table_entry alias_table[] =
! 91: {
! 92: # if defined __FreeBSD__ /* FreeBSD */
! 93: /*{ "ARMSCII-8", "ARMSCII-8" },*/
! 94: { "Big5", "BIG5" },
! 95: { "Big5HKSCS", "BIG5-HKSCS" },
! 96: { "C", "ASCII" },
! 97: /*{ "CP1131", "CP1131" },*/
! 98: /*{ "CP1251", "CP1251" },*/
! 99: /*{ "CP866", "CP866" },*/
! 100: /*{ "GB18030", "GB18030" },*/
! 101: /*{ "GB2312", "GB2312" },*/
! 102: /*{ "GBK", "GBK" },*/
! 103: /*{ "ISCII-DEV", "?" },*/
! 104: { "ISO8859-1", "ISO-8859-1" },
! 105: { "ISO8859-13", "ISO-8859-13" },
! 106: { "ISO8859-15", "ISO-8859-15" },
! 107: { "ISO8859-2", "ISO-8859-2" },
! 108: { "ISO8859-4", "ISO-8859-4" },
! 109: { "ISO8859-5", "ISO-8859-5" },
! 110: { "ISO8859-7", "ISO-8859-7" },
! 111: { "ISO8859-9", "ISO-8859-9" },
! 112: /*{ "KOI8-R", "KOI8-R" },*/
! 113: /*{ "KOI8-U", "KOI8-U" },*/
! 114: { "SJIS", "SHIFT_JIS" },
! 115: { "US-ASCII", "ASCII" },
! 116: { "eucCN", "GB2312" },
! 117: { "eucJP", "EUC-JP" },
! 118: { "eucKR", "EUC-KR" }
! 119: # define alias_table_defined
! 120: # endif
! 121: # if defined __NetBSD__ /* NetBSD */
! 122: { "646", "ASCII" },
! 123: /*{ "ARMSCII-8", "ARMSCII-8" },*/
! 124: /*{ "BIG5", "BIG5" },*/
! 125: { "Big5-HKSCS", "BIG5-HKSCS" },
! 126: /*{ "CP1251", "CP1251" },*/
! 127: /*{ "CP866", "CP866" },*/
! 128: /*{ "GB18030", "GB18030" },*/
! 129: /*{ "GB2312", "GB2312" },*/
! 130: { "ISO8859-1", "ISO-8859-1" },
! 131: { "ISO8859-13", "ISO-8859-13" },
! 132: { "ISO8859-15", "ISO-8859-15" },
! 133: { "ISO8859-2", "ISO-8859-2" },
! 134: { "ISO8859-4", "ISO-8859-4" },
! 135: { "ISO8859-5", "ISO-8859-5" },
! 136: { "ISO8859-7", "ISO-8859-7" },
! 137: /*{ "KOI8-R", "KOI8-R" },*/
! 138: /*{ "KOI8-U", "KOI8-U" },*/
! 139: /*{ "PT154", "PT154" },*/
! 140: { "SJIS", "SHIFT_JIS" },
! 141: { "eucCN", "GB2312" },
! 142: { "eucJP", "EUC-JP" },
! 143: { "eucKR", "EUC-KR" },
! 144: { "eucTW", "EUC-TW" }
! 145: # define alias_table_defined
! 146: # endif
! 147: # if defined __OpenBSD__ /* OpenBSD */
! 148: { "646", "ASCII" },
! 149: { "ISO8859-1", "ISO-8859-1" },
! 150: { "ISO8859-13", "ISO-8859-13" },
! 151: { "ISO8859-15", "ISO-8859-15" },
! 152: { "ISO8859-2", "ISO-8859-2" },
! 153: { "ISO8859-4", "ISO-8859-4" },
! 154: { "ISO8859-5", "ISO-8859-5" },
! 155: { "ISO8859-7", "ISO-8859-7" }
! 156: # define alias_table_defined
! 157: # endif
! 158: # if defined __APPLE__ && defined __MACH__ /* Mac OS X */
! 159: /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
! 160: useless:
! 161: - It returns the empty string when LANG is set to a locale of the
! 162: form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
! 163: LC_CTYPE file.
! 164: - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
! 165: the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
! 166: - The documentation says:
! 167: "... all code that calls BSD system routines should ensure
! 168: that the const *char parameters of these routines are in UTF-8
! 169: encoding. All BSD system functions expect their string
! 170: parameters to be in UTF-8 encoding and nothing else."
! 171: It also says
! 172: "An additional caveat is that string parameters for files,
! 173: paths, and other file-system entities must be in canonical
! 174: UTF-8. In a canonical UTF-8 Unicode string, all decomposable
! 175: characters are decomposed ..."
! 176: but this is not true: You can pass non-decomposed UTF-8 strings
! 177: to file system functions, and it is the OS which will convert
! 178: them to decomposed UTF-8 before accessing the file system.
! 179: - The Apple Terminal application displays UTF-8 by default.
! 180: - However, other applications are free to use different encodings:
! 181: - xterm uses ISO-8859-1 by default.
! 182: - TextEdit uses MacRoman by default.
! 183: We prefer UTF-8 over decomposed UTF-8-MAC because one should
! 184: minimize the use of decomposed Unicode. Unfortunately, through the
! 185: Darwin file system, decomposed UTF-8 strings are leaked into user
! 186: space nevertheless.
! 187: Then there are also the locales with encodings other than US-ASCII
! 188: and UTF-8. These locales can be occasionally useful to users (e.g.
! 189: when grepping through ISO-8859-1 encoded text files), when all their
! 190: file names are in US-ASCII.
! 191: */
! 192: { "ARMSCII-8", "ARMSCII-8" },
! 193: { "Big5", "BIG5" },
! 194: { "Big5HKSCS", "BIG5-HKSCS" },
! 195: { "CP1131", "CP1131" },
! 196: { "CP1251", "CP1251" },
! 197: { "CP866", "CP866" },
! 198: { "CP949", "CP949" },
! 199: { "GB18030", "GB18030" },
! 200: { "GB2312", "GB2312" },
! 201: { "GBK", "GBK" },
! 202: /*{ "ISCII-DEV", "?" },*/
! 203: { "ISO8859-1", "ISO-8859-1" },
! 204: { "ISO8859-13", "ISO-8859-13" },
! 205: { "ISO8859-15", "ISO-8859-15" },
! 206: { "ISO8859-2", "ISO-8859-2" },
! 207: { "ISO8859-4", "ISO-8859-4" },
! 208: { "ISO8859-5", "ISO-8859-5" },
! 209: { "ISO8859-7", "ISO-8859-7" },
! 210: { "ISO8859-9", "ISO-8859-9" },
! 211: { "KOI8-R", "KOI8-R" },
! 212: { "KOI8-U", "KOI8-U" },
! 213: { "PT154", "PT154" },
! 214: { "SJIS", "SHIFT_JIS" },
! 215: { "eucCN", "GB2312" },
! 216: { "eucJP", "EUC-JP" },
! 217: { "eucKR", "EUC-KR" }
! 218: # define alias_table_defined
! 219: # endif
! 220: # if defined _AIX /* AIX */
! 221: /*{ "GBK", "GBK" },*/
! 222: { "IBM-1046", "CP1046" },
! 223: { "IBM-1124", "CP1124" },
! 224: { "IBM-1129", "CP1129" },
! 225: { "IBM-1252", "CP1252" },
! 226: { "IBM-850", "CP850" },
! 227: { "IBM-856", "CP856" },
! 228: { "IBM-921", "ISO-8859-13" },
! 229: { "IBM-922", "CP922" },
! 230: { "IBM-932", "CP932" },
! 231: { "IBM-943", "CP943" },
! 232: { "IBM-eucCN", "GB2312" },
! 233: { "IBM-eucJP", "EUC-JP" },
! 234: { "IBM-eucKR", "EUC-KR" },
! 235: { "IBM-eucTW", "EUC-TW" },
! 236: { "ISO8859-1", "ISO-8859-1" },
! 237: { "ISO8859-15", "ISO-8859-15" },
! 238: { "ISO8859-2", "ISO-8859-2" },
! 239: { "ISO8859-5", "ISO-8859-5" },
! 240: { "ISO8859-6", "ISO-8859-6" },
! 241: { "ISO8859-7", "ISO-8859-7" },
! 242: { "ISO8859-8", "ISO-8859-8" },
! 243: { "ISO8859-9", "ISO-8859-9" },
! 244: { "TIS-620", "TIS-620" },
! 245: /*{ "UTF-8", "UTF-8" },*/
! 246: { "big5", "BIG5" }
! 247: # define alias_table_defined
! 248: # endif
! 249: # if defined __hpux /* HP-UX */
! 250: { "SJIS", "SHIFT_JIS" },
! 251: { "arabic8", "HP-ARABIC8" },
! 252: { "big5", "BIG5" },
! 253: { "cp1251", "CP1251" },
! 254: { "eucJP", "EUC-JP" },
! 255: { "eucKR", "EUC-KR" },
! 256: { "eucTW", "EUC-TW" },
! 257: { "gb18030", "GB18030" },
! 258: { "greek8", "HP-GREEK8" },
! 259: { "hebrew8", "HP-HEBREW8" },
! 260: { "hkbig5", "BIG5-HKSCS" },
! 261: { "hp15CN", "GB2312" },
! 262: { "iso88591", "ISO-8859-1" },
! 263: { "iso885913", "ISO-8859-13" },
! 264: { "iso885915", "ISO-8859-15" },
! 265: { "iso88592", "ISO-8859-2" },
! 266: { "iso88594", "ISO-8859-4" },
! 267: { "iso88595", "ISO-8859-5" },
! 268: { "iso88596", "ISO-8859-6" },
! 269: { "iso88597", "ISO-8859-7" },
! 270: { "iso88598", "ISO-8859-8" },
! 271: { "iso88599", "ISO-8859-9" },
! 272: { "kana8", "HP-KANA8" },
! 273: { "koi8r", "KOI8-R" },
! 274: { "roman8", "HP-ROMAN8" },
! 275: { "tis620", "TIS-620" },
! 276: { "turkish8", "HP-TURKISH8" },
! 277: { "utf8", "UTF-8" }
! 278: # define alias_table_defined
! 279: # endif
! 280: # if defined __sgi /* IRIX */
! 281: { "ISO8859-1", "ISO-8859-1" },
! 282: { "ISO8859-15", "ISO-8859-15" },
! 283: { "ISO8859-2", "ISO-8859-2" },
! 284: { "ISO8859-5", "ISO-8859-5" },
! 285: { "ISO8859-7", "ISO-8859-7" },
! 286: { "ISO8859-9", "ISO-8859-9" },
! 287: { "eucCN", "GB2312" },
! 288: { "eucJP", "EUC-JP" },
! 289: { "eucKR", "EUC-KR" },
! 290: { "eucTW", "EUC-TW" }
! 291: # define alias_table_defined
! 292: # endif
! 293: # if defined __osf__ /* OSF/1 */
! 294: /*{ "GBK", "GBK" },*/
! 295: { "ISO8859-1", "ISO-8859-1" },
! 296: { "ISO8859-15", "ISO-8859-15" },
! 297: { "ISO8859-2", "ISO-8859-2" },
! 298: { "ISO8859-4", "ISO-8859-4" },
! 299: { "ISO8859-5", "ISO-8859-5" },
! 300: { "ISO8859-7", "ISO-8859-7" },
! 301: { "ISO8859-8", "ISO-8859-8" },
! 302: { "ISO8859-9", "ISO-8859-9" },
! 303: { "KSC5601", "CP949" },
! 304: { "SJIS", "SHIFT_JIS" },
! 305: { "TACTIS", "TIS-620" },
! 306: /*{ "UTF-8", "UTF-8" },*/
! 307: { "big5", "BIG5" },
! 308: { "cp850", "CP850" },
! 309: { "dechanyu", "DEC-HANYU" },
! 310: { "dechanzi", "GB2312" },
! 311: { "deckanji", "DEC-KANJI" },
! 312: { "deckorean", "EUC-KR" },
! 313: { "eucJP", "EUC-JP" },
! 314: { "eucKR", "EUC-KR" },
! 315: { "eucTW", "EUC-TW" },
! 316: { "sdeckanji", "EUC-JP" }
! 317: # define alias_table_defined
! 318: # endif
! 319: # if defined __sun /* Solaris */
! 320: { "5601", "EUC-KR" },
! 321: { "646", "ASCII" },
! 322: /*{ "BIG5", "BIG5" },*/
! 323: { "Big5-HKSCS", "BIG5-HKSCS" },
! 324: { "GB18030", "GB18030" },
! 325: /*{ "GBK", "GBK" },*/
! 326: { "ISO8859-1", "ISO-8859-1" },
! 327: { "ISO8859-11", "TIS-620" },
! 328: { "ISO8859-13", "ISO-8859-13" },
! 329: { "ISO8859-15", "ISO-8859-15" },
! 330: { "ISO8859-2", "ISO-8859-2" },
! 331: { "ISO8859-3", "ISO-8859-3" },
! 332: { "ISO8859-4", "ISO-8859-4" },
! 333: { "ISO8859-5", "ISO-8859-5" },
! 334: { "ISO8859-6", "ISO-8859-6" },
! 335: { "ISO8859-7", "ISO-8859-7" },
! 336: { "ISO8859-8", "ISO-8859-8" },
! 337: { "ISO8859-9", "ISO-8859-9" },
! 338: { "PCK", "SHIFT_JIS" },
! 339: { "TIS620.2533", "TIS-620" },
! 340: /*{ "UTF-8", "UTF-8" },*/
! 341: { "ansi-1251", "CP1251" },
! 342: { "cns11643", "EUC-TW" },
! 343: { "eucJP", "EUC-JP" },
! 344: { "gb2312", "GB2312" },
! 345: { "koi8-r", "KOI8-R" }
! 346: # define alias_table_defined
! 347: # endif
! 348: # if defined __minix /* Minix */
! 349: { "646", "ASCII" }
! 350: # define alias_table_defined
! 351: # endif
! 352: # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */
! 353: { "CP1361", "JOHAB" },
! 354: { "CP20127", "ASCII" },
! 355: { "CP20866", "KOI8-R" },
! 356: { "CP20936", "GB2312" },
! 357: { "CP21866", "KOI8-RU" },
! 358: { "CP28591", "ISO-8859-1" },
! 359: { "CP28592", "ISO-8859-2" },
! 360: { "CP28593", "ISO-8859-3" },
! 361: { "CP28594", "ISO-8859-4" },
! 362: { "CP28595", "ISO-8859-5" },
! 363: { "CP28596", "ISO-8859-6" },
! 364: { "CP28597", "ISO-8859-7" },
! 365: { "CP28598", "ISO-8859-8" },
! 366: { "CP28599", "ISO-8859-9" },
! 367: { "CP28605", "ISO-8859-15" },
! 368: { "CP38598", "ISO-8859-8" },
! 369: { "CP51932", "EUC-JP" },
! 370: { "CP51936", "GB2312" },
! 371: { "CP51949", "EUC-KR" },
! 372: { "CP51950", "EUC-TW" },
! 373: { "CP54936", "GB18030" },
! 374: { "CP65001", "UTF-8" },
! 375: { "CP936", "GBK" }
! 376: # define alias_table_defined
! 377: # endif
! 378: # if defined OS2 /* OS/2 */
! 379: /* The list of encodings is taken from "List of OS/2 Codepages"
! 380: by Alex Taylor:
! 381: <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
! 382: See also "IBM Globalization - Code page identifiers":
! 383: <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */
! 384: { "CP1089", "ISO-8859-6" },
! 385: { "CP1208", "UTF-8" },
! 386: { "CP1381", "GB2312" },
! 387: { "CP1386", "GBK" },
! 388: { "CP3372", "EUC-JP" },
! 389: { "CP813", "ISO-8859-7" },
! 390: { "CP819", "ISO-8859-1" },
! 391: { "CP878", "KOI8-R" },
! 392: { "CP912", "ISO-8859-2" },
! 393: { "CP913", "ISO-8859-3" },
! 394: { "CP914", "ISO-8859-4" },
! 395: { "CP915", "ISO-8859-5" },
! 396: { "CP916", "ISO-8859-8" },
! 397: { "CP920", "ISO-8859-9" },
! 398: { "CP921", "ISO-8859-13" },
! 399: { "CP923", "ISO-8859-15" },
! 400: { "CP954", "EUC-JP" },
! 401: { "CP964", "EUC-TW" },
! 402: { "CP970", "EUC-KR" }
! 403: # define alias_table_defined
! 404: # endif
! 405: # if defined VMS /* OpenVMS */
! 406: /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
! 407: "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
! 408: section 10.7 "Handling Different Character Sets". */
! 409: { "DECHANYU", "DEC-HANYU" },
! 410: { "DECHANZI", "GB2312" },
! 411: { "DECKANJI", "DEC-KANJI" },
! 412: { "DECKOREAN", "EUC-KR" },
! 413: { "ISO8859-1", "ISO-8859-1" },
! 414: { "ISO8859-2", "ISO-8859-2" },
! 415: { "ISO8859-5", "ISO-8859-5" },
! 416: { "ISO8859-7", "ISO-8859-7" },
! 417: { "ISO8859-8", "ISO-8859-8" },
! 418: { "ISO8859-9", "ISO-8859-9" },
! 419: { "SDECKANJI", "EUC-JP" },
! 420: { "SJIS", "SHIFT_JIS" },
! 421: { "eucJP", "EUC-JP" },
! 422: { "eucTW", "EUC-TW" }
! 423: # define alias_table_defined
! 424: # endif
! 425: # ifndef alias_table_defined
! 426: /* Just a dummy entry, to avoid a C syntax error. */
! 427: { "", "" }
! 428: # endif
! 429: };
1.1.1.2 misho 430:
1.1.1.3 ! misho 431: # endif
1.1 misho 432:
433: #else
434:
1.1.1.3 ! misho 435: /* On these platforms, we use a mapping from locale name to GNU canonical
! 436: encoding name. */
! 437:
/* One row of the locale table: a locale name as found in the environment,
   paired with the GNU canonical name of the encoding that locale uses.
   The locale name holds at most 17 characters plus the terminating NUL,
   the canonical name at most 11 plus NUL.  */
struct table_entry
{
  const char locale[17 + 1];     /* locale name, e.g. from LC_ALL / LANG */
  const char canonical[11 + 1];  /* GNU canonical encoding name */
};
! 443:
! 444: /* Table of platform-dependent mappings, sorted in ascending order. */
! 445: static const struct table_entry locale_table[] =
! 446: {
! 447: # if defined __FreeBSD__ /* FreeBSD 4.2 */
! 448: { "cs_CZ.ISO_8859-2", "ISO-8859-2" },
! 449: { "da_DK.DIS_8859-15", "ISO-8859-15" },
! 450: { "da_DK.ISO_8859-1", "ISO-8859-1" },
! 451: { "de_AT.DIS_8859-15", "ISO-8859-15" },
! 452: { "de_AT.ISO_8859-1", "ISO-8859-1" },
! 453: { "de_CH.DIS_8859-15", "ISO-8859-15" },
! 454: { "de_CH.ISO_8859-1", "ISO-8859-1" },
! 455: { "de_DE.DIS_8859-15", "ISO-8859-15" },
! 456: { "de_DE.ISO_8859-1", "ISO-8859-1" },
! 457: { "en_AU.DIS_8859-15", "ISO-8859-15" },
! 458: { "en_AU.ISO_8859-1", "ISO-8859-1" },
! 459: { "en_CA.DIS_8859-15", "ISO-8859-15" },
! 460: { "en_CA.ISO_8859-1", "ISO-8859-1" },
! 461: { "en_GB.DIS_8859-15", "ISO-8859-15" },
! 462: { "en_GB.ISO_8859-1", "ISO-8859-1" },
! 463: { "en_US.DIS_8859-15", "ISO-8859-15" },
! 464: { "en_US.ISO_8859-1", "ISO-8859-1" },
! 465: { "es_ES.DIS_8859-15", "ISO-8859-15" },
! 466: { "es_ES.ISO_8859-1", "ISO-8859-1" },
! 467: { "fi_FI.DIS_8859-15", "ISO-8859-15" },
! 468: { "fi_FI.ISO_8859-1", "ISO-8859-1" },
! 469: { "fr_BE.DIS_8859-15", "ISO-8859-15" },
! 470: { "fr_BE.ISO_8859-1", "ISO-8859-1" },
! 471: { "fr_CA.DIS_8859-15", "ISO-8859-15" },
! 472: { "fr_CA.ISO_8859-1", "ISO-8859-1" },
! 473: { "fr_CH.DIS_8859-15", "ISO-8859-15" },
! 474: { "fr_CH.ISO_8859-1", "ISO-8859-1" },
! 475: { "fr_FR.DIS_8859-15", "ISO-8859-15" },
! 476: { "fr_FR.ISO_8859-1", "ISO-8859-1" },
! 477: { "hr_HR.ISO_8859-2", "ISO-8859-2" },
! 478: { "hu_HU.ISO_8859-2", "ISO-8859-2" },
! 479: { "is_IS.DIS_8859-15", "ISO-8859-15" },
! 480: { "is_IS.ISO_8859-1", "ISO-8859-1" },
! 481: { "it_CH.DIS_8859-15", "ISO-8859-15" },
! 482: { "it_CH.ISO_8859-1", "ISO-8859-1" },
! 483: { "it_IT.DIS_8859-15", "ISO-8859-15" },
! 484: { "it_IT.ISO_8859-1", "ISO-8859-1" },
! 485: { "ja_JP.EUC", "EUC-JP" },
! 486: { "ja_JP.SJIS", "SHIFT_JIS" },
! 487: { "ja_JP.Shift_JIS", "SHIFT_JIS" },
! 488: { "ko_KR.EUC", "EUC-KR" },
! 489: { "la_LN.ASCII", "ASCII" },
! 490: { "la_LN.DIS_8859-15", "ISO-8859-15" },
! 491: { "la_LN.ISO_8859-1", "ISO-8859-1" },
! 492: { "la_LN.ISO_8859-2", "ISO-8859-2" },
! 493: { "la_LN.ISO_8859-4", "ISO-8859-4" },
! 494: { "lt_LN.ASCII", "ASCII" },
! 495: { "lt_LN.DIS_8859-15", "ISO-8859-15" },
! 496: { "lt_LN.ISO_8859-1", "ISO-8859-1" },
! 497: { "lt_LN.ISO_8859-2", "ISO-8859-2" },
! 498: { "lt_LT.ISO_8859-4", "ISO-8859-4" },
! 499: { "nl_BE.DIS_8859-15", "ISO-8859-15" },
! 500: { "nl_BE.ISO_8859-1", "ISO-8859-1" },
! 501: { "nl_NL.DIS_8859-15", "ISO-8859-15" },
! 502: { "nl_NL.ISO_8859-1", "ISO-8859-1" },
! 503: { "no_NO.DIS_8859-15", "ISO-8859-15" },
! 504: { "no_NO.ISO_8859-1", "ISO-8859-1" },
! 505: { "pl_PL.ISO_8859-2", "ISO-8859-2" },
! 506: { "pt_PT.DIS_8859-15", "ISO-8859-15" },
! 507: { "pt_PT.ISO_8859-1", "ISO-8859-1" },
! 508: { "ru_RU.CP866", "CP866" },
! 509: { "ru_RU.ISO_8859-5", "ISO-8859-5" },
! 510: { "ru_RU.KOI8-R", "KOI8-R" },
! 511: { "ru_SU.CP866", "CP866" },
! 512: { "ru_SU.ISO_8859-5", "ISO-8859-5" },
! 513: { "ru_SU.KOI8-R", "KOI8-R" },
! 514: { "sl_SI.ISO_8859-2", "ISO-8859-2" },
! 515: { "sv_SE.DIS_8859-15", "ISO-8859-15" },
! 516: { "sv_SE.ISO_8859-1", "ISO-8859-1" },
! 517: { "uk_UA.KOI8-U", "KOI8-U" },
! 518: { "zh_CN.EUC", "GB2312" },
! 519: { "zh_TW.BIG5", "BIG5" },
! 520: { "zh_TW.Big5", "BIG5" }
! 521: # define locale_table_defined
1.1 misho 522: # endif
1.1.1.3 ! misho 523: # if defined __DJGPP__ /* DOS / DJGPP 2.03 */
! 524: /* The encodings given here may not all be correct.
! 525: If you find that the encoding given for your language and
! 526: country is not the one your DOS machine actually uses, just
! 527: correct it in this file, and send a mail to
! 528: Juan Manuel Guerrero <juan.guerrero@gmx.de>
! 529: and <bug-gnulib@gnu.org>. */
! 530: { "C", "ASCII" },
! 531: { "ar", "CP864" },
! 532: { "ar_AE", "CP864" },
! 533: { "ar_DZ", "CP864" },
! 534: { "ar_EG", "CP864" },
! 535: { "ar_IQ", "CP864" },
! 536: { "ar_IR", "CP864" },
! 537: { "ar_JO", "CP864" },
! 538: { "ar_KW", "CP864" },
! 539: { "ar_MA", "CP864" },
! 540: { "ar_OM", "CP864" },
! 541: { "ar_QA", "CP864" },
! 542: { "ar_SA", "CP864" },
! 543: { "ar_SY", "CP864" },
! 544: { "be", "CP866" },
! 545: { "be_BE", "CP866" },
! 546: { "bg", "CP866" }, /* not CP855 ?? */
! 547: { "bg_BG", "CP866" }, /* not CP855 ?? */
! 548: { "ca", "CP850" },
! 549: { "ca_ES", "CP850" },
! 550: { "cs", "CP852" },
! 551: { "cs_CZ", "CP852" },
! 552: { "da", "CP865" }, /* not CP850 ?? */
! 553: { "da_DK", "CP865" }, /* not CP850 ?? */
! 554: { "de", "CP850" },
! 555: { "de_AT", "CP850" },
! 556: { "de_CH", "CP850" },
! 557: { "de_DE", "CP850" },
! 558: { "el", "CP869" },
! 559: { "el_GR", "CP869" },
! 560: { "en", "CP850" },
! 561: { "en_AU", "CP850" }, /* not CP437 ?? */
! 562: { "en_CA", "CP850" },
! 563: { "en_GB", "CP850" },
! 564: { "en_NZ", "CP437" },
! 565: { "en_US", "CP437" },
! 566: { "en_ZA", "CP850" }, /* not CP437 ?? */
! 567: { "eo", "CP850" },
! 568: { "eo_EO", "CP850" },
! 569: { "es", "CP850" },
! 570: { "es_AR", "CP850" },
! 571: { "es_BO", "CP850" },
! 572: { "es_CL", "CP850" },
! 573: { "es_CO", "CP850" },
! 574: { "es_CR", "CP850" },
! 575: { "es_CU", "CP850" },
! 576: { "es_DO", "CP850" },
! 577: { "es_EC", "CP850" },
! 578: { "es_ES", "CP850" },
! 579: { "es_GT", "CP850" },
! 580: { "es_HN", "CP850" },
! 581: { "es_MX", "CP850" },
! 582: { "es_NI", "CP850" },
! 583: { "es_PA", "CP850" },
! 584: { "es_PE", "CP850" },
! 585: { "es_PY", "CP850" },
! 586: { "es_SV", "CP850" },
! 587: { "es_UY", "CP850" },
! 588: { "es_VE", "CP850" },
! 589: { "et", "CP850" },
! 590: { "et_EE", "CP850" },
! 591: { "eu", "CP850" },
! 592: { "eu_ES", "CP850" },
! 593: { "fi", "CP850" },
! 594: { "fi_FI", "CP850" },
! 595: { "fr", "CP850" },
! 596: { "fr_BE", "CP850" },
! 597: { "fr_CA", "CP850" },
! 598: { "fr_CH", "CP850" },
! 599: { "fr_FR", "CP850" },
! 600: { "ga", "CP850" },
! 601: { "ga_IE", "CP850" },
! 602: { "gd", "CP850" },
! 603: { "gd_GB", "CP850" },
! 604: { "gl", "CP850" },
! 605: { "gl_ES", "CP850" },
! 606: { "he", "CP862" },
! 607: { "he_IL", "CP862" },
! 608: { "hr", "CP852" },
! 609: { "hr_HR", "CP852" },
! 610: { "hu", "CP852" },
! 611: { "hu_HU", "CP852" },
! 612: { "id", "CP850" }, /* not CP437 ?? */
! 613: { "id_ID", "CP850" }, /* not CP437 ?? */
! 614: { "is", "CP861" }, /* not CP850 ?? */
! 615: { "is_IS", "CP861" }, /* not CP850 ?? */
! 616: { "it", "CP850" },
! 617: { "it_CH", "CP850" },
! 618: { "it_IT", "CP850" },
! 619: { "ja", "CP932" },
! 620: { "ja_JP", "CP932" },
! 621: { "kr", "CP949" }, /* not CP934 ?? */
! 622: { "kr_KR", "CP949" }, /* not CP934 ?? */
! 623: { "lt", "CP775" },
! 624: { "lt_LT", "CP775" },
! 625: { "lv", "CP775" },
! 626: { "lv_LV", "CP775" },
! 627: { "mk", "CP866" }, /* not CP855 ?? */
! 628: { "mk_MK", "CP866" }, /* not CP855 ?? */
! 629: { "mt", "CP850" },
! 630: { "mt_MT", "CP850" },
! 631: { "nb", "CP865" }, /* not CP850 ?? */
! 632: { "nb_NO", "CP865" }, /* not CP850 ?? */
! 633: { "nl", "CP850" },
! 634: { "nl_BE", "CP850" },
! 635: { "nl_NL", "CP850" },
! 636: { "nn", "CP865" }, /* not CP850 ?? */
! 637: { "nn_NO", "CP865" }, /* not CP850 ?? */
! 638: { "no", "CP865" }, /* not CP850 ?? */
! 639: { "no_NO", "CP865" }, /* not CP850 ?? */
! 640: { "pl", "CP852" },
! 641: { "pl_PL", "CP852" },
! 642: { "pt", "CP850" },
! 643: { "pt_BR", "CP850" },
! 644: { "pt_PT", "CP850" },
! 645: { "ro", "CP852" },
! 646: { "ro_RO", "CP852" },
! 647: { "ru", "CP866" },
! 648: { "ru_RU", "CP866" },
! 649: { "sk", "CP852" },
! 650: { "sk_SK", "CP852" },
! 651: { "sl", "CP852" },
! 652: { "sl_SI", "CP852" },
! 653: { "sq", "CP852" },
! 654: { "sq_AL", "CP852" },
! 655: { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */
! 656: { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
! 657: { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
! 658: { "sv", "CP850" },
! 659: { "sv_SE", "CP850" },
! 660: { "th", "CP874" },
! 661: { "th_TH", "CP874" },
! 662: { "tr", "CP857" },
! 663: { "tr_TR", "CP857" },
! 664: { "uk", "CP1125" },
! 665: { "uk_UA", "CP1125" },
! 666: { "zh_CN", "GBK" },
! 667: { "zh_TW", "CP950" } /* not CP938 ?? */
! 668: # define locale_table_defined
! 669: # endif
! 670: # ifndef locale_table_defined
! 671: /* Just a dummy entry, to avoid a C syntax error. */
! 672: { "", "" }
! 673: # endif
! 674: };
1.1 misho 675:
1.1.1.3 ! misho 676: #endif
1.1 misho 677:
678:
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in localcharset.h.

   The result is a NUL-terminated string and is never NULL.  It must not be
   freed or modified.  Depending on the platform and code path it is a string
   literal, a row of the mapping table, a static buffer inside this function
   (overwritten by a later call), a string owned by the C library, or -- on
   Cygwin and OS/2, when a locale environment variable carries an explicit
   ".codeset" suffix without an "@modifier" -- a pointer into that
   environment string itself.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;

#if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                return dot;
              /* modifier >= dot here, so the difference is non-negative;
                 the cast makes the comparison with sizeof unsigned on both
                 sides.  */
              if ((size_t) (modifier - dot) < sizeof (buf))
                {
                  memcpy (buf, dot, modifier - dot);
                  buf [modifier - dot] = '\0';
                  return buf;
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

# elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as
     a number, but the value doesn't change according to what the
     'setlocale' call specified.  So we use it as a last resort, in
     case the string returned by 'setlocale' doesn't specify the
     codepage.  */
  char *current_locale = setlocale (LC_ALL, NULL);
  char *pdot;

  /* If they set different locales for different categories,
     'setlocale' will return a semi-colon separated list of locale
     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  if (current_locale != NULL && strchr (current_locale, ';'))
    current_locale = setlocale (LC_CTYPE, NULL);

  /* Guard against a NULL result from setlocale; in that case fall back to
     GetACP() below instead of dereferencing a null pointer.  */
  pdot = (current_locale != NULL ? strrchr (current_locale, '.') : NULL);
  if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
    sprintf (buf, "CP%s", pdot + 1);
  else
    {
      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().
         When the output goes to a console window, it needs to be provided in
         GetOEMCP() encoding if the console is using a raster font, or in
         GetConsoleOutputCP() encoding if it is using a TrueType font.
         But in GUI programs and for output sent to files and pipes, GetACP()
         encoding is the best bet.  */
      sprintf (buf, "CP%u", GetACP ());
    }

  /* The UTF-8 code page may be reported by name rather than by number
     (e.g. a ".utf8" or ".UTF-8" suffix).  Prefixing such a name with "CP"
     yields a string that is in no table, so map these spellings -- and the
     numeric 65001 -- to the canonical name directly.  */
  if (strcmp (buf + 2, "65001") == 0
      || strcmp (buf + 2, "utf8") == 0
      || strcmp (buf + 2, "UTF-8") == 0)
    codeset = "UTF-8";
  else
    codeset = buf;

# elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  codeset = NULL;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier >= dot here; see the identical Cygwin code above.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* For the POSIX locale, don't use the system's codepage.  */
      if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
        codeset = "";
    }

  if (codeset == NULL)
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* ULONG is an unsigned long, so it must be printed with %lu;
             passing it to %u is a format/argument mismatch.  buf has room
             for "CP" plus 10 digits, i.e. any 32-bit value.  */
          sprintf (buf, "CP%lu", (unsigned long) cp[0]);
          codeset = buf;
        }
    }

# else

#  error "Add code for other platforms here."

# endif

  /* Resolve alias.  */
  {
    /* Set once codeset is known to be final, so that the fallback below is
       skipped.  */
    int resolved = 0;

# ifdef alias_table_defined
    /* On some platforms, UTF-8 locales are the most frequently used ones.
       Speed up the common case and slow down the less common cases by
       testing for this case first.  */
#  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
    if (strcmp (codeset, "UTF-8") == 0)
      resolved = 1;
    else
#  endif
      {
        /* The table is sorted.  Perform a binary search.  */
        size_t lo = 0;
        size_t hi = sizeof (alias_table) / sizeof (alias_table[0]);

        while (lo < hi)
          {
            /* Invariant:
               for i < lo, strcmp (alias_table[i].alias, codeset) < 0,
               for i >= hi, strcmp (alias_table[i].alias, codeset) > 0.  */
            size_t mid = lo + ((hi - lo) >> 1); /* >= lo, < hi */
            int cmp = strcmp (alias_table[mid].alias, codeset);

            if (cmp < 0)
              lo = mid + 1;
            else if (cmp > 0)
              hi = mid;
            else
              {
                /* Found the alias; use its canonical name.  */
                codeset = alias_table[mid].canonical;
                resolved = 1;
                break;
              }
          }
      }
# endif

    if (!resolved)
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        if (codeset[0] == '\0')
          codeset = "ASCII";
# endif
      }
  }

#else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like DJGPP) have only the C locale.  Therefore we don't use setlocale
     here; it would return "C" when it doesn't support the locale name the
     user has set.  */
# if 0
  locale = setlocale (LC_CTYPE, NULL);
# endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
          if (locale == NULL)
            locale = "";
        }
    }

  /* Map locale name to canonical encoding name.  */
  {
    /* Set once the locale name has been found in the table.  */
    int found = 0;

# ifdef locale_table_defined
    /* The table is sorted.  Perform a binary search.  */
    size_t lo = 0;
    size_t hi = sizeof (locale_table) / sizeof (locale_table[0]);

    while (lo < hi)
      {
        /* Invariant:
           for i < lo, strcmp (locale_table[i].locale, locale) < 0,
           for i >= hi, strcmp (locale_table[i].locale, locale) > 0.  */
        size_t mid = lo + ((hi - lo) >> 1); /* >= lo, < hi */
        int cmp = strcmp (locale_table[mid].locale, locale);

        if (cmp < 0)
          lo = mid + 1;
        else if (cmp > 0)
          hi = mid;
        else
          {
            /* Found the locale; use its canonical encoding name.  */
            codeset = locale_table[mid].canonical;
            found = 1;
            break;
          }
      }
# endif

    if (!found)
      {
        /* Did not find it in the table.  */
        /* On Mac OS X, all modern locales use the UTF-8 encoding.
           BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
# if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
        codeset = "UTF-8";
# else
        /* The canonical name cannot be determined.  */
        /* Don't return an empty string.  GNU libc and GNU libiconv interpret
           the empty string as denoting "the locale's character encoding",
           thus GNU libiconv would call this function a second time.  */
        codeset = "ASCII";
# endif
      }
  }

#endif

#ifdef DARWIN7
  /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
     (the default codeset) does not work when MB_CUR_MAX is 1.  */
  if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
    codeset = "ASCII";
#endif

  return codeset;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>