embedaddon/libiconv/libcharset/lib/localcharset.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libiconv / libcharset / lib / localcharset.c
Revision 1.1.1.3 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 13:38:46 2021 UTC (4 years, 3 months ago) by misho
Branches: libiconv, MAIN
CVS tags: v1_16p0, HEAD

libiconv 1.16

1: /* Determine a canonical name for the current locale's character encoding. 2: 3: Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc. 4: 5: This program is free software; you can redistribute it and/or modify it 6: under the terms of the GNU Library General Public License as published 7: by the Free Software Foundation; either version 2, or (at your option) 8: any later version. 9: 10: This program is distributed in the hope that it will be useful, 11: but WITHOUT ANY WARRANTY; without even the implied warranty of 12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13: Library General Public License for more details. 14: 15: You should have received a copy of the GNU Library General Public License 16: along with this program; if not, see <https://www.gnu.org/licenses/>. */ 17: 18: /* Written by Bruno Haible <bruno@clisp.org>. */ 19: 20: #include <config.h> 21: 22: /* Specification. */ 23: #include "localcharset.h" 24: 25: #include <stddef.h> 26: #include <stdio.h> 27: #include <string.h> 28: #include <stdlib.h> 29: 30: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET 31: # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */ 32: #endif 33: 34: #if defined _WIN32 && !defined __CYGWIN__ 35: # define WINDOWS_NATIVE 36: # include <locale.h> 37: #endif 38: 39: #if defined __EMX__ 40: /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 41: # ifndef OS2 42: # define OS2 43: # endif 44: #endif 45: 46: #if !defined WINDOWS_NATIVE 47: # if HAVE_LANGINFO_CODESET 48: # include <langinfo.h> 49: # else 50: # if 0 /* see comment regarding use of setlocale(), below */ 51: # include <locale.h> 52: # endif 53: # endif 54: # ifdef __CYGWIN__ 55: # define WIN32_LEAN_AND_MEAN 56: # include <windows.h> 57: # endif 58: #elif defined WINDOWS_NATIVE 59: # define WIN32_LEAN_AND_MEAN 60: # include <windows.h> 61: #endif 62: #if defined OS2 63: # define INCL_DOS 64: # include <os2.h> 65: #endif 66: 67: /* For MB_CUR_MAX_L */ 68: #if defined DARWIN7 69: # include <xlocale.h> 70: #endif 71: 72: 73: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 74: 75: /* On these platforms, we use a mapping from non-canonical encoding name 76: to GNU canonical encoding name. */ 77: 78: /* With glibc-2.1 or newer, we don't need any canonicalization, 79: because glibc has iconv and both glibc and libiconv support all 80: GNU canonical names directly. */ 81: # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__) 82: 83: struct table_entry 84: { 85: const char alias[11+1]; 86: const char canonical[11+1]; 87: }; 88: 89: /* Table of platform-dependent mappings, sorted in ascending order. */ 90: static const struct table_entry alias_table[] = 91: { 92: # if defined __FreeBSD__ /* FreeBSD */ 93: /*{ "ARMSCII-8", "ARMSCII-8" },*/ 94: { "Big5", "BIG5" }, 95: { "Big5HKSCS", "BIG5-HKSCS" }, 96: { "C", "ASCII" }, 97: /*{ "CP1131", "CP1131" },*/ 98: /*{ "CP1251", "CP1251" },*/ 99: /*{ "CP866", "CP866" },*/ 100: /*{ "GB18030", "GB18030" },*/ 101: /*{ "GB2312", "GB2312" },*/ 102: /*{ "GBK", "GBK" },*/ 103: /*{ "ISCII-DEV", "?" },*/ 104: { "ISO8859-1", "ISO-8859-1" }, 105: { "ISO8859-13", "ISO-8859-13" }, 106: { "ISO8859-15", "ISO-8859-15" }, 107: { "ISO8859-2", "ISO-8859-2" }, 108: { "ISO8859-4", "ISO-8859-4" }, 109: { "ISO8859-5", "ISO-8859-5" }, 110: { "ISO8859-7", "ISO-8859-7" }, 111: { "ISO8859-9", "ISO-8859-9" }, 112: /*{ "KOI8-R", "KOI8-R" },*/ 113: /*{ "KOI8-U", "KOI8-U" },*/ 114: { "SJIS", "SHIFT_JIS" }, 115: { "US-ASCII", "ASCII" }, 116: { "eucCN", "GB2312" }, 117: { "eucJP", "EUC-JP" }, 118: { "eucKR", "EUC-KR" } 119: # define alias_table_defined 120: # endif 121: # if defined __NetBSD__ /* NetBSD */ 122: { "646", "ASCII" }, 123: /*{ "ARMSCII-8", "ARMSCII-8" },*/ 124: /*{ "BIG5", "BIG5" },*/ 125: { "Big5-HKSCS", "BIG5-HKSCS" }, 126: /*{ "CP1251", "CP1251" },*/ 127: /*{ "CP866", "CP866" },*/ 128: /*{ "GB18030", "GB18030" },*/ 129: /*{ "GB2312", "GB2312" },*/ 130: { "ISO8859-1", "ISO-8859-1" }, 131: { "ISO8859-13", "ISO-8859-13" }, 132: { "ISO8859-15", "ISO-8859-15" }, 133: { "ISO8859-2", "ISO-8859-2" }, 134: { "ISO8859-4", "ISO-8859-4" }, 135: { "ISO8859-5", "ISO-8859-5" }, 136: { "ISO8859-7", "ISO-8859-7" }, 137: /*{ "KOI8-R", "KOI8-R" },*/ 138: /*{ "KOI8-U", "KOI8-U" },*/ 139: /*{ "PT154", "PT154" },*/ 140: { "SJIS", "SHIFT_JIS" }, 141: { "eucCN", "GB2312" }, 142: { "eucJP", "EUC-JP" }, 143: { "eucKR", "EUC-KR" }, 144: { "eucTW", "EUC-TW" } 145: # define alias_table_defined 146: # endif 147: # if defined __OpenBSD__ /* OpenBSD */ 148: { "646", "ASCII" }, 149: { "ISO8859-1", "ISO-8859-1" }, 150: { "ISO8859-13", "ISO-8859-13" }, 151: { "ISO8859-15", "ISO-8859-15" }, 152: { "ISO8859-2", "ISO-8859-2" }, 153: { "ISO8859-4", "ISO-8859-4" }, 154: { "ISO8859-5", "ISO-8859-5" }, 155: { "ISO8859-7", "ISO-8859-7" } 156: # define alias_table_defined 157: # endif 158: # if defined __APPLE__ && defined __MACH__ /* Mac OS X */ 159: /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is 160: useless: 161: - It returns the empty string when LANG is set to a locale of the 162: form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8 163: LC_CTYPE file. 164: - The environment variables LANG, LC_CTYPE, LC_ALL are not set by 165: the system; nl_langinfo(CODESET) returns "US-ASCII" in this case. 166: - The documentation says: 167: "... all code that calls BSD system routines should ensure 168: that the const *char parameters of these routines are in UTF-8 169: encoding. All BSD system functions expect their string 170: parameters to be in UTF-8 encoding and nothing else." 171: It also says 172: "An additional caveat is that string parameters for files, 173: paths, and other file-system entities must be in canonical 174: UTF-8. In a canonical UTF-8 Unicode string, all decomposable 175: characters are decomposed ..." 176: but this is not true: You can pass non-decomposed UTF-8 strings 177: to file system functions, and it is the OS which will convert 178: them to decomposed UTF-8 before accessing the file system. 179: - The Apple Terminal application displays UTF-8 by default. 180: - However, other applications are free to use different encodings: 181: - xterm uses ISO-8859-1 by default. 182: - TextEdit uses MacRoman by default. 183: We prefer UTF-8 over decomposed UTF-8-MAC because one should 184: minimize the use of decomposed Unicode. Unfortunately, through the 185: Darwin file system, decomposed UTF-8 strings are leaked into user 186: space nevertheless. 187: Then there are also the locales with encodings other than US-ASCII 188: and UTF-8. These locales can be occasionally useful to users (e.g. 189: when grepping through ISO-8859-1 encoded text files), when all their 190: file names are in US-ASCII. 191: */ 192: { "ARMSCII-8", "ARMSCII-8" }, 193: { "Big5", "BIG5" }, 194: { "Big5HKSCS", "BIG5-HKSCS" }, 195: { "CP1131", "CP1131" }, 196: { "CP1251", "CP1251" }, 197: { "CP866", "CP866" }, 198: { "CP949", "CP949" }, 199: { "GB18030", "GB18030" }, 200: { "GB2312", "GB2312" }, 201: { "GBK", "GBK" }, 202: /*{ "ISCII-DEV", "?" },*/ 203: { "ISO8859-1", "ISO-8859-1" }, 204: { "ISO8859-13", "ISO-8859-13" }, 205: { "ISO8859-15", "ISO-8859-15" }, 206: { "ISO8859-2", "ISO-8859-2" }, 207: { "ISO8859-4", "ISO-8859-4" }, 208: { "ISO8859-5", "ISO-8859-5" }, 209: { "ISO8859-7", "ISO-8859-7" }, 210: { "ISO8859-9", "ISO-8859-9" }, 211: { "KOI8-R", "KOI8-R" }, 212: { "KOI8-U", "KOI8-U" }, 213: { "PT154", "PT154" }, 214: { "SJIS", "SHIFT_JIS" }, 215: { "eucCN", "GB2312" }, 216: { "eucJP", "EUC-JP" }, 217: { "eucKR", "EUC-KR" } 218: # define alias_table_defined 219: # endif 220: # if defined _AIX /* AIX */ 221: /*{ "GBK", "GBK" },*/ 222: { "IBM-1046", "CP1046" }, 223: { "IBM-1124", "CP1124" }, 224: { "IBM-1129", "CP1129" }, 225: { "IBM-1252", "CP1252" }, 226: { "IBM-850", "CP850" }, 227: { "IBM-856", "CP856" }, 228: { "IBM-921", "ISO-8859-13" }, 229: { "IBM-922", "CP922" }, 230: { "IBM-932", "CP932" }, 231: { "IBM-943", "CP943" }, 232: { "IBM-eucCN", "GB2312" }, 233: { "IBM-eucJP", "EUC-JP" }, 234: { "IBM-eucKR", "EUC-KR" }, 235: { "IBM-eucTW", "EUC-TW" }, 236: { "ISO8859-1", "ISO-8859-1" }, 237: { "ISO8859-15", "ISO-8859-15" }, 238: { "ISO8859-2", "ISO-8859-2" }, 239: { "ISO8859-5", "ISO-8859-5" }, 240: { "ISO8859-6", "ISO-8859-6" }, 241: { "ISO8859-7", "ISO-8859-7" }, 242: { "ISO8859-8", "ISO-8859-8" }, 243: { "ISO8859-9", "ISO-8859-9" }, 244: { "TIS-620", "TIS-620" }, 245: /*{ "UTF-8", "UTF-8" },*/ 246: { "big5", "BIG5" } 247: # define alias_table_defined 248: # endif 249: # if defined __hpux /* HP-UX */ 250: { "SJIS", "SHIFT_JIS" }, 251: { "arabic8", "HP-ARABIC8" }, 252: { "big5", "BIG5" }, 253: { "cp1251", "CP1251" }, 254: { "eucJP", "EUC-JP" }, 255: { "eucKR", "EUC-KR" }, 256: { "eucTW", "EUC-TW" }, 257: { "gb18030", "GB18030" }, 258: { "greek8", "HP-GREEK8" }, 259: { "hebrew8", "HP-HEBREW8" }, 260: { "hkbig5", "BIG5-HKSCS" }, 261: { "hp15CN", "GB2312" }, 262: { "iso88591", "ISO-8859-1" }, 263: { "iso885913", "ISO-8859-13" }, 264: { "iso885915", "ISO-8859-15" }, 265: { "iso88592", "ISO-8859-2" }, 266: { "iso88594", "ISO-8859-4" }, 267: { "iso88595", "ISO-8859-5" }, 268: { "iso88596", "ISO-8859-6" }, 269: { "iso88597", "ISO-8859-7" }, 270: { "iso88598", "ISO-8859-8" }, 271: { "iso88599", "ISO-8859-9" }, 272: { "kana8", "HP-KANA8" }, 273: { "koi8r", "KOI8-R" }, 274: { "roman8", "HP-ROMAN8" }, 275: { "tis620", "TIS-620" }, 276: { "turkish8", "HP-TURKISH8" }, 277: { "utf8", "UTF-8" } 278: # define alias_table_defined 279: # endif 280: # if defined __sgi /* IRIX */ 281: { "ISO8859-1", "ISO-8859-1" }, 282: { "ISO8859-15", "ISO-8859-15" }, 283: { "ISO8859-2", "ISO-8859-2" }, 284: { "ISO8859-5", "ISO-8859-5" }, 285: { "ISO8859-7", "ISO-8859-7" }, 286: { "ISO8859-9", "ISO-8859-9" }, 287: { "eucCN", "GB2312" }, 288: { "eucJP", "EUC-JP" }, 289: { "eucKR", "EUC-KR" }, 290: { "eucTW", "EUC-TW" } 291: # define alias_table_defined 292: # endif 293: # if defined __osf__ /* OSF/1 */ 294: /*{ "GBK", "GBK" },*/ 295: { "ISO8859-1", "ISO-8859-1" }, 296: { "ISO8859-15", "ISO-8859-15" }, 297: { "ISO8859-2", "ISO-8859-2" }, 298: { "ISO8859-4", "ISO-8859-4" }, 299: { "ISO8859-5", "ISO-8859-5" }, 300: { "ISO8859-7", "ISO-8859-7" }, 301: { "ISO8859-8", "ISO-8859-8" }, 302: { "ISO8859-9", "ISO-8859-9" }, 303: { "KSC5601", "CP949" }, 304: { "SJIS", "SHIFT_JIS" }, 305: { "TACTIS", "TIS-620" }, 306: /*{ "UTF-8", "UTF-8" },*/ 307: { "big5", "BIG5" }, 308: { "cp850", "CP850" }, 309: { "dechanyu", "DEC-HANYU" }, 310: { "dechanzi", "GB2312" }, 311: { "deckanji", "DEC-KANJI" }, 312: { "deckorean", "EUC-KR" }, 313: { "eucJP", "EUC-JP" }, 314: { "eucKR", "EUC-KR" }, 315: { "eucTW", "EUC-TW" }, 316: { "sdeckanji", "EUC-JP" } 317: # define alias_table_defined 318: # endif 319: # if defined __sun /* Solaris */ 320: { "5601", "EUC-KR" }, 321: { "646", "ASCII" }, 322: /*{ "BIG5", "BIG5" },*/ 323: { "Big5-HKSCS", "BIG5-HKSCS" }, 324: { "GB18030", "GB18030" }, 325: /*{ "GBK", "GBK" },*/ 326: { "ISO8859-1", "ISO-8859-1" }, 327: { "ISO8859-11", "TIS-620" }, 328: { "ISO8859-13", "ISO-8859-13" }, 329: { "ISO8859-15", "ISO-8859-15" }, 330: { "ISO8859-2", "ISO-8859-2" }, 331: { "ISO8859-3", "ISO-8859-3" }, 332: { "ISO8859-4", "ISO-8859-4" }, 333: { "ISO8859-5", "ISO-8859-5" }, 334: { "ISO8859-6", "ISO-8859-6" }, 335: { "ISO8859-7", "ISO-8859-7" }, 336: { "ISO8859-8", "ISO-8859-8" }, 337: { "ISO8859-9", "ISO-8859-9" }, 338: { "PCK", "SHIFT_JIS" }, 339: { "TIS620.2533", "TIS-620" }, 340: /*{ "UTF-8", "UTF-8" },*/ 341: { "ansi-1251", "CP1251" }, 342: { "cns11643", "EUC-TW" }, 343: { "eucJP", "EUC-JP" }, 344: { "gb2312", "GB2312" }, 345: { "koi8-r", "KOI8-R" } 346: # define alias_table_defined 347: # endif 348: # if defined __minix /* Minix */ 349: { "646", "ASCII" } 350: # define alias_table_defined 351: # endif 352: # if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Windows */ 353: { "CP1361", "JOHAB" }, 354: { "CP20127", "ASCII" }, 355: { "CP20866", "KOI8-R" }, 356: { "CP20936", "GB2312" }, 357: { "CP21866", "KOI8-RU" }, 358: { "CP28591", "ISO-8859-1" }, 359: { "CP28592", "ISO-8859-2" }, 360: { "CP28593", "ISO-8859-3" }, 361: { "CP28594", "ISO-8859-4" }, 362: { "CP28595", "ISO-8859-5" }, 363: { "CP28596", "ISO-8859-6" }, 364: { "CP28597", "ISO-8859-7" }, 365: { "CP28598", "ISO-8859-8" }, 366: { "CP28599", "ISO-8859-9" }, 367: { "CP28605", "ISO-8859-15" }, 368: { "CP38598", "ISO-8859-8" }, 369: { "CP51932", "EUC-JP" }, 370: { "CP51936", "GB2312" }, 371: { "CP51949", "EUC-KR" }, 372: { "CP51950", "EUC-TW" }, 373: { "CP54936", "GB18030" }, 374: { "CP65001", "UTF-8" }, 375: { "CP936", "GBK" } 376: # define alias_table_defined 377: # endif 378: # if defined OS2 /* OS/2 */ 379: /* The list of encodings is taken from "List of OS/2 Codepages" 380: by Alex Taylor: 381: <http://altsan.org/os2/toolkits/uls/index.html#codepages>. 382: See also "IBM Globalization - Code page identifiers": 383: <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>. */ 384: { "CP1089", "ISO-8859-6" }, 385: { "CP1208", "UTF-8" }, 386: { "CP1381", "GB2312" }, 387: { "CP1386", "GBK" }, 388: { "CP3372", "EUC-JP" }, 389: { "CP813", "ISO-8859-7" }, 390: { "CP819", "ISO-8859-1" }, 391: { "CP878", "KOI8-R" }, 392: { "CP912", "ISO-8859-2" }, 393: { "CP913", "ISO-8859-3" }, 394: { "CP914", "ISO-8859-4" }, 395: { "CP915", "ISO-8859-5" }, 396: { "CP916", "ISO-8859-8" }, 397: { "CP920", "ISO-8859-9" }, 398: { "CP921", "ISO-8859-13" }, 399: { "CP923", "ISO-8859-15" }, 400: { "CP954", "EUC-JP" }, 401: { "CP964", "EUC-TW" }, 402: { "CP970", "EUC-KR" } 403: # define alias_table_defined 404: # endif 405: # if defined VMS /* OpenVMS */ 406: /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 407: "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 408: section 10.7 "Handling Different Character Sets". */ 409: { "DECHANYU", "DEC-HANYU" }, 410: { "DECHANZI", "GB2312" }, 411: { "DECKANJI", "DEC-KANJI" }, 412: { "DECKOREAN", "EUC-KR" }, 413: { "ISO8859-1", "ISO-8859-1" }, 414: { "ISO8859-2", "ISO-8859-2" }, 415: { "ISO8859-5", "ISO-8859-5" }, 416: { "ISO8859-7", "ISO-8859-7" }, 417: { "ISO8859-8", "ISO-8859-8" }, 418: { "ISO8859-9", "ISO-8859-9" }, 419: { "SDECKANJI", "EUC-JP" }, 420: { "SJIS", "SHIFT_JIS" }, 421: { "eucJP", "EUC-JP" }, 422: { "eucTW", "EUC-TW" } 423: # define alias_table_defined 424: # endif 425: # ifndef alias_table_defined 426: /* Just a dummy entry, to avoid a C syntax error. */ 427: { "", "" } 428: # endif 429: }; 430: 431: # endif 432: 433: #else 434: 435: /* On these platforms, we use a mapping from locale name to GNU canonical 436: encoding name. */ 437: 438: struct table_entry 439: { 440: const char locale[17+1]; 441: const char canonical[11+1]; 442: }; 443: 444: /* Table of platform-dependent mappings, sorted in ascending order. */ 445: static const struct table_entry locale_table[] = 446: { 447: # if defined __FreeBSD__ /* FreeBSD 4.2 */ 448: { "cs_CZ.ISO_8859-2", "ISO-8859-2" }, 449: { "da_DK.DIS_8859-15", "ISO-8859-15" }, 450: { "da_DK.ISO_8859-1", "ISO-8859-1" }, 451: { "de_AT.DIS_8859-15", "ISO-8859-15" }, 452: { "de_AT.ISO_8859-1", "ISO-8859-1" }, 453: { "de_CH.DIS_8859-15", "ISO-8859-15" }, 454: { "de_CH.ISO_8859-1", "ISO-8859-1" }, 455: { "de_DE.DIS_8859-15", "ISO-8859-15" }, 456: { "de_DE.ISO_8859-1", "ISO-8859-1" }, 457: { "en_AU.DIS_8859-15", "ISO-8859-15" }, 458: { "en_AU.ISO_8859-1", "ISO-8859-1" }, 459: { "en_CA.DIS_8859-15", "ISO-8859-15" }, 460: { "en_CA.ISO_8859-1", "ISO-8859-1" }, 461: { "en_GB.DIS_8859-15", "ISO-8859-15" }, 462: { "en_GB.ISO_8859-1", "ISO-8859-1" }, 463: { "en_US.DIS_8859-15", "ISO-8859-15" }, 464: { "en_US.ISO_8859-1", "ISO-8859-1" }, 465: { "es_ES.DIS_8859-15", "ISO-8859-15" }, 466: { "es_ES.ISO_8859-1", "ISO-8859-1" }, 467: { "fi_FI.DIS_8859-15", "ISO-8859-15" }, 468: { "fi_FI.ISO_8859-1", "ISO-8859-1" }, 469: { "fr_BE.DIS_8859-15", "ISO-8859-15" }, 470: { "fr_BE.ISO_8859-1", "ISO-8859-1" }, 471: { "fr_CA.DIS_8859-15", "ISO-8859-15" }, 472: { "fr_CA.ISO_8859-1", "ISO-8859-1" }, 473: { "fr_CH.DIS_8859-15", "ISO-8859-15" }, 474: { "fr_CH.ISO_8859-1", "ISO-8859-1" }, 475: { "fr_FR.DIS_8859-15", "ISO-8859-15" }, 476: { "fr_FR.ISO_8859-1", "ISO-8859-1" }, 477: { "hr_HR.ISO_8859-2", "ISO-8859-2" }, 478: { "hu_HU.ISO_8859-2", "ISO-8859-2" }, 479: { "is_IS.DIS_8859-15", "ISO-8859-15" }, 480: { "is_IS.ISO_8859-1", "ISO-8859-1" }, 481: { "it_CH.DIS_8859-15", "ISO-8859-15" }, 482: { "it_CH.ISO_8859-1", "ISO-8859-1" }, 483: { "it_IT.DIS_8859-15", "ISO-8859-15" }, 484: { "it_IT.ISO_8859-1", "ISO-8859-1" }, 485: { "ja_JP.EUC", "EUC-JP" }, 486: { "ja_JP.SJIS", "SHIFT_JIS" }, 487: { "ja_JP.Shift_JIS", "SHIFT_JIS" }, 488: { "ko_KR.EUC", "EUC-KR" }, 489: { "la_LN.ASCII", "ASCII" }, 490: { "la_LN.DIS_8859-15", "ISO-8859-15" }, 491: { "la_LN.ISO_8859-1", "ISO-8859-1" }, 492: { "la_LN.ISO_8859-2", "ISO-8859-2" }, 493: { "la_LN.ISO_8859-4", "ISO-8859-4" }, 494: { "lt_LN.ASCII", "ASCII" }, 495: { "lt_LN.DIS_8859-15", "ISO-8859-15" }, 496: { "lt_LN.ISO_8859-1", "ISO-8859-1" }, 497: { "lt_LN.ISO_8859-2", "ISO-8859-2" }, 498: { "lt_LT.ISO_8859-4", "ISO-8859-4" }, 499: { "nl_BE.DIS_8859-15", "ISO-8859-15" }, 500: { "nl_BE.ISO_8859-1", "ISO-8859-1" }, 501: { "nl_NL.DIS_8859-15", "ISO-8859-15" }, 502: { "nl_NL.ISO_8859-1", "ISO-8859-1" }, 503: { "no_NO.DIS_8859-15", "ISO-8859-15" }, 504: { "no_NO.ISO_8859-1", "ISO-8859-1" }, 505: { "pl_PL.ISO_8859-2", "ISO-8859-2" }, 506: { "pt_PT.DIS_8859-15", "ISO-8859-15" }, 507: { "pt_PT.ISO_8859-1", "ISO-8859-1" }, 508: { "ru_RU.CP866", "CP866" }, 509: { "ru_RU.ISO_8859-5", "ISO-8859-5" }, 510: { "ru_RU.KOI8-R", "KOI8-R" }, 511: { "ru_SU.CP866", "CP866" }, 512: { "ru_SU.ISO_8859-5", "ISO-8859-5" }, 513: { "ru_SU.KOI8-R", "KOI8-R" }, 514: { "sl_SI.ISO_8859-2", "ISO-8859-2" }, 515: { "sv_SE.DIS_8859-15", "ISO-8859-15" }, 516: { "sv_SE.ISO_8859-1", "ISO-8859-1" }, 517: { "uk_UA.KOI8-U", "KOI8-U" }, 518: { "zh_CN.EUC", "GB2312" }, 519: { "zh_TW.BIG5", "BIG5" }, 520: { "zh_TW.Big5", "BIG5" } 521: # define locale_table_defined 522: # endif 523: # if defined __DJGPP__ /* DOS / DJGPP 2.03 */ 524: /* The encodings given here may not all be correct. 525: If you find that the encoding given for your language and 526: country is not the one your DOS machine actually uses, just 527: correct it in this file, and send a mail to 528: Juan Manuel Guerrero <juan.guerrero@gmx.de> 529: and <bug-gnulib@gnu.org>. */ 530: { "C", "ASCII" }, 531: { "ar", "CP864" }, 532: { "ar_AE", "CP864" }, 533: { "ar_DZ", "CP864" }, 534: { "ar_EG", "CP864" }, 535: { "ar_IQ", "CP864" }, 536: { "ar_IR", "CP864" }, 537: { "ar_JO", "CP864" }, 538: { "ar_KW", "CP864" }, 539: { "ar_MA", "CP864" }, 540: { "ar_OM", "CP864" }, 541: { "ar_QA", "CP864" }, 542: { "ar_SA", "CP864" }, 543: { "ar_SY", "CP864" }, 544: { "be", "CP866" }, 545: { "be_BE", "CP866" }, 546: { "bg", "CP866" }, /* not CP855 ?? */ 547: { "bg_BG", "CP866" }, /* not CP855 ?? */ 548: { "ca", "CP850" }, 549: { "ca_ES", "CP850" }, 550: { "cs", "CP852" }, 551: { "cs_CZ", "CP852" }, 552: { "da", "CP865" }, /* not CP850 ?? */ 553: { "da_DK", "CP865" }, /* not CP850 ?? */ 554: { "de", "CP850" }, 555: { "de_AT", "CP850" }, 556: { "de_CH", "CP850" }, 557: { "de_DE", "CP850" }, 558: { "el", "CP869" }, 559: { "el_GR", "CP869" }, 560: { "en", "CP850" }, 561: { "en_AU", "CP850" }, /* not CP437 ?? */ 562: { "en_CA", "CP850" }, 563: { "en_GB", "CP850" }, 564: { "en_NZ", "CP437" }, 565: { "en_US", "CP437" }, 566: { "en_ZA", "CP850" }, /* not CP437 ?? */ 567: { "eo", "CP850" }, 568: { "eo_EO", "CP850" }, 569: { "es", "CP850" }, 570: { "es_AR", "CP850" }, 571: { "es_BO", "CP850" }, 572: { "es_CL", "CP850" }, 573: { "es_CO", "CP850" }, 574: { "es_CR", "CP850" }, 575: { "es_CU", "CP850" }, 576: { "es_DO", "CP850" }, 577: { "es_EC", "CP850" }, 578: { "es_ES", "CP850" }, 579: { "es_GT", "CP850" }, 580: { "es_HN", "CP850" }, 581: { "es_MX", "CP850" }, 582: { "es_NI", "CP850" }, 583: { "es_PA", "CP850" }, 584: { "es_PE", "CP850" }, 585: { "es_PY", "CP850" }, 586: { "es_SV", "CP850" }, 587: { "es_UY", "CP850" }, 588: { "es_VE", "CP850" }, 589: { "et", "CP850" }, 590: { "et_EE", "CP850" }, 591: { "eu", "CP850" }, 592: { "eu_ES", "CP850" }, 593: { "fi", "CP850" }, 594: { "fi_FI", "CP850" }, 595: { "fr", "CP850" }, 596: { "fr_BE", "CP850" }, 597: { "fr_CA", "CP850" }, 598: { "fr_CH", "CP850" }, 599: { "fr_FR", "CP850" }, 600: { "ga", "CP850" }, 601: { "ga_IE", "CP850" }, 602: { "gd", "CP850" }, 603: { "gd_GB", "CP850" }, 604: { "gl", "CP850" }, 605: { "gl_ES", "CP850" }, 606: { "he", "CP862" }, 607: { "he_IL", "CP862" }, 608: { "hr", "CP852" }, 609: { "hr_HR", "CP852" }, 610: { "hu", "CP852" }, 611: { "hu_HU", "CP852" }, 612: { "id", "CP850" }, /* not CP437 ?? */ 613: { "id_ID", "CP850" }, /* not CP437 ?? */ 614: { "is", "CP861" }, /* not CP850 ?? */ 615: { "is_IS", "CP861" }, /* not CP850 ?? */ 616: { "it", "CP850" }, 617: { "it_CH", "CP850" }, 618: { "it_IT", "CP850" }, 619: { "ja", "CP932" }, 620: { "ja_JP", "CP932" }, 621: { "kr", "CP949" }, /* not CP934 ?? */ 622: { "kr_KR", "CP949" }, /* not CP934 ?? */ 623: { "lt", "CP775" }, 624: { "lt_LT", "CP775" }, 625: { "lv", "CP775" }, 626: { "lv_LV", "CP775" }, 627: { "mk", "CP866" }, /* not CP855 ?? */ 628: { "mk_MK", "CP866" }, /* not CP855 ?? */ 629: { "mt", "CP850" }, 630: { "mt_MT", "CP850" }, 631: { "nb", "CP865" }, /* not CP850 ?? */ 632: { "nb_NO", "CP865" }, /* not CP850 ?? */ 633: { "nl", "CP850" }, 634: { "nl_BE", "CP850" }, 635: { "nl_NL", "CP850" }, 636: { "nn", "CP865" }, /* not CP850 ?? */ 637: { "nn_NO", "CP865" }, /* not CP850 ?? */ 638: { "no", "CP865" }, /* not CP850 ?? */ 639: { "no_NO", "CP865" }, /* not CP850 ?? */ 640: { "pl", "CP852" }, 641: { "pl_PL", "CP852" }, 642: { "pt", "CP850" }, 643: { "pt_BR", "CP850" }, 644: { "pt_PT", "CP850" }, 645: { "ro", "CP852" }, 646: { "ro_RO", "CP852" }, 647: { "ru", "CP866" }, 648: { "ru_RU", "CP866" }, 649: { "sk", "CP852" }, 650: { "sk_SK", "CP852" }, 651: { "sl", "CP852" }, 652: { "sl_SI", "CP852" }, 653: { "sq", "CP852" }, 654: { "sq_AL", "CP852" }, 655: { "sr", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 656: { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 657: { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */ 658: { "sv", "CP850" }, 659: { "sv_SE", "CP850" }, 660: { "th", "CP874" }, 661: { "th_TH", "CP874" }, 662: { "tr", "CP857" }, 663: { "tr_TR", "CP857" }, 664: { "uk", "CP1125" }, 665: { "uk_UA", "CP1125" }, 666: { "zh_CN", "GBK" }, 667: { "zh_TW", "CP950" } /* not CP938 ?? */ 668: # define locale_table_defined 669: # endif 670: # ifndef locale_table_defined 671: /* Just a dummy entry, to avoid a C syntax error. */ 672: { "", "" } 673: # endif 674: }; 675: 676: #endif 677: 678: 679: /* Determine the current locale's character encoding, and canonicalize it 680: into one of the canonical names listed in localcharset.h. 681: The result must not be freed; it is statically allocated. 682: If the canonical name cannot be determined, the result is a non-canonical 683: name. */ 684: 685: #ifdef STATIC 686: STATIC 687: #endif 688: const char * 689: locale_charset (void) 690: { 691: const char *codeset; 692: 693: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2 694: 695: # if HAVE_LANGINFO_CODESET 696: 697: /* Most systems support nl_langinfo (CODESET) nowadays. */ 698: codeset = nl_langinfo (CODESET); 699: 700: # ifdef __CYGWIN__ 701: /* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always 702: returns "US-ASCII". Return the suffix of the locale name from the 703: environment variables (if present) or the codepage as a number. */ 704: if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 705: { 706: const char *locale; 707: static char buf[2 + 10 + 1]; 708: 709: locale = getenv ("LC_ALL"); 710: if (locale == NULL || locale[0] == '\0') 711: { 712: locale = getenv ("LC_CTYPE"); 713: if (locale == NULL || locale[0] == '\0') 714: locale = getenv ("LANG"); 715: } 716: if (locale != NULL && locale[0] != '\0') 717: { 718: /* If the locale name contains an encoding after the dot, return 719: it. */ 720: const char *dot = strchr (locale, '.'); 721: 722: if (dot != NULL) 723: { 724: const char *modifier; 725: 726: dot++; 727: /* Look for the possible @... trailer and remove it, if any. */ 728: modifier = strchr (dot, '@'); 729: if (modifier == NULL) 730: return dot; 731: if (modifier - dot < sizeof (buf)) 732: { 733: memcpy (buf, dot, modifier - dot); 734: buf [modifier - dot] = '\0'; 735: return buf; 736: } 737: } 738: } 739: 740: /* The Windows API has a function returning the locale's codepage as a 741: number: GetACP(). This encoding is used by Cygwin, unless the user 742: has set the environment variable CYGWIN=codepage:oem (which very few 743: people do). 744: Output directed to console windows needs to be converted (to 745: GetOEMCP() if the console is using a raster font, or to 746: GetConsoleOutputCP() if it is using a TrueType font). Cygwin does 747: this conversion transparently (see winsup/cygwin/fhandler_console.cc), 748: converting to GetConsoleOutputCP(). This leads to correct results, 749: except when SetConsoleOutputCP has been called and a raster font is 750: in use. */ 751: sprintf (buf, "CP%u", GetACP ()); 752: codeset = buf; 753: } 754: # endif 755: 756: if (codeset == NULL) 757: /* The canonical name cannot be determined. */ 758: codeset = ""; 759: 760: # elif defined WINDOWS_NATIVE 761: 762: static char buf[2 + 10 + 1]; 763: 764: /* The Windows API has a function returning the locale's codepage as 765: a number, but the value doesn't change according to what the 766: 'setlocale' call specified. So we use it as a last resort, in 767: case the string returned by 'setlocale' doesn't specify the 768: codepage. */ 769: char *current_locale = setlocale (LC_ALL, NULL); 770: char *pdot; 771: 772: /* If they set different locales for different categories, 773: 'setlocale' will return a semi-colon separated list of locale 774: values. To make sure we use the correct one, we choose LC_CTYPE. */ 775: if (strchr (current_locale, ';')) 776: current_locale = setlocale (LC_CTYPE, NULL); 777: 778: pdot = strrchr (current_locale, '.'); 779: if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf)) 780: sprintf (buf, "CP%s", pdot + 1); 781: else 782: { 783: /* The Windows API has a function returning the locale's codepage as a 784: number: GetACP(). 785: When the output goes to a console window, it needs to be provided in 786: GetOEMCP() encoding if the console is using a raster font, or in 787: GetConsoleOutputCP() encoding if it is using a TrueType font. 788: But in GUI programs and for output sent to files and pipes, GetACP() 789: encoding is the best bet. */ 790: sprintf (buf, "CP%u", GetACP ()); 791: } 792: codeset = buf; 793: 794: # elif defined OS2 795: 796: const char *locale; 797: static char buf[2 + 10 + 1]; 798: ULONG cp[3]; 799: ULONG cplen; 800: 801: codeset = NULL; 802: 803: /* Allow user to override the codeset, as set in the operating system, 804: with standard language environment variables. */ 805: locale = getenv ("LC_ALL"); 806: if (locale == NULL || locale[0] == '\0') 807: { 808: locale = getenv ("LC_CTYPE"); 809: if (locale == NULL || locale[0] == '\0') 810: locale = getenv ("LANG"); 811: } 812: if (locale != NULL && locale[0] != '\0') 813: { 814: /* If the locale name contains an encoding after the dot, return it. */ 815: const char *dot = strchr (locale, '.'); 816: 817: if (dot != NULL) 818: { 819: const char *modifier; 820: 821: dot++; 822: /* Look for the possible @... trailer and remove it, if any. */ 823: modifier = strchr (dot, '@'); 824: if (modifier == NULL) 825: return dot; 826: if (modifier - dot < sizeof (buf)) 827: { 828: memcpy (buf, dot, modifier - dot); 829: buf [modifier - dot] = '\0'; 830: return buf; 831: } 832: } 833: 834: /* For the POSIX locale, don't use the system's codepage. */ 835: if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0) 836: codeset = ""; 837: } 838: 839: if (codeset == NULL) 840: { 841: /* OS/2 has a function returning the locale's codepage as a number. */ 842: if (DosQueryCp (sizeof (cp), cp, &cplen)) 843: codeset = ""; 844: else 845: { 846: sprintf (buf, "CP%u", cp[0]); 847: codeset = buf; 848: } 849: } 850: 851: # else 852: 853: # error "Add code for other platforms here." 854: 855: # endif 856: 857: /* Resolve alias. */ 858: { 859: # ifdef alias_table_defined 860: /* On some platforms, UTF-8 locales are the most frequently used ones. 861: Speed up the common case and slow down the less common cases by 862: testing for this case first. */ 863: # if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__ 864: if (strcmp (codeset, "UTF-8") == 0) 865: goto done_table_lookup; 866: else 867: # endif 868: { 869: const struct table_entry * const table = alias_table; 870: size_t const table_size = 871: sizeof (alias_table) / sizeof (struct table_entry); 872: /* The table is sorted. Perform a binary search. */ 873: size_t hi = table_size; 874: size_t lo = 0; 875: while (lo < hi) 876: { 877: /* Invariant: 878: for i < lo, strcmp (table[i].alias, codeset) < 0, 879: for i >= hi, strcmp (table[i].alias, codeset) > 0. */ 880: size_t mid = (hi + lo) >> 1; /* >= lo, < hi */ 881: int cmp = strcmp (table[mid].alias, codeset); 882: if (cmp < 0) 883: lo = mid + 1; 884: else if (cmp > 0) 885: hi = mid; 886: else 887: { 888: /* Found an i with 889: strcmp (table[i].alias, codeset) == 0. */ 890: codeset = table[mid].canonical; 891: goto done_table_lookup; 892: } 893: } 894: } 895: if (0) 896: done_table_lookup: ; 897: else 898: # endif 899: { 900: /* Did not find it in the table. */ 901: /* On Mac OS X, all modern locales use the UTF-8 encoding. 902: BeOS and Haiku have a single locale, and it has UTF-8 encoding. */ 903: # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__ 904: codeset = "UTF-8"; 905: # else 906: /* Don't return an empty string. GNU libc and GNU libiconv interpret 907: the empty string as denoting "the locale's character encoding", 908: thus GNU libiconv would call this function a second time. */ 909: if (codeset[0] == '\0') 910: codeset = "ASCII"; 911: # endif 912: } 913: } 914: 915: #else 916: 917: /* On old systems which lack it, use setlocale or getenv. */ 918: const char *locale = NULL; 919: 920: /* But most old systems don't have a complete set of locales. Some 921: (like DJGPP) have only the C locale. Therefore we don't use setlocale 922: here; it would return "C" when it doesn't support the locale name the 923: user has set. */ 924: # if 0 925: locale = setlocale (LC_CTYPE, NULL); 926: # endif 927: if (locale == NULL || locale[0] == '\0') 928: { 929: locale = getenv ("LC_ALL"); 930: if (locale == NULL || locale[0] == '\0') 931: { 932: locale = getenv ("LC_CTYPE"); 933: if (locale == NULL || locale[0] == '\0') 934: locale = getenv ("LANG"); 935: if (locale == NULL) 936: locale = ""; 937: } 938: } 939: 940: /* Map locale name to canonical encoding name. */ 941: { 942: # ifdef locale_table_defined 943: const struct table_entry * const table = locale_table; 944: size_t const table_size = 945: sizeof (locale_table) / sizeof (struct table_entry); 946: /* The table is sorted. Perform a binary search. */ 947: size_t hi = table_size; 948: size_t lo = 0; 949: while (lo < hi) 950: { 951: /* Invariant: 952: for i < lo, strcmp (table[i].locale, locale) < 0, 953: for i >= hi, strcmp (table[i].locale, locale) > 0. */ 954: size_t mid = (hi + lo) >> 1; /* >= lo, < hi */ 955: int cmp = strcmp (table[mid].locale, locale); 956: if (cmp < 0) 957: lo = mid + 1; 958: else if (cmp > 0) 959: hi = mid; 960: else 961: { 962: /* Found an i with 963: strcmp (table[i].locale, locale) == 0. */ 964: codeset = table[mid].canonical; 965: goto done_table_lookup; 966: } 967: } 968: if (0) 969: done_table_lookup: ; 970: else 971: # endif 972: { 973: /* Did not find it in the table. */ 974: /* On Mac OS X, all modern locales use the UTF-8 encoding. 975: BeOS and Haiku have a single locale, and it has UTF-8 encoding. */ 976: # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__ 977: codeset = "UTF-8"; 978: # else 979: /* The canonical name cannot be determined. */ 980: /* Don't return an empty string. GNU libc and GNU libiconv interpret 981: the empty string as denoting "the locale's character encoding", 982: thus GNU libiconv would call this function a second time. */ 983: codeset = "ASCII"; 984: # endif 985: } 986: } 987: 988: #endif 989: 990: #ifdef DARWIN7 991: /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8" 992: (the default codeset) does not work when MB_CUR_MAX is 1. */ 993: if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1) 994: codeset = "ASCII"; 995: #endif 996: 997: return codeset; 998: }