Return to localcharset.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / libiconv / libcharset / lib |
1.1 ! misho 1: /* Determine a canonical name for the current locale's character encoding. ! 2: ! 3: Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc. ! 4: ! 5: This program is free software; you can redistribute it and/or modify it ! 6: under the terms of the GNU Library General Public License as published ! 7: by the Free Software Foundation; either version 2, or (at your option) ! 8: any later version. ! 9: ! 10: This program is distributed in the hope that it will be useful, ! 11: but WITHOUT ANY WARRANTY; without even the implied warranty of ! 12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ! 13: Library General Public License for more details. ! 14: ! 15: You should have received a copy of the GNU Library General Public ! 16: License along with this program; if not, write to the Free Software ! 17: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, ! 18: USA. */ ! 19: ! 20: /* Written by Bruno Haible <bruno@clisp.org>. */ ! 21: ! 22: #include <config.h> ! 23: ! 24: /* Specification. */ ! 25: #include "localcharset.h" ! 26: ! 27: #include <stddef.h> ! 28: #include <stdio.h> ! 29: #include <string.h> ! 30: #include <stdlib.h> ! 31: ! 32: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET ! 33: # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */ ! 34: #endif ! 35: ! 36: #if defined _WIN32 || defined __WIN32__ ! 37: # define WIN32_NATIVE ! 38: #endif ! 39: ! 40: #if defined __EMX__ ! 41: /* Assume EMX program runs on OS/2, even if compiled under DOS. */ ! 42: # ifndef OS2 ! 43: # define OS2 ! 44: # endif ! 45: #endif ! 46: ! 47: #if !defined WIN32_NATIVE ! 48: # if HAVE_LANGINFO_CODESET ! 49: # include <langinfo.h> ! 50: # else ! 51: # if 0 /* see comment below */ ! 52: # include <locale.h> ! 53: # endif ! 54: # endif ! 55: # ifdef __CYGWIN__ ! 56: # define WIN32_LEAN_AND_MEAN ! 57: # include <windows.h> ! 58: # endif ! 59: #elif defined WIN32_NATIVE ! 60: # define WIN32_LEAN_AND_MEAN ! 61: # include <windows.h> ! 62: #endif ! 63: #if defined OS2 ! 64: # define INCL_DOS ! 65: # include <os2.h> ! 66: #endif ! 67: ! 68: #if ENABLE_RELOCATABLE ! 69: # include "relocatable.h" ! 70: #else ! 71: # define relocate(pathname) (pathname) ! 72: #endif ! 73: ! 74: /* Get LIBDIR. */ ! 75: #ifndef LIBDIR ! 76: # include "configmake.h" ! 77: #endif ! 78: ! 79: #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ ! 80: /* Win32, Cygwin, OS/2, DOS */ ! 81: # define ISSLASH(C) ((C) == '/' || (C) == '\\') ! 82: #endif ! 83: ! 84: #ifndef DIRECTORY_SEPARATOR ! 85: # define DIRECTORY_SEPARATOR '/' ! 86: #endif ! 87: ! 88: #ifndef ISSLASH ! 89: # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) ! 90: #endif ! 91: ! 92: #if HAVE_DECL_GETC_UNLOCKED ! 93: # undef getc ! 94: # define getc getc_unlocked ! 95: #endif ! 96: ! 97: /* The following static variable is declared 'volatile' to avoid a ! 98: possible multithread problem in the function get_charset_aliases. If we ! 99: are running in a threaded environment, and if two threads initialize ! 100: 'charset_aliases' simultaneously, both will produce the same value, ! 101: and everything will be ok if the two assignments to 'charset_aliases' ! 102: are atomic. But I don't know what will happen if the two assignments mix. */ ! 103: #if __STDC__ != 1 ! 104: # define volatile /* empty */ ! 105: #endif ! 106: /* Pointer to the contents of the charset.alias file, if it has already been ! 107: read, else NULL. Its format is: ! 108: ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ ! 109: static const char * volatile charset_aliases; ! 110: ! 111: /* Return a pointer to the contents of the charset.alias file. */ ! 112: static const char * ! 113: get_charset_aliases (void) ! 114: { ! 115: const char *cp; ! 116: ! 117: cp = charset_aliases; ! 118: if (cp == NULL) ! 119: { ! 120: #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) ! 121: FILE *fp; ! 122: const char *dir; ! 123: const char *base = "charset.alias"; ! 124: char *file_name; ! 125: ! 126: /* Make it possible to override the charset.alias location. This is ! 127: necessary for running the testsuite before "make install". */ ! 128: dir = getenv ("CHARSETALIASDIR"); ! 129: if (dir == NULL || dir[0] == '\0') ! 130: dir = relocate (LIBDIR); ! 131: ! 132: /* Concatenate dir and base into freshly allocated file_name. */ ! 133: { ! 134: size_t dir_len = strlen (dir); ! 135: size_t base_len = strlen (base); ! 136: int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); ! 137: file_name = (char *) malloc (dir_len + add_slash + base_len + 1); ! 138: if (file_name != NULL) ! 139: { ! 140: memcpy (file_name, dir, dir_len); ! 141: if (add_slash) ! 142: file_name[dir_len] = DIRECTORY_SEPARATOR; ! 143: memcpy (file_name + dir_len + add_slash, base, base_len + 1); ! 144: } ! 145: } ! 146: ! 147: if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) ! 148: /* Out of memory or file not found, treat it as empty. */ ! 149: cp = ""; ! 150: else ! 151: { ! 152: /* Parse the file's contents. */ ! 153: char *res_ptr = NULL; ! 154: size_t res_size = 0; ! 155: ! 156: for (;;) ! 157: { ! 158: int c; ! 159: char buf1[50+1]; ! 160: char buf2[50+1]; ! 161: size_t l1, l2; ! 162: char *old_res_ptr; ! 163: ! 164: c = getc (fp); ! 165: if (c == EOF) ! 166: break; ! 167: if (c == '\n' || c == ' ' || c == '\t') ! 168: continue; ! 169: if (c == '#') ! 170: { ! 171: /* Skip comment, to end of line. */ ! 172: do ! 173: c = getc (fp); ! 174: while (!(c == EOF || c == '\n')); ! 175: if (c == EOF) ! 176: break; ! 177: continue; ! 178: } ! 179: ungetc (c, fp); ! 180: if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) ! 181: break; ! 182: l1 = strlen (buf1); ! 183: l2 = strlen (buf2); ! 184: old_res_ptr = res_ptr; ! 185: if (res_size == 0) ! 186: { ! 187: res_size = l1 + 1 + l2 + 1; ! 188: res_ptr = (char *) malloc (res_size + 1); ! 189: } ! 190: else ! 191: { ! 192: res_size += l1 + 1 + l2 + 1; ! 193: res_ptr = (char *) realloc (res_ptr, res_size + 1); ! 194: } ! 195: if (res_ptr == NULL) ! 196: { ! 197: /* Out of memory. */ ! 198: res_size = 0; ! 199: if (old_res_ptr != NULL) ! 200: free (old_res_ptr); ! 201: break; ! 202: } ! 203: strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); ! 204: strcpy (res_ptr + res_size - (l2 + 1), buf2); ! 205: } ! 206: fclose (fp); ! 207: if (res_size == 0) ! 208: cp = ""; ! 209: else ! 210: { ! 211: *(res_ptr + res_size) = '\0'; ! 212: cp = res_ptr; ! 213: } ! 214: } ! 215: ! 216: if (file_name != NULL) ! 217: free (file_name); ! 218: ! 219: #else ! 220: ! 221: # if defined DARWIN7 ! 222: /* To avoid the trouble of installing a file that is shared by many ! 223: GNU packages -- many packaging systems have problems with this --, ! 224: simply inline the aliases here. */ ! 225: cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" ! 226: "ISO8859-2" "\0" "ISO-8859-2" "\0" ! 227: "ISO8859-4" "\0" "ISO-8859-4" "\0" ! 228: "ISO8859-5" "\0" "ISO-8859-5" "\0" ! 229: "ISO8859-7" "\0" "ISO-8859-7" "\0" ! 230: "ISO8859-9" "\0" "ISO-8859-9" "\0" ! 231: "ISO8859-13" "\0" "ISO-8859-13" "\0" ! 232: "ISO8859-15" "\0" "ISO-8859-15" "\0" ! 233: "KOI8-R" "\0" "KOI8-R" "\0" ! 234: "KOI8-U" "\0" "KOI8-U" "\0" ! 235: "CP866" "\0" "CP866" "\0" ! 236: "CP949" "\0" "CP949" "\0" ! 237: "CP1131" "\0" "CP1131" "\0" ! 238: "CP1251" "\0" "CP1251" "\0" ! 239: "eucCN" "\0" "GB2312" "\0" ! 240: "GB2312" "\0" "GB2312" "\0" ! 241: "eucJP" "\0" "EUC-JP" "\0" ! 242: "eucKR" "\0" "EUC-KR" "\0" ! 243: "Big5" "\0" "BIG5" "\0" ! 244: "Big5HKSCS" "\0" "BIG5-HKSCS" "\0" ! 245: "GBK" "\0" "GBK" "\0" ! 246: "GB18030" "\0" "GB18030" "\0" ! 247: "SJIS" "\0" "SHIFT_JIS" "\0" ! 248: "ARMSCII-8" "\0" "ARMSCII-8" "\0" ! 249: "PT154" "\0" "PT154" "\0" ! 250: /*"ISCII-DEV" "\0" "?" "\0"*/ ! 251: "*" "\0" "UTF-8" "\0"; ! 252: # endif ! 253: ! 254: # if defined VMS ! 255: /* To avoid the troubles of an extra file charset.alias_vms in the ! 256: sources of many GNU packages, simply inline the aliases here. */ ! 257: /* The list of encodings is taken from the OpenVMS 7.3-1 documentation ! 258: "Compaq C Run-Time Library Reference Manual for OpenVMS systems" ! 259: section 10.7 "Handling Different Character Sets". */ ! 260: cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" ! 261: "ISO8859-2" "\0" "ISO-8859-2" "\0" ! 262: "ISO8859-5" "\0" "ISO-8859-5" "\0" ! 263: "ISO8859-7" "\0" "ISO-8859-7" "\0" ! 264: "ISO8859-8" "\0" "ISO-8859-8" "\0" ! 265: "ISO8859-9" "\0" "ISO-8859-9" "\0" ! 266: /* Japanese */ ! 267: "eucJP" "\0" "EUC-JP" "\0" ! 268: "SJIS" "\0" "SHIFT_JIS" "\0" ! 269: "DECKANJI" "\0" "DEC-KANJI" "\0" ! 270: "SDECKANJI" "\0" "EUC-JP" "\0" ! 271: /* Chinese */ ! 272: "eucTW" "\0" "EUC-TW" "\0" ! 273: "DECHANYU" "\0" "DEC-HANYU" "\0" ! 274: "DECHANZI" "\0" "GB2312" "\0" ! 275: /* Korean */ ! 276: "DECKOREAN" "\0" "EUC-KR" "\0"; ! 277: # endif ! 278: ! 279: # if defined WIN32_NATIVE || defined __CYGWIN__ ! 280: /* To avoid the troubles of installing a separate file in the same ! 281: directory as the DLL and of retrieving the DLL's directory at ! 282: runtime, simply inline the aliases here. */ ! 283: ! 284: cp = "CP936" "\0" "GBK" "\0" ! 285: "CP1361" "\0" "JOHAB" "\0" ! 286: "CP20127" "\0" "ASCII" "\0" ! 287: "CP20866" "\0" "KOI8-R" "\0" ! 288: "CP20936" "\0" "GB2312" "\0" ! 289: "CP21866" "\0" "KOI8-RU" "\0" ! 290: "CP28591" "\0" "ISO-8859-1" "\0" ! 291: "CP28592" "\0" "ISO-8859-2" "\0" ! 292: "CP28593" "\0" "ISO-8859-3" "\0" ! 293: "CP28594" "\0" "ISO-8859-4" "\0" ! 294: "CP28595" "\0" "ISO-8859-5" "\0" ! 295: "CP28596" "\0" "ISO-8859-6" "\0" ! 296: "CP28597" "\0" "ISO-8859-7" "\0" ! 297: "CP28598" "\0" "ISO-8859-8" "\0" ! 298: "CP28599" "\0" "ISO-8859-9" "\0" ! 299: "CP28605" "\0" "ISO-8859-15" "\0" ! 300: "CP38598" "\0" "ISO-8859-8" "\0" ! 301: "CP51932" "\0" "EUC-JP" "\0" ! 302: "CP51936" "\0" "GB2312" "\0" ! 303: "CP51949" "\0" "EUC-KR" "\0" ! 304: "CP51950" "\0" "EUC-TW" "\0" ! 305: "CP54936" "\0" "GB18030" "\0" ! 306: "CP65001" "\0" "UTF-8" "\0"; ! 307: # endif ! 308: #endif ! 309: ! 310: charset_aliases = cp; ! 311: } ! 312: ! 313: return cp; ! 314: } ! 315: ! 316: /* Determine the current locale's character encoding, and canonicalize it ! 317: into one of the canonical names listed in config.charset. ! 318: The result must not be freed; it is statically allocated. ! 319: If the canonical name cannot be determined, the result is a non-canonical ! 320: name. */ ! 321: ! 322: #ifdef STATIC ! 323: STATIC ! 324: #endif ! 325: const char * ! 326: locale_charset (void) ! 327: { ! 328: const char *codeset; ! 329: const char *aliases; ! 330: ! 331: #if !(defined WIN32_NATIVE || defined OS2) ! 332: ! 333: # if HAVE_LANGINFO_CODESET ! 334: ! 335: /* Most systems support nl_langinfo (CODESET) nowadays. */ ! 336: codeset = nl_langinfo (CODESET); ! 337: ! 338: # ifdef __CYGWIN__ ! 339: /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always ! 340: returns "US-ASCII". As long as this is not fixed, return the suffix ! 341: of the locale name from the environment variables (if present) or ! 342: the codepage as a number. */ ! 343: if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) ! 344: { ! 345: const char *locale; ! 346: static char buf[2 + 10 + 1]; ! 347: ! 348: locale = getenv ("LC_ALL"); ! 349: if (locale == NULL || locale[0] == '\0') ! 350: { ! 351: locale = getenv ("LC_CTYPE"); ! 352: if (locale == NULL || locale[0] == '\0') ! 353: locale = getenv ("LANG"); ! 354: } ! 355: if (locale != NULL && locale[0] != '\0') ! 356: { ! 357: /* If the locale name contains an encoding after the dot, return ! 358: it. */ ! 359: const char *dot = strchr (locale, '.'); ! 360: ! 361: if (dot != NULL) ! 362: { ! 363: const char *modifier; ! 364: ! 365: dot++; ! 366: /* Look for the possible @... trailer and remove it, if any. */ ! 367: modifier = strchr (dot, '@'); ! 368: if (modifier == NULL) ! 369: return dot; ! 370: if (modifier - dot < sizeof (buf)) ! 371: { ! 372: memcpy (buf, dot, modifier - dot); ! 373: buf [modifier - dot] = '\0'; ! 374: return buf; ! 375: } ! 376: } ! 377: } ! 378: ! 379: /* Woe32 has a function returning the locale's codepage as a number. */ ! 380: sprintf (buf, "CP%u", GetACP ()); ! 381: codeset = buf; ! 382: } ! 383: # endif ! 384: ! 385: # else ! 386: ! 387: /* On old systems which lack it, use setlocale or getenv. */ ! 388: const char *locale = NULL; ! 389: ! 390: /* But most old systems don't have a complete set of locales. Some ! 391: (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't ! 392: use setlocale here; it would return "C" when it doesn't support the ! 393: locale name the user has set. */ ! 394: # if 0 ! 395: locale = setlocale (LC_CTYPE, NULL); ! 396: # endif ! 397: if (locale == NULL || locale[0] == '\0') ! 398: { ! 399: locale = getenv ("LC_ALL"); ! 400: if (locale == NULL || locale[0] == '\0') ! 401: { ! 402: locale = getenv ("LC_CTYPE"); ! 403: if (locale == NULL || locale[0] == '\0') ! 404: locale = getenv ("LANG"); ! 405: } ! 406: } ! 407: ! 408: /* On some old systems, one used to set locale = "iso8859_1". On others, ! 409: you set it to "language_COUNTRY.charset". In any case, we resolve it ! 410: through the charset.alias file. */ ! 411: codeset = locale; ! 412: ! 413: # endif ! 414: ! 415: #elif defined WIN32_NATIVE ! 416: ! 417: static char buf[2 + 10 + 1]; ! 418: ! 419: /* Woe32 has a function returning the locale's codepage as a number. */ ! 420: sprintf (buf, "CP%u", GetACP ()); ! 421: codeset = buf; ! 422: ! 423: #elif defined OS2 ! 424: ! 425: const char *locale; ! 426: static char buf[2 + 10 + 1]; ! 427: ULONG cp[3]; ! 428: ULONG cplen; ! 429: ! 430: /* Allow user to override the codeset, as set in the operating system, ! 431: with standard language environment variables. */ ! 432: locale = getenv ("LC_ALL"); ! 433: if (locale == NULL || locale[0] == '\0') ! 434: { ! 435: locale = getenv ("LC_CTYPE"); ! 436: if (locale == NULL || locale[0] == '\0') ! 437: locale = getenv ("LANG"); ! 438: } ! 439: if (locale != NULL && locale[0] != '\0') ! 440: { ! 441: /* If the locale name contains an encoding after the dot, return it. */ ! 442: const char *dot = strchr (locale, '.'); ! 443: ! 444: if (dot != NULL) ! 445: { ! 446: const char *modifier; ! 447: ! 448: dot++; ! 449: /* Look for the possible @... trailer and remove it, if any. */ ! 450: modifier = strchr (dot, '@'); ! 451: if (modifier == NULL) ! 452: return dot; ! 453: if (modifier - dot < sizeof (buf)) ! 454: { ! 455: memcpy (buf, dot, modifier - dot); ! 456: buf [modifier - dot] = '\0'; ! 457: return buf; ! 458: } ! 459: } ! 460: ! 461: /* Resolve through the charset.alias file. */ ! 462: codeset = locale; ! 463: } ! 464: else ! 465: { ! 466: /* OS/2 has a function returning the locale's codepage as a number. */ ! 467: if (DosQueryCp (sizeof (cp), cp, &cplen)) ! 468: codeset = ""; ! 469: else ! 470: { ! 471: sprintf (buf, "CP%u", cp[0]); ! 472: codeset = buf; ! 473: } ! 474: } ! 475: ! 476: #endif ! 477: ! 478: if (codeset == NULL) ! 479: /* The canonical name cannot be determined. */ ! 480: codeset = ""; ! 481: ! 482: /* Resolve alias. */ ! 483: for (aliases = get_charset_aliases (); ! 484: *aliases != '\0'; ! 485: aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) ! 486: if (strcmp (codeset, aliases) == 0 ! 487: || (aliases[0] == '*' && aliases[1] == '\0')) ! 488: { ! 489: codeset = aliases + strlen (aliases) + 1; ! 490: break; ! 491: } ! 492: ! 493: /* Don't return an empty string. GNU libc and GNU libiconv interpret ! 494: the empty string as denoting "the locale's character encoding", ! 495: thus GNU libiconv would call this function a second time. */ ! 496: if (codeset[0] == '\0') ! 497: codeset = "ASCII"; ! 498: ! 499: return codeset; ! 500: }