Annotation of embedaddon/libiconv/libcharset/lib/localcharset.c, revision 1.1.1.2
1.1 misho 1: /* Determine a canonical name for the current locale's character encoding.
2:
1.1.1.2 ! misho 3: Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
1.1 misho 4:
5: This program is free software; you can redistribute it and/or modify it
6: under the terms of the GNU Library General Public License as published
7: by the Free Software Foundation; either version 2, or (at your option)
8: any later version.
9:
10: This program is distributed in the hope that it will be useful,
11: but WITHOUT ANY WARRANTY; without even the implied warranty of
12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: Library General Public License for more details.
14:
15: You should have received a copy of the GNU Library General Public
16: License along with this program; if not, write to the Free Software
17: Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
18: USA. */
19:
20: /* Written by Bruno Haible <bruno@clisp.org>. */
21:
22: #include <config.h>
23:
24: /* Specification. */
25: #include "localcharset.h"
26:
1.1.1.2 ! misho 27: #include <fcntl.h>
1.1 misho 28: #include <stddef.h>
29: #include <stdio.h>
30: #include <string.h>
31: #include <stdlib.h>
32:
33: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
34: # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35: #endif
36:
37: #if defined _WIN32 || defined __WIN32__
38: # define WIN32_NATIVE
39: #endif
40:
41: #if defined __EMX__
42: /* Assume EMX program runs on OS/2, even if compiled under DOS. */
43: # ifndef OS2
44: # define OS2
45: # endif
46: #endif
47:
48: #if !defined WIN32_NATIVE
1.1.1.2 ! misho 49: # include <unistd.h>
1.1 misho 50: # if HAVE_LANGINFO_CODESET
51: # include <langinfo.h>
52: # else
53: # if 0 /* see comment below */
54: # include <locale.h>
55: # endif
56: # endif
57: # ifdef __CYGWIN__
58: # define WIN32_LEAN_AND_MEAN
59: # include <windows.h>
60: # endif
61: #elif defined WIN32_NATIVE
62: # define WIN32_LEAN_AND_MEAN
63: # include <windows.h>
64: #endif
65: #if defined OS2
66: # define INCL_DOS
67: # include <os2.h>
68: #endif
69:
70: #if ENABLE_RELOCATABLE
71: # include "relocatable.h"
72: #else
73: # define relocate(pathname) (pathname)
74: #endif
75:
76: /* Get LIBDIR. */
77: #ifndef LIBDIR
78: # include "configmake.h"
79: #endif
80:
/* Platforms without O_NOFOLLOW get a flag value of 0, so that OR-ing it
   into the open() flags is a no-op there.  */
#ifndef O_NOFOLLOW
# define O_NOFOLLOW 0
#endif

/* Character used when a separator has to be inserted between a directory
   name and a file name.  */
#ifndef DIRECTORY_SEPARATOR
# define DIRECTORY_SEPARATOR '/'
#endif

/* ISSLASH(C) tests whether C terminates a directory name.  */
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  /* Win32, Cygwin, OS/2, DOS: both slash and backslash are accepted.  */
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
#elif !defined ISSLASH
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
#endif

/* Read through getc_unlocked where it is declared.  */
#if HAVE_DECL_GETC_UNLOCKED
# undef getc
# define getc getc_unlocked
#endif

/* Cache for get_charset_aliases: pointer to the contents of the
   charset.alias file once it has been read, NULL before that.  The data
   is laid out as
     ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'

   The variable is 'volatile' to sidestep a possible multithread problem in
   get_charset_aliases: when two threads initialize it at the same time,
   both compute an equivalent value, and all is well as long as each
   assignment to the variable is atomic.  What happens if the two
   assignments get mixed is not known.  */
#if __STDC__ != 1
/* Not a standard C compiler: drop the qualifier.  */
# define volatile /* empty */
#endif
static const char * volatile charset_aliases;
117:
118: /* Return a pointer to the contents of the charset.alias file. */
119: static const char *
120: get_charset_aliases (void)
121: {
122: const char *cp;
123:
124: cp = charset_aliases;
125: if (cp == NULL)
126: {
127: #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
128: const char *dir;
129: const char *base = "charset.alias";
130: char *file_name;
131:
132: /* Make it possible to override the charset.alias location. This is
1.1.1.2 ! misho 133: necessary for running the testsuite before "make install". */
1.1 misho 134: dir = getenv ("CHARSETALIASDIR");
135: if (dir == NULL || dir[0] == '\0')
1.1.1.2 ! misho 136: dir = relocate (LIBDIR);
1.1 misho 137:
138: /* Concatenate dir and base into freshly allocated file_name. */
139: {
1.1.1.2 ! misho 140: size_t dir_len = strlen (dir);
! 141: size_t base_len = strlen (base);
! 142: int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
! 143: file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
! 144: if (file_name != NULL)
! 145: {
! 146: memcpy (file_name, dir, dir_len);
! 147: if (add_slash)
! 148: file_name[dir_len] = DIRECTORY_SEPARATOR;
! 149: memcpy (file_name + dir_len + add_slash, base, base_len + 1);
! 150: }
1.1 misho 151: }
152:
1.1.1.2 ! misho 153: if (file_name == NULL)
! 154: /* Out of memory. Treat the file as empty. */
! 155: cp = "";
1.1 misho 156: else
1.1.1.2 ! misho 157: {
! 158: int fd;
1.1 misho 159:
1.1.1.2 ! misho 160: /* Open the file. Reject symbolic links on platforms that support
! 161: O_NOFOLLOW. This is a security feature. Without it, an attacker
! 162: could retrieve parts of the contents (namely, the tail of the
! 163: first line that starts with "* ") of an arbitrary file by placing
! 164: a symbolic link to that file under the name "charset.alias" in
! 165: some writable directory and defining the environment variable
! 166: CHARSETALIASDIR to point to that directory. */
! 167: fd = open (file_name,
! 168: O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
! 169: if (fd < 0)
! 170: /* File not found. Treat it as empty. */
! 171: cp = "";
! 172: else
! 173: {
! 174: FILE *fp;
! 175:
! 176: fp = fdopen (fd, "r");
! 177: if (fp == NULL)
! 178: {
! 179: /* Out of memory. Treat the file as empty. */
! 180: close (fd);
! 181: cp = "";
! 182: }
! 183: else
! 184: {
! 185: /* Parse the file's contents. */
! 186: char *res_ptr = NULL;
! 187: size_t res_size = 0;
! 188:
! 189: for (;;)
! 190: {
! 191: int c;
! 192: char buf1[50+1];
! 193: char buf2[50+1];
! 194: size_t l1, l2;
! 195: char *old_res_ptr;
! 196:
! 197: c = getc (fp);
! 198: if (c == EOF)
! 199: break;
! 200: if (c == '\n' || c == ' ' || c == '\t')
! 201: continue;
! 202: if (c == '#')
! 203: {
! 204: /* Skip comment, to end of line. */
! 205: do
! 206: c = getc (fp);
! 207: while (!(c == EOF || c == '\n'));
! 208: if (c == EOF)
! 209: break;
! 210: continue;
! 211: }
! 212: ungetc (c, fp);
! 213: if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
! 214: break;
! 215: l1 = strlen (buf1);
! 216: l2 = strlen (buf2);
! 217: old_res_ptr = res_ptr;
! 218: if (res_size == 0)
! 219: {
! 220: res_size = l1 + 1 + l2 + 1;
! 221: res_ptr = (char *) malloc (res_size + 1);
! 222: }
! 223: else
! 224: {
! 225: res_size += l1 + 1 + l2 + 1;
! 226: res_ptr = (char *) realloc (res_ptr, res_size + 1);
! 227: }
! 228: if (res_ptr == NULL)
! 229: {
! 230: /* Out of memory. */
! 231: res_size = 0;
! 232: free (old_res_ptr);
! 233: break;
! 234: }
! 235: strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
! 236: strcpy (res_ptr + res_size - (l2 + 1), buf2);
! 237: }
! 238: fclose (fp);
! 239: if (res_size == 0)
! 240: cp = "";
! 241: else
! 242: {
! 243: *(res_ptr + res_size) = '\0';
! 244: cp = res_ptr;
! 245: }
! 246: }
! 247: }
! 248:
! 249: free (file_name);
! 250: }
1.1 misho 251:
252: #else
253:
254: # if defined DARWIN7
255: /* To avoid the trouble of installing a file that is shared by many
1.1.1.2 ! misho 256: GNU packages -- many packaging systems have problems with this --,
! 257: simply inline the aliases here. */
1.1 misho 258: cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1.1.1.2 ! misho 259: "ISO8859-2" "\0" "ISO-8859-2" "\0"
! 260: "ISO8859-4" "\0" "ISO-8859-4" "\0"
! 261: "ISO8859-5" "\0" "ISO-8859-5" "\0"
! 262: "ISO8859-7" "\0" "ISO-8859-7" "\0"
! 263: "ISO8859-9" "\0" "ISO-8859-9" "\0"
! 264: "ISO8859-13" "\0" "ISO-8859-13" "\0"
! 265: "ISO8859-15" "\0" "ISO-8859-15" "\0"
! 266: "KOI8-R" "\0" "KOI8-R" "\0"
! 267: "KOI8-U" "\0" "KOI8-U" "\0"
! 268: "CP866" "\0" "CP866" "\0"
! 269: "CP949" "\0" "CP949" "\0"
! 270: "CP1131" "\0" "CP1131" "\0"
! 271: "CP1251" "\0" "CP1251" "\0"
! 272: "eucCN" "\0" "GB2312" "\0"
! 273: "GB2312" "\0" "GB2312" "\0"
! 274: "eucJP" "\0" "EUC-JP" "\0"
! 275: "eucKR" "\0" "EUC-KR" "\0"
! 276: "Big5" "\0" "BIG5" "\0"
! 277: "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
! 278: "GBK" "\0" "GBK" "\0"
! 279: "GB18030" "\0" "GB18030" "\0"
! 280: "SJIS" "\0" "SHIFT_JIS" "\0"
! 281: "ARMSCII-8" "\0" "ARMSCII-8" "\0"
! 282: "PT154" "\0" "PT154" "\0"
! 283: /*"ISCII-DEV" "\0" "?" "\0"*/
! 284: "*" "\0" "UTF-8" "\0";
1.1 misho 285: # endif
286:
287: # if defined VMS
288: /* To avoid the troubles of an extra file charset.alias_vms in the
1.1.1.2 ! misho 289: sources of many GNU packages, simply inline the aliases here. */
1.1 misho 290: /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
1.1.1.2 ! misho 291: "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
! 292: section 10.7 "Handling Different Character Sets". */
1.1 misho 293: cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
1.1.1.2 ! misho 294: "ISO8859-2" "\0" "ISO-8859-2" "\0"
! 295: "ISO8859-5" "\0" "ISO-8859-5" "\0"
! 296: "ISO8859-7" "\0" "ISO-8859-7" "\0"
! 297: "ISO8859-8" "\0" "ISO-8859-8" "\0"
! 298: "ISO8859-9" "\0" "ISO-8859-9" "\0"
! 299: /* Japanese */
! 300: "eucJP" "\0" "EUC-JP" "\0"
! 301: "SJIS" "\0" "SHIFT_JIS" "\0"
! 302: "DECKANJI" "\0" "DEC-KANJI" "\0"
! 303: "SDECKANJI" "\0" "EUC-JP" "\0"
! 304: /* Chinese */
! 305: "eucTW" "\0" "EUC-TW" "\0"
! 306: "DECHANYU" "\0" "DEC-HANYU" "\0"
! 307: "DECHANZI" "\0" "GB2312" "\0"
! 308: /* Korean */
! 309: "DECKOREAN" "\0" "EUC-KR" "\0";
1.1 misho 310: # endif
311:
312: # if defined WIN32_NATIVE || defined __CYGWIN__
313: /* To avoid the troubles of installing a separate file in the same
1.1.1.2 ! misho 314: directory as the DLL and of retrieving the DLL's directory at
! 315: runtime, simply inline the aliases here. */
1.1 misho 316:
317: cp = "CP936" "\0" "GBK" "\0"
1.1.1.2 ! misho 318: "CP1361" "\0" "JOHAB" "\0"
! 319: "CP20127" "\0" "ASCII" "\0"
! 320: "CP20866" "\0" "KOI8-R" "\0"
! 321: "CP20936" "\0" "GB2312" "\0"
! 322: "CP21866" "\0" "KOI8-RU" "\0"
! 323: "CP28591" "\0" "ISO-8859-1" "\0"
! 324: "CP28592" "\0" "ISO-8859-2" "\0"
! 325: "CP28593" "\0" "ISO-8859-3" "\0"
! 326: "CP28594" "\0" "ISO-8859-4" "\0"
! 327: "CP28595" "\0" "ISO-8859-5" "\0"
! 328: "CP28596" "\0" "ISO-8859-6" "\0"
! 329: "CP28597" "\0" "ISO-8859-7" "\0"
! 330: "CP28598" "\0" "ISO-8859-8" "\0"
! 331: "CP28599" "\0" "ISO-8859-9" "\0"
! 332: "CP28605" "\0" "ISO-8859-15" "\0"
! 333: "CP38598" "\0" "ISO-8859-8" "\0"
! 334: "CP51932" "\0" "EUC-JP" "\0"
! 335: "CP51936" "\0" "GB2312" "\0"
! 336: "CP51949" "\0" "EUC-KR" "\0"
! 337: "CP51950" "\0" "EUC-TW" "\0"
! 338: "CP54936" "\0" "GB18030" "\0"
! 339: "CP65001" "\0" "UTF-8" "\0";
1.1 misho 340: # endif
341: #endif
342:
343: charset_aliases = cp;
344: }
345:
346: return cp;
347: }
348:
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed by the caller: it points into a static
   buffer, into the alias table, into a string literal, or (on Cygwin and
   OS/2, when the codeset is taken from LC_ALL / LC_CTYPE / LANG) into the
   environment string returned by getenv.
   If the canonical name cannot be determined, the result is a non-canonical
   name.
   The result is never the empty string: GNU libc and GNU libiconv interpret
   the empty string as "the locale's character encoding" and GNU libiconv
   would call this function again.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* An empty encoding ("xx_YY.") must not be returned;
                     fall through to the codepage instead.  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else
                {
                  size_t len = (size_t) (modifier - dot);

                  /* Likewise skip an empty encoding ("xx_YY.@mod") and one
                     that does not fit into buf.  */
                  if (len > 0 && len < sizeof (buf))
                    {
                      memcpy (buf, dot, len);
                      buf[len] = '\0';
                      return buf;
                    }
                }
            }
        }

      /* Woe32 has a function returning the locale's codepage as a number:
         GetACP().  This encoding is used by Cygwin, unless the user has set
         the environment variable CYGWIN=codepage:oem (which very few people
         do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32_NATIVE

  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];

  /* Woe32 has a function returning the locale's codepage as a number:
     GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* Room for "CP", up to 10 decimal digits, and the terminating NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* An empty encoding ("xx_YY.") must not be returned; resolve
                 the whole locale name through the alias table instead.  */
              if (dot[0] != '\0')
                return dot;
            }
          else
            {
              size_t len = (size_t) (modifier - dot);

              /* Likewise skip an empty encoding ("xx_YY.@mod") and one
                 that does not fit into buf.  */
              if (len > 0 && len < sizeof (buf))
                {
                  memcpy (buf, dot, len);
                  buf[len] = '\0';
                  return buf;
                }
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it to the type that %u expects.
             This also bounds the output to what buf was sized for.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of ALIAS '\0' CANONICAL '\0'
     pairs ended by an empty string; the alias "*" matches every name.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>