File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libiconv / libcharset / lib / localcharset.c
Revision 1.1.1.3 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 13:38:46 2021 UTC (4 years ago) by misho
Branches: libiconv, MAIN
CVS tags: v1_16p0, HEAD
libiconv 1.16

    1: /* Determine a canonical name for the current locale's character encoding.
    2: 
    3:    Copyright (C) 2000-2006, 2008-2018 Free Software Foundation, Inc.
    4: 
    5:    This program is free software; you can redistribute it and/or modify it
    6:    under the terms of the GNU Library General Public License as published
    7:    by the Free Software Foundation; either version 2, or (at your option)
    8:    any later version.
    9: 
   10:    This program is distributed in the hope that it will be useful,
   11:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   12:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13:    Library General Public License for more details.
   14: 
   15:    You should have received a copy of the GNU Library General Public License
   16:    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
   17: 
   18: /* Written by Bruno Haible <bruno@clisp.org>.  */
   19: 
   20: #include <config.h>
   21: 
   22: /* Specification.  */
   23: #include "localcharset.h"
   24: 
   25: #include <stddef.h>
   26: #include <stdio.h>
   27: #include <string.h>
   28: #include <stdlib.h>
   29: 
   30: #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
   31: # define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
   32: #endif
   33: 
   34: #if defined _WIN32 && !defined __CYGWIN__
   35: # define WINDOWS_NATIVE
   36: # include <locale.h>
   37: #endif
   38: 
   39: #if defined __EMX__
   40: /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
   41: # ifndef OS2
   42: #  define OS2
   43: # endif
   44: #endif
   45: 
   46: #if !defined WINDOWS_NATIVE
   47: # if HAVE_LANGINFO_CODESET
   48: #  include <langinfo.h>
   49: # else
   50: #  if 0 /* see comment regarding use of setlocale(), below */
   51: #   include <locale.h>
   52: #  endif
   53: # endif
   54: # ifdef __CYGWIN__
   55: #  define WIN32_LEAN_AND_MEAN
   56: #  include <windows.h>
   57: # endif
   58: #elif defined WINDOWS_NATIVE
   59: # define WIN32_LEAN_AND_MEAN
   60: # include <windows.h>
   61: #endif
   62: #if defined OS2
   63: # define INCL_DOS
   64: # include <os2.h>
   65: #endif
   66: 
   67: /* For MB_CUR_MAX_L */
   68: #if defined DARWIN7
   69: # include <xlocale.h>
   70: #endif
   71: 
   72: 
   73: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
   74: 
   75: /* On these platforms, we use a mapping from non-canonical encoding name
   76:    to GNU canonical encoding name.  */
   77: 
   78: /* With glibc-2.1 or newer, we don't need any canonicalization,
   79:    because glibc has iconv and both glibc and libiconv support all
   80:    GNU canonical names directly.  */
   81: # if !((defined __GNU_LIBRARY__ && __GLIBC__ >= 2) || defined __UCLIBC__)
   82: 
   83: struct table_entry  /* One row of alias_table: non-canonical name -> GNU canonical name.  */
   84: {
   85:   const char alias[11+1];  /* Non-canonical encoding name: room for 11 characters plus NUL.  */
   86:   const char canonical[11+1];  /* GNU canonical encoding name: room for 11 characters plus NUL.  */
   87: };
   88: 
   89: /* Table of platform-dependent mappings from encoding alias to GNU canonical name, sorted in ascending byte order of the alias (e.g. "US-ASCII" precedes "eucCN").  */
   90: static const struct table_entry alias_table[] =
   91:   {  /* Each platform section ends without a trailing comma, so two sections active at once would be a syntax error.  */
   92: #  if defined __FreeBSD__                                   /* FreeBSD */
   93:   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
   94:     { "Big5",       "BIG5" },
   95:     { "Big5HKSCS",  "BIG5-HKSCS" },
   96:     { "C",          "ASCII" },
   97:   /*{ "CP1131",     "CP1131" },*/
   98:   /*{ "CP1251",     "CP1251" },*/
   99:   /*{ "CP866",      "CP866" },*/
  100:   /*{ "GB18030",    "GB18030" },*/
  101:   /*{ "GB2312",     "GB2312" },*/
  102:   /*{ "GBK",        "GBK" },*/
  103:   /*{ "ISCII-DEV",  "?" },*/
  104:     { "ISO8859-1",  "ISO-8859-1" },
  105:     { "ISO8859-13", "ISO-8859-13" },
  106:     { "ISO8859-15", "ISO-8859-15" },
  107:     { "ISO8859-2",  "ISO-8859-2" },
  108:     { "ISO8859-4",  "ISO-8859-4" },
  109:     { "ISO8859-5",  "ISO-8859-5" },
  110:     { "ISO8859-7",  "ISO-8859-7" },
  111:     { "ISO8859-9",  "ISO-8859-9" },
  112:   /*{ "KOI8-R",     "KOI8-R" },*/
  113:   /*{ "KOI8-U",     "KOI8-U" },*/
  114:     { "SJIS",       "SHIFT_JIS" },
  115:     { "US-ASCII",   "ASCII" },
  116:     { "eucCN",      "GB2312" },
  117:     { "eucJP",      "EUC-JP" },
  118:     { "eucKR",      "EUC-KR" }
  119: #   define alias_table_defined
  120: #  endif
  121: #  if defined __NetBSD__                                    /* NetBSD */
  122:     { "646",        "ASCII" },
  123:   /*{ "ARMSCII-8",  "ARMSCII-8" },*/
  124:   /*{ "BIG5",       "BIG5" },*/
  125:     { "Big5-HKSCS", "BIG5-HKSCS" },
  126:   /*{ "CP1251",     "CP1251" },*/
  127:   /*{ "CP866",      "CP866" },*/
  128:   /*{ "GB18030",    "GB18030" },*/
  129:   /*{ "GB2312",     "GB2312" },*/
  130:     { "ISO8859-1",  "ISO-8859-1" },
  131:     { "ISO8859-13", "ISO-8859-13" },
  132:     { "ISO8859-15", "ISO-8859-15" },
  133:     { "ISO8859-2",  "ISO-8859-2" },
  134:     { "ISO8859-4",  "ISO-8859-4" },
  135:     { "ISO8859-5",  "ISO-8859-5" },
  136:     { "ISO8859-7",  "ISO-8859-7" },
  137:   /*{ "KOI8-R",     "KOI8-R" },*/
  138:   /*{ "KOI8-U",     "KOI8-U" },*/
  139:   /*{ "PT154",      "PT154" },*/
  140:     { "SJIS",       "SHIFT_JIS" },
  141:     { "eucCN",      "GB2312" },
  142:     { "eucJP",      "EUC-JP" },
  143:     { "eucKR",      "EUC-KR" },
  144:     { "eucTW",      "EUC-TW" }
  145: #   define alias_table_defined
  146: #  endif
  147: #  if defined __OpenBSD__                                   /* OpenBSD */
  148:     { "646",        "ASCII" },
  149:     { "ISO8859-1",  "ISO-8859-1" },
  150:     { "ISO8859-13", "ISO-8859-13" },
  151:     { "ISO8859-15", "ISO-8859-15" },
  152:     { "ISO8859-2",  "ISO-8859-2" },
  153:     { "ISO8859-4",  "ISO-8859-4" },
  154:     { "ISO8859-5",  "ISO-8859-5" },
  155:     { "ISO8859-7",  "ISO-8859-7" }
  156: #   define alias_table_defined
  157: #  endif
  158: #  if defined __APPLE__ && defined __MACH__                 /* Mac OS X */
  159:     /* Darwin 7.5 has nl_langinfo(CODESET), but sometimes its value is
  160:        useless:
  161:        - It returns the empty string when LANG is set to a locale of the
  162:          form ll_CC, although ll_CC/LC_CTYPE is a symlink to an UTF-8
  163:          LC_CTYPE file.
  164:        - The environment variables LANG, LC_CTYPE, LC_ALL are not set by
  165:          the system; nl_langinfo(CODESET) returns "US-ASCII" in this case.
  166:        - The documentation says:
  167:            "... all code that calls BSD system routines should ensure
  168:             that the const *char parameters of these routines are in UTF-8
  169:             encoding. All BSD system functions expect their string
  170:             parameters to be in UTF-8 encoding and nothing else."
  171:          It also says
  172:            "An additional caveat is that string parameters for files,
  173:             paths, and other file-system entities must be in canonical
  174:             UTF-8. In a canonical UTF-8 Unicode string, all decomposable
  175:             characters are decomposed ..."
  176:          but this is not true: You can pass non-decomposed UTF-8 strings
  177:          to file system functions, and it is the OS which will convert
  178:          them to decomposed UTF-8 before accessing the file system.
  179:        - The Apple Terminal application displays UTF-8 by default.
  180:        - However, other applications are free to use different encodings:
  181:          - xterm uses ISO-8859-1 by default.
  182:          - TextEdit uses MacRoman by default.
  183:        We prefer UTF-8 over decomposed UTF-8-MAC because one should
  184:        minimize the use of decomposed Unicode. Unfortunately, through the
  185:        Darwin file system, decomposed UTF-8 strings are leaked into user
  186:        space nevertheless.
  187:        Then there are also the locales with encodings other than US-ASCII
  188:        and UTF-8. These locales can be occasionally useful to users (e.g.
  189:        when grepping through ISO-8859-1 encoded text files), when all their
  190:        file names are in US-ASCII.
  191:      */
  192:     { "ARMSCII-8",  "ARMSCII-8" },
  193:     { "Big5",       "BIG5" },
  194:     { "Big5HKSCS",  "BIG5-HKSCS" },
  195:     { "CP1131",     "CP1131" },
  196:     { "CP1251",     "CP1251" },
  197:     { "CP866",      "CP866" },
  198:     { "CP949",      "CP949" },
  199:     { "GB18030",    "GB18030" },
  200:     { "GB2312",     "GB2312" },
  201:     { "GBK",        "GBK" },
  202:   /*{ "ISCII-DEV",  "?" },*/
  203:     { "ISO8859-1",  "ISO-8859-1" },
  204:     { "ISO8859-13", "ISO-8859-13" },
  205:     { "ISO8859-15", "ISO-8859-15" },
  206:     { "ISO8859-2",  "ISO-8859-2" },
  207:     { "ISO8859-4",  "ISO-8859-4" },
  208:     { "ISO8859-5",  "ISO-8859-5" },
  209:     { "ISO8859-7",  "ISO-8859-7" },
  210:     { "ISO8859-9",  "ISO-8859-9" },
  211:     { "KOI8-R",     "KOI8-R" },
  212:     { "KOI8-U",     "KOI8-U" },
  213:     { "PT154",      "PT154" },
  214:     { "SJIS",       "SHIFT_JIS" },
  215:     { "eucCN",      "GB2312" },
  216:     { "eucJP",      "EUC-JP" },
  217:     { "eucKR",      "EUC-KR" }
  218: #   define alias_table_defined
  219: #  endif
  220: #  if defined _AIX                                          /* AIX */
  221:   /*{ "GBK",        "GBK" },*/
  222:     { "IBM-1046",   "CP1046" },
  223:     { "IBM-1124",   "CP1124" },
  224:     { "IBM-1129",   "CP1129" },
  225:     { "IBM-1252",   "CP1252" },
  226:     { "IBM-850",    "CP850" },
  227:     { "IBM-856",    "CP856" },
  228:     { "IBM-921",    "ISO-8859-13" },
  229:     { "IBM-922",    "CP922" },
  230:     { "IBM-932",    "CP932" },
  231:     { "IBM-943",    "CP943" },
  232:     { "IBM-eucCN",  "GB2312" },
  233:     { "IBM-eucJP",  "EUC-JP" },
  234:     { "IBM-eucKR",  "EUC-KR" },
  235:     { "IBM-eucTW",  "EUC-TW" },
  236:     { "ISO8859-1",  "ISO-8859-1" },
  237:     { "ISO8859-15", "ISO-8859-15" },
  238:     { "ISO8859-2",  "ISO-8859-2" },
  239:     { "ISO8859-5",  "ISO-8859-5" },
  240:     { "ISO8859-6",  "ISO-8859-6" },
  241:     { "ISO8859-7",  "ISO-8859-7" },
  242:     { "ISO8859-8",  "ISO-8859-8" },
  243:     { "ISO8859-9",  "ISO-8859-9" },
  244:     { "TIS-620",    "TIS-620" },
  245:   /*{ "UTF-8",      "UTF-8" },*/
  246:     { "big5",       "BIG5" }
  247: #   define alias_table_defined
  248: #  endif
  249: #  if defined __hpux                                        /* HP-UX */
  250:     { "SJIS",      "SHIFT_JIS" },
  251:     { "arabic8",   "HP-ARABIC8" },
  252:     { "big5",      "BIG5" },
  253:     { "cp1251",    "CP1251" },
  254:     { "eucJP",     "EUC-JP" },
  255:     { "eucKR",     "EUC-KR" },
  256:     { "eucTW",     "EUC-TW" },
  257:     { "gb18030",   "GB18030" },
  258:     { "greek8",    "HP-GREEK8" },
  259:     { "hebrew8",   "HP-HEBREW8" },
  260:     { "hkbig5",    "BIG5-HKSCS" },
  261:     { "hp15CN",    "GB2312" },
  262:     { "iso88591",  "ISO-8859-1" },
  263:     { "iso885913", "ISO-8859-13" },
  264:     { "iso885915", "ISO-8859-15" },
  265:     { "iso88592",  "ISO-8859-2" },
  266:     { "iso88594",  "ISO-8859-4" },
  267:     { "iso88595",  "ISO-8859-5" },
  268:     { "iso88596",  "ISO-8859-6" },
  269:     { "iso88597",  "ISO-8859-7" },
  270:     { "iso88598",  "ISO-8859-8" },
  271:     { "iso88599",  "ISO-8859-9" },
  272:     { "kana8",     "HP-KANA8" },
  273:     { "koi8r",     "KOI8-R" },
  274:     { "roman8",    "HP-ROMAN8" },
  275:     { "tis620",    "TIS-620" },
  276:     { "turkish8",  "HP-TURKISH8" },
  277:     { "utf8",      "UTF-8" }
  278: #   define alias_table_defined
  279: #  endif
  280: #  if defined __sgi                                         /* IRIX */
  281:     { "ISO8859-1",  "ISO-8859-1" },
  282:     { "ISO8859-15", "ISO-8859-15" },
  283:     { "ISO8859-2",  "ISO-8859-2" },
  284:     { "ISO8859-5",  "ISO-8859-5" },
  285:     { "ISO8859-7",  "ISO-8859-7" },
  286:     { "ISO8859-9",  "ISO-8859-9" },
  287:     { "eucCN",      "GB2312" },
  288:     { "eucJP",      "EUC-JP" },
  289:     { "eucKR",      "EUC-KR" },
  290:     { "eucTW",      "EUC-TW" }
  291: #   define alias_table_defined
  292: #  endif
  293: #  if defined __osf__                                       /* OSF/1 */
  294:   /*{ "GBK",        "GBK" },*/
  295:     { "ISO8859-1",  "ISO-8859-1" },
  296:     { "ISO8859-15", "ISO-8859-15" },
  297:     { "ISO8859-2",  "ISO-8859-2" },
  298:     { "ISO8859-4",  "ISO-8859-4" },
  299:     { "ISO8859-5",  "ISO-8859-5" },
  300:     { "ISO8859-7",  "ISO-8859-7" },
  301:     { "ISO8859-8",  "ISO-8859-8" },
  302:     { "ISO8859-9",  "ISO-8859-9" },
  303:     { "KSC5601",    "CP949" },
  304:     { "SJIS",       "SHIFT_JIS" },
  305:     { "TACTIS",     "TIS-620" },
  306:   /*{ "UTF-8",      "UTF-8" },*/
  307:     { "big5",       "BIG5" },
  308:     { "cp850",      "CP850" },
  309:     { "dechanyu",   "DEC-HANYU" },
  310:     { "dechanzi",   "GB2312" },
  311:     { "deckanji",   "DEC-KANJI" },
  312:     { "deckorean",  "EUC-KR" },
  313:     { "eucJP",      "EUC-JP" },
  314:     { "eucKR",      "EUC-KR" },
  315:     { "eucTW",      "EUC-TW" },
  316:     { "sdeckanji",  "EUC-JP" }
  317: #   define alias_table_defined
  318: #  endif
  319: #  if defined __sun                                         /* Solaris */
  320:     { "5601",        "EUC-KR" },
  321:     { "646",         "ASCII" },
  322:   /*{ "BIG5",        "BIG5" },*/
  323:     { "Big5-HKSCS",  "BIG5-HKSCS" },
  324:     { "GB18030",     "GB18030" },
  325:   /*{ "GBK",         "GBK" },*/
  326:     { "ISO8859-1",   "ISO-8859-1" },
  327:     { "ISO8859-11",  "TIS-620" },
  328:     { "ISO8859-13",  "ISO-8859-13" },
  329:     { "ISO8859-15",  "ISO-8859-15" },
  330:     { "ISO8859-2",   "ISO-8859-2" },
  331:     { "ISO8859-3",   "ISO-8859-3" },
  332:     { "ISO8859-4",   "ISO-8859-4" },
  333:     { "ISO8859-5",   "ISO-8859-5" },
  334:     { "ISO8859-6",   "ISO-8859-6" },
  335:     { "ISO8859-7",   "ISO-8859-7" },
  336:     { "ISO8859-8",   "ISO-8859-8" },
  337:     { "ISO8859-9",   "ISO-8859-9" },
  338:     { "PCK",         "SHIFT_JIS" },
  339:     { "TIS620.2533", "TIS-620" },
  340:   /*{ "UTF-8",       "UTF-8" },*/
  341:     { "ansi-1251",   "CP1251" },
  342:     { "cns11643",    "EUC-TW" },
  343:     { "eucJP",       "EUC-JP" },
  344:     { "gb2312",      "GB2312" },
  345:     { "koi8-r",      "KOI8-R" }
  346: #   define alias_table_defined
  347: #  endif
  348: #  if defined __minix                                       /* Minix */
  349:     { "646", "ASCII" }
  350: #   define alias_table_defined
  351: #  endif
  352: #  if defined WINDOWS_NATIVE || defined __CYGWIN__          /* Windows */
  353:     { "CP1361",  "JOHAB" },
  354:     { "CP20127", "ASCII" },
  355:     { "CP20866", "KOI8-R" },
  356:     { "CP20936", "GB2312" },
  357:     { "CP21866", "KOI8-RU" },
  358:     { "CP28591", "ISO-8859-1" },
  359:     { "CP28592", "ISO-8859-2" },
  360:     { "CP28593", "ISO-8859-3" },
  361:     { "CP28594", "ISO-8859-4" },
  362:     { "CP28595", "ISO-8859-5" },
  363:     { "CP28596", "ISO-8859-6" },
  364:     { "CP28597", "ISO-8859-7" },
  365:     { "CP28598", "ISO-8859-8" },
  366:     { "CP28599", "ISO-8859-9" },
  367:     { "CP28605", "ISO-8859-15" },
  368:     { "CP38598", "ISO-8859-8" },
  369:     { "CP51932", "EUC-JP" },
  370:     { "CP51936", "GB2312" },
  371:     { "CP51949", "EUC-KR" },
  372:     { "CP51950", "EUC-TW" },
  373:     { "CP54936", "GB18030" },
  374:     { "CP65001", "UTF-8" },
  375:     { "CP936",   "GBK" }
  376: #   define alias_table_defined
  377: #  endif
  378: #  if defined OS2                                           /* OS/2 */
  379:     /* The list of encodings is taken from "List of OS/2 Codepages"
  380:        by Alex Taylor:
  381:        <http://altsan.org/os2/toolkits/uls/index.html#codepages>.
  382:        See also "IBM Globalization - Code page identifiers":
  383:        <https://www-01.ibm.com/software/globalization/cp/cp_cpgid.html>.  */
  384:     { "CP1089", "ISO-8859-6" },
  385:     { "CP1208", "UTF-8" },
  386:     { "CP1381", "GB2312" },
  387:     { "CP1386", "GBK" },
  388:     { "CP3372", "EUC-JP" },
  389:     { "CP813",  "ISO-8859-7" },
  390:     { "CP819",  "ISO-8859-1" },
  391:     { "CP878",  "KOI8-R" },
  392:     { "CP912",  "ISO-8859-2" },
  393:     { "CP913",  "ISO-8859-3" },
  394:     { "CP914",  "ISO-8859-4" },
  395:     { "CP915",  "ISO-8859-5" },
  396:     { "CP916",  "ISO-8859-8" },
  397:     { "CP920",  "ISO-8859-9" },
  398:     { "CP921",  "ISO-8859-13" },
  399:     { "CP923",  "ISO-8859-15" },
  400:     { "CP954",  "EUC-JP" },
  401:     { "CP964",  "EUC-TW" },
  402:     { "CP970",  "EUC-KR" }
  403: #   define alias_table_defined
  404: #  endif
  405: #  if defined VMS                                           /* OpenVMS */
  406:     /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
  407:        "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
  408:        section 10.7 "Handling Different Character Sets".  */
  409:     { "DECHANYU",  "DEC-HANYU" },
  410:     { "DECHANZI",  "GB2312" },
  411:     { "DECKANJI",  "DEC-KANJI" },
  412:     { "DECKOREAN", "EUC-KR" },
  413:     { "ISO8859-1", "ISO-8859-1" },
  414:     { "ISO8859-2", "ISO-8859-2" },
  415:     { "ISO8859-5", "ISO-8859-5" },
  416:     { "ISO8859-7", "ISO-8859-7" },
  417:     { "ISO8859-8", "ISO-8859-8" },
  418:     { "ISO8859-9", "ISO-8859-9" },
  419:     { "SDECKANJI", "EUC-JP" },
  420:     { "SJIS",      "SHIFT_JIS" },
  421:     { "eucJP",     "EUC-JP" },
  422:     { "eucTW",     "EUC-TW" }
  423: #   define alias_table_defined
  424: #  endif
  425: #  ifndef alias_table_defined
  426:     /* Just a dummy entry, to avoid a C syntax error when none of the platform sections above was selected.  */
  427:     { "", "" }
  428: #  endif
  429:   };
  430: 
  431: # endif
  432: 
  433: #else
  434: 
  435: /* On these platforms, we use a mapping from locale name to GNU canonical
  436:    encoding name.  */
  437: 
  438: struct table_entry  /* One row of locale_table: locale name -> GNU canonical encoding name.  */
  439: {
  440:   const char locale[17+1];  /* Locale name: room for 17 characters plus NUL.  */
  441:   const char canonical[11+1];  /* GNU canonical encoding name: room for 11 characters plus NUL.  */
  442: };
  443: 
  444: /* Table of platform-dependent mappings from locale name to GNU canonical encoding name, sorted in ascending byte order of the locale name (e.g. "C" precedes "ar").  */
  445: static const struct table_entry locale_table[] =
  446:   {  /* Each platform section ends without a trailing comma, so two sections active at once would be a syntax error.  */
  447: # if defined __FreeBSD__                                    /* FreeBSD 4.2 */
  448:     { "cs_CZ.ISO_8859-2",  "ISO-8859-2" },
  449:     { "da_DK.DIS_8859-15", "ISO-8859-15" },
  450:     { "da_DK.ISO_8859-1",  "ISO-8859-1" },
  451:     { "de_AT.DIS_8859-15", "ISO-8859-15" },
  452:     { "de_AT.ISO_8859-1",  "ISO-8859-1" },
  453:     { "de_CH.DIS_8859-15", "ISO-8859-15" },
  454:     { "de_CH.ISO_8859-1",  "ISO-8859-1" },
  455:     { "de_DE.DIS_8859-15", "ISO-8859-15" },
  456:     { "de_DE.ISO_8859-1",  "ISO-8859-1" },
  457:     { "en_AU.DIS_8859-15", "ISO-8859-15" },
  458:     { "en_AU.ISO_8859-1",  "ISO-8859-1" },
  459:     { "en_CA.DIS_8859-15", "ISO-8859-15" },
  460:     { "en_CA.ISO_8859-1",  "ISO-8859-1" },
  461:     { "en_GB.DIS_8859-15", "ISO-8859-15" },
  462:     { "en_GB.ISO_8859-1",  "ISO-8859-1" },
  463:     { "en_US.DIS_8859-15", "ISO-8859-15" },
  464:     { "en_US.ISO_8859-1",  "ISO-8859-1" },
  465:     { "es_ES.DIS_8859-15", "ISO-8859-15" },
  466:     { "es_ES.ISO_8859-1",  "ISO-8859-1" },
  467:     { "fi_FI.DIS_8859-15", "ISO-8859-15" },
  468:     { "fi_FI.ISO_8859-1",  "ISO-8859-1" },
  469:     { "fr_BE.DIS_8859-15", "ISO-8859-15" },
  470:     { "fr_BE.ISO_8859-1",  "ISO-8859-1" },
  471:     { "fr_CA.DIS_8859-15", "ISO-8859-15" },
  472:     { "fr_CA.ISO_8859-1",  "ISO-8859-1" },
  473:     { "fr_CH.DIS_8859-15", "ISO-8859-15" },
  474:     { "fr_CH.ISO_8859-1",  "ISO-8859-1" },
  475:     { "fr_FR.DIS_8859-15", "ISO-8859-15" },
  476:     { "fr_FR.ISO_8859-1",  "ISO-8859-1" },
  477:     { "hr_HR.ISO_8859-2",  "ISO-8859-2" },
  478:     { "hu_HU.ISO_8859-2",  "ISO-8859-2" },
  479:     { "is_IS.DIS_8859-15", "ISO-8859-15" },
  480:     { "is_IS.ISO_8859-1",  "ISO-8859-1" },
  481:     { "it_CH.DIS_8859-15", "ISO-8859-15" },
  482:     { "it_CH.ISO_8859-1",  "ISO-8859-1" },
  483:     { "it_IT.DIS_8859-15", "ISO-8859-15" },
  484:     { "it_IT.ISO_8859-1",  "ISO-8859-1" },
  485:     { "ja_JP.EUC",         "EUC-JP" },
  486:     { "ja_JP.SJIS",        "SHIFT_JIS" },
  487:     { "ja_JP.Shift_JIS",   "SHIFT_JIS" },
  488:     { "ko_KR.EUC",         "EUC-KR" },
  489:     { "la_LN.ASCII",       "ASCII" },
  490:     { "la_LN.DIS_8859-15", "ISO-8859-15" },
  491:     { "la_LN.ISO_8859-1",  "ISO-8859-1" },
  492:     { "la_LN.ISO_8859-2",  "ISO-8859-2" },
  493:     { "la_LN.ISO_8859-4",  "ISO-8859-4" },
  494:     { "lt_LN.ASCII",       "ASCII" },
  495:     { "lt_LN.DIS_8859-15", "ISO-8859-15" },
  496:     { "lt_LN.ISO_8859-1",  "ISO-8859-1" },
  497:     { "lt_LN.ISO_8859-2",  "ISO-8859-2" },
  498:     { "lt_LT.ISO_8859-4",  "ISO-8859-4" },
  499:     { "nl_BE.DIS_8859-15", "ISO-8859-15" },
  500:     { "nl_BE.ISO_8859-1",  "ISO-8859-1" },
  501:     { "nl_NL.DIS_8859-15", "ISO-8859-15" },
  502:     { "nl_NL.ISO_8859-1",  "ISO-8859-1" },
  503:     { "no_NO.DIS_8859-15", "ISO-8859-15" },
  504:     { "no_NO.ISO_8859-1",  "ISO-8859-1" },
  505:     { "pl_PL.ISO_8859-2",  "ISO-8859-2" },
  506:     { "pt_PT.DIS_8859-15", "ISO-8859-15" },
  507:     { "pt_PT.ISO_8859-1",  "ISO-8859-1" },
  508:     { "ru_RU.CP866",       "CP866" },
  509:     { "ru_RU.ISO_8859-5",  "ISO-8859-5" },
  510:     { "ru_RU.KOI8-R",      "KOI8-R" },
  511:     { "ru_SU.CP866",       "CP866" },
  512:     { "ru_SU.ISO_8859-5",  "ISO-8859-5" },
  513:     { "ru_SU.KOI8-R",      "KOI8-R" },
  514:     { "sl_SI.ISO_8859-2",  "ISO-8859-2" },
  515:     { "sv_SE.DIS_8859-15", "ISO-8859-15" },
  516:     { "sv_SE.ISO_8859-1",  "ISO-8859-1" },
  517:     { "uk_UA.KOI8-U",      "KOI8-U" },
  518:     { "zh_CN.EUC",         "GB2312" },
  519:     { "zh_TW.BIG5",        "BIG5" },
  520:     { "zh_TW.Big5",        "BIG5" }
  521: #  define locale_table_defined
  522: # endif
  523: # if defined __DJGPP__                                      /* DOS / DJGPP 2.03 */
  524:     /* The encodings given here may not all be correct.
  525:        If you find that the encoding given for your language and
  526:        country is not the one your DOS machine actually uses, just
  527:        correct it in this file, and send a mail to
  528:        Juan Manuel Guerrero <juan.guerrero@gmx.de>
  529:        and <bug-gnulib@gnu.org>.  */
  530:     { "C",     "ASCII" },
  531:     { "ar",    "CP864" },
  532:     { "ar_AE", "CP864" },
  533:     { "ar_DZ", "CP864" },
  534:     { "ar_EG", "CP864" },
  535:     { "ar_IQ", "CP864" },
  536:     { "ar_IR", "CP864" },
  537:     { "ar_JO", "CP864" },
  538:     { "ar_KW", "CP864" },
  539:     { "ar_MA", "CP864" },
  540:     { "ar_OM", "CP864" },
  541:     { "ar_QA", "CP864" },
  542:     { "ar_SA", "CP864" },
  543:     { "ar_SY", "CP864" },
  544:     { "be",    "CP866" },
  545:     { "be_BE", "CP866" },
  546:     { "bg",    "CP866" }, /* not CP855 ?? */
  547:     { "bg_BG", "CP866" }, /* not CP855 ?? */
  548:     { "ca",    "CP850" },
  549:     { "ca_ES", "CP850" },
  550:     { "cs",    "CP852" },
  551:     { "cs_CZ", "CP852" },
  552:     { "da",    "CP865" }, /* not CP850 ?? */
  553:     { "da_DK", "CP865" }, /* not CP850 ?? */
  554:     { "de",    "CP850" },
  555:     { "de_AT", "CP850" },
  556:     { "de_CH", "CP850" },
  557:     { "de_DE", "CP850" },
  558:     { "el",    "CP869" },
  559:     { "el_GR", "CP869" },
  560:     { "en",    "CP850" },
  561:     { "en_AU", "CP850" }, /* not CP437 ?? */
  562:     { "en_CA", "CP850" },
  563:     { "en_GB", "CP850" },
  564:     { "en_NZ", "CP437" },
  565:     { "en_US", "CP437" },
  566:     { "en_ZA", "CP850" }, /* not CP437 ?? */
  567:     { "eo",    "CP850" },
  568:     { "eo_EO", "CP850" },
  569:     { "es",    "CP850" },
  570:     { "es_AR", "CP850" },
  571:     { "es_BO", "CP850" },
  572:     { "es_CL", "CP850" },
  573:     { "es_CO", "CP850" },
  574:     { "es_CR", "CP850" },
  575:     { "es_CU", "CP850" },
  576:     { "es_DO", "CP850" },
  577:     { "es_EC", "CP850" },
  578:     { "es_ES", "CP850" },
  579:     { "es_GT", "CP850" },
  580:     { "es_HN", "CP850" },
  581:     { "es_MX", "CP850" },
  582:     { "es_NI", "CP850" },
  583:     { "es_PA", "CP850" },
  584:     { "es_PE", "CP850" },
  585:     { "es_PY", "CP850" },
  586:     { "es_SV", "CP850" },
  587:     { "es_UY", "CP850" },
  588:     { "es_VE", "CP850" },
  589:     { "et",    "CP850" },
  590:     { "et_EE", "CP850" },
  591:     { "eu",    "CP850" },
  592:     { "eu_ES", "CP850" },
  593:     { "fi",    "CP850" },
  594:     { "fi_FI", "CP850" },
  595:     { "fr",    "CP850" },
  596:     { "fr_BE", "CP850" },
  597:     { "fr_CA", "CP850" },
  598:     { "fr_CH", "CP850" },
  599:     { "fr_FR", "CP850" },
  600:     { "ga",    "CP850" },
  601:     { "ga_IE", "CP850" },
  602:     { "gd",    "CP850" },
  603:     { "gd_GB", "CP850" },
  604:     { "gl",    "CP850" },
  605:     { "gl_ES", "CP850" },
  606:     { "he",    "CP862" },
  607:     { "he_IL", "CP862" },
  608:     { "hr",    "CP852" },
  609:     { "hr_HR", "CP852" },
  610:     { "hu",    "CP852" },
  611:     { "hu_HU", "CP852" },
  612:     { "id",    "CP850" }, /* not CP437 ?? */
  613:     { "id_ID", "CP850" }, /* not CP437 ?? */
  614:     { "is",    "CP861" }, /* not CP850 ?? */
  615:     { "is_IS", "CP861" }, /* not CP850 ?? */
  616:     { "it",    "CP850" },
  617:     { "it_CH", "CP850" },
  618:     { "it_IT", "CP850" },
  619:     { "ja",    "CP932" },
  620:     { "ja_JP", "CP932" },
  621:     { "kr",    "CP949" }, /* not CP934 ?? */
  622:     { "kr_KR", "CP949" }, /* not CP934 ?? */
  623:     { "lt",    "CP775" },
  624:     { "lt_LT", "CP775" },
  625:     { "lv",    "CP775" },
  626:     { "lv_LV", "CP775" },
  627:     { "mk",    "CP866" }, /* not CP855 ?? */
  628:     { "mk_MK", "CP866" }, /* not CP855 ?? */
  629:     { "mt",    "CP850" },
  630:     { "mt_MT", "CP850" },
  631:     { "nb",    "CP865" }, /* not CP850 ?? */
  632:     { "nb_NO", "CP865" }, /* not CP850 ?? */
  633:     { "nl",    "CP850" },
  634:     { "nl_BE", "CP850" },
  635:     { "nl_NL", "CP850" },
  636:     { "nn",    "CP865" }, /* not CP850 ?? */
  637:     { "nn_NO", "CP865" }, /* not CP850 ?? */
  638:     { "no",    "CP865" }, /* not CP850 ?? */
  639:     { "no_NO", "CP865" }, /* not CP850 ?? */
  640:     { "pl",    "CP852" },
  641:     { "pl_PL", "CP852" },
  642:     { "pt",    "CP850" },
  643:     { "pt_BR", "CP850" },
  644:     { "pt_PT", "CP850" },
  645:     { "ro",    "CP852" },
  646:     { "ro_RO", "CP852" },
  647:     { "ru",    "CP866" },
  648:     { "ru_RU", "CP866" },
  649:     { "sk",    "CP852" },
  650:     { "sk_SK", "CP852" },
  651:     { "sl",    "CP852" },
  652:     { "sl_SI", "CP852" },
  653:     { "sq",    "CP852" },
  654:     { "sq_AL", "CP852" },
  655:     { "sr",    "CP852" }, /* CP852 or CP866 or CP855 ?? */
  656:     { "sr_CS", "CP852" }, /* CP852 or CP866 or CP855 ?? */
  657:     { "sr_YU", "CP852" }, /* CP852 or CP866 or CP855 ?? */
  658:     { "sv",    "CP850" },
  659:     { "sv_SE", "CP850" },
  660:     { "th",    "CP874" },
  661:     { "th_TH", "CP874" },
  662:     { "tr",    "CP857" },
  663:     { "tr_TR", "CP857" },
  664:     { "uk",    "CP1125" },
  665:     { "uk_UA", "CP1125" },
  666:     { "zh_CN", "GBK" },
  667:     { "zh_TW", "CP950" } /* not CP938 ?? */
  668: #  define locale_table_defined
  669: # endif
  670: # ifndef locale_table_defined
  671:     /* Just a dummy entry, to avoid a C syntax error when none of the platform sections above was selected.  */
  672:     { "", "" }
  673: # endif
  674:   };
  675: 
  676: #endif
  677: 
  678: 
  679: /* Determine the current locale's character encoding, and canonicalize it
  680:    into one of the canonical names listed in localcharset.h.
  681:    The result must not be freed; it is statically allocated.
  682:    If the canonical name cannot be determined, the result is a non-canonical
  683:    name.  */
  684: 
  685: #ifdef STATIC
  686: STATIC
  687: #endif
  688: const char *
  689: locale_charset (void)
  690: {
  691:   const char *codeset;
  692: 
  693: #if HAVE_LANGINFO_CODESET || defined WINDOWS_NATIVE || defined OS2
  694: 
  695: # if HAVE_LANGINFO_CODESET
  696: 
  697:   /* Most systems support nl_langinfo (CODESET) nowadays.  */
  698:   codeset = nl_langinfo (CODESET);
  699: 
  700: #  ifdef __CYGWIN__
  701:   /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
  702:      returns "US-ASCII".  Return the suffix of the locale name from the
  703:      environment variables (if present) or the codepage as a number.  */
  704:   if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
  705:     {
  706:       const char *locale;
  707:       static char buf[2 + 10 + 1];
  708: 
  709:       locale = getenv ("LC_ALL");
  710:       if (locale == NULL || locale[0] == '\0')
  711:         {
  712:           locale = getenv ("LC_CTYPE");
  713:           if (locale == NULL || locale[0] == '\0')
  714:             locale = getenv ("LANG");
  715:         }
  716:       if (locale != NULL && locale[0] != '\0')
  717:         {
  718:           /* If the locale name contains an encoding after the dot, return
  719:              it.  */
  720:           const char *dot = strchr (locale, '.');
  721: 
  722:           if (dot != NULL)
  723:             {
  724:               const char *modifier;
  725: 
  726:               dot++;
  727:               /* Look for the possible @... trailer and remove it, if any.  */
  728:               modifier = strchr (dot, '@');
  729:               if (modifier == NULL)
  730:                 return dot;
  731:               if (modifier - dot < sizeof (buf))
  732:                 {
  733:                   memcpy (buf, dot, modifier - dot);
  734:                   buf [modifier - dot] = '\0';
  735:                   return buf;
  736:                 }
  737:             }
  738:         }
  739: 
  740:       /* The Windows API has a function returning the locale's codepage as a
  741:          number: GetACP().  This encoding is used by Cygwin, unless the user
  742:          has set the environment variable CYGWIN=codepage:oem (which very few
  743:          people do).
  744:          Output directed to console windows needs to be converted (to
  745:          GetOEMCP() if the console is using a raster font, or to
  746:          GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
  747:          this conversion transparently (see winsup/cygwin/fhandler_console.cc),
  748:          converting to GetConsoleOutputCP().  This leads to correct results,
  749:          except when SetConsoleOutputCP has been called and a raster font is
  750:          in use.  */
  751:       sprintf (buf, "CP%u", GetACP ());
  752:       codeset = buf;
  753:     }
  754: #  endif
  755: 
  756:   if (codeset == NULL)
  757:     /* The canonical name cannot be determined.  */
  758:     codeset = "";
  759: 
  760: # elif defined WINDOWS_NATIVE
  761: 
  762:   static char buf[2 + 10 + 1];
  763: 
  764:   /* The Windows API has a function returning the locale's codepage as
  765:      a number, but the value doesn't change according to what the
  766:      'setlocale' call specified.  So we use it as a last resort, in
  767:      case the string returned by 'setlocale' doesn't specify the
  768:      codepage.  */
  769:   char *current_locale = setlocale (LC_ALL, NULL);
  770:   char *pdot;
  771: 
  772:   /* If they set different locales for different categories,
  773:      'setlocale' will return a semi-colon separated list of locale
  774:      values.  To make sure we use the correct one, we choose LC_CTYPE.  */
  775:   if (strchr (current_locale, ';'))
  776:     current_locale = setlocale (LC_CTYPE, NULL);
  777: 
  778:   pdot = strrchr (current_locale, '.');
  779:   if (pdot && 2 + strlen (pdot + 1) + 1 <= sizeof (buf))
  780:     sprintf (buf, "CP%s", pdot + 1);
  781:   else
  782:     {
  783:       /* The Windows API has a function returning the locale's codepage as a
  784:         number: GetACP().
  785:         When the output goes to a console window, it needs to be provided in
  786:         GetOEMCP() encoding if the console is using a raster font, or in
  787:         GetConsoleOutputCP() encoding if it is using a TrueType font.
  788:         But in GUI programs and for output sent to files and pipes, GetACP()
  789:         encoding is the best bet.  */
  790:       sprintf (buf, "CP%u", GetACP ());
  791:     }
  792:   codeset = buf;
  793: 
  794: # elif defined OS2
  795: 
  796:   const char *locale;
  797:   static char buf[2 + 10 + 1];
  798:   ULONG cp[3];
  799:   ULONG cplen;
  800: 
  801:   codeset = NULL;
  802: 
  803:   /* Allow user to override the codeset, as set in the operating system,
  804:      with standard language environment variables.  */
  805:   locale = getenv ("LC_ALL");
  806:   if (locale == NULL || locale[0] == '\0')
  807:     {
  808:       locale = getenv ("LC_CTYPE");
  809:       if (locale == NULL || locale[0] == '\0')
  810:         locale = getenv ("LANG");
  811:     }
  812:   if (locale != NULL && locale[0] != '\0')
  813:     {
  814:       /* If the locale name contains an encoding after the dot, return it.  */
  815:       const char *dot = strchr (locale, '.');
  816: 
  817:       if (dot != NULL)
  818:         {
  819:           const char *modifier;
  820: 
  821:           dot++;
  822:           /* Look for the possible @... trailer and remove it, if any.  */
  823:           modifier = strchr (dot, '@');
  824:           if (modifier == NULL)
  825:             return dot;
  826:           if (modifier - dot < sizeof (buf))
  827:             {
  828:               memcpy (buf, dot, modifier - dot);
  829:               buf [modifier - dot] = '\0';
  830:               return buf;
  831:             }
  832:         }
  833: 
  834:       /* For the POSIX locale, don't use the system's codepage.  */
  835:       if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0)
  836:         codeset = "";
  837:     }
  838: 
  839:   if (codeset == NULL)
  840:     {
  841:       /* OS/2 has a function returning the locale's codepage as a number.  */
  842:       if (DosQueryCp (sizeof (cp), cp, &cplen))
  843:         codeset = "";
  844:       else
  845:         {
  846:           sprintf (buf, "CP%u", cp[0]);
  847:           codeset = buf;
  848:         }
  849:     }
  850: 
  851: # else
  852: 
  853: #  error "Add code for other platforms here."
  854: 
  855: # endif
  856: 
  857:   /* Resolve alias.  */
  858:   {
  859: # ifdef alias_table_defined
  860:     /* On some platforms, UTF-8 locales are the most frequently used ones.
  861:        Speed up the common case and slow down the less common cases by
  862:        testing for this case first.  */
  863: #  if defined __OpenBSD__ || (defined __APPLE__ && defined __MACH__) || defined __sun || defined __CYGWIN__
  864:     if (strcmp (codeset, "UTF-8") == 0)
  865:       goto done_table_lookup;
  866:     else
  867: #  endif
  868:       {
  869:         const struct table_entry * const table = alias_table;
  870:         size_t const table_size =
  871:           sizeof (alias_table) / sizeof (struct table_entry);
  872:         /* The table is sorted.  Perform a binary search.  */
  873:         size_t hi = table_size;
  874:         size_t lo = 0;
  875:         while (lo < hi)
  876:           {
  877:             /* Invariant:
  878:                for i < lo, strcmp (table[i].alias, codeset) < 0,
  879:                for i >= hi, strcmp (table[i].alias, codeset) > 0.  */
  880:             size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
  881:             int cmp = strcmp (table[mid].alias, codeset);
  882:             if (cmp < 0)
  883:               lo = mid + 1;
  884:             else if (cmp > 0)
  885:               hi = mid;
  886:             else
  887:               {
  888:                 /* Found an i with
  889:                      strcmp (table[i].alias, codeset) == 0.  */
  890:                 codeset = table[mid].canonical;
  891:                 goto done_table_lookup;
  892:               }
  893:           }
  894:       }
  895:     if (0)
  896:       done_table_lookup: ;
  897:     else
  898: # endif
  899:       {
  900:         /* Did not find it in the table.  */
  901:         /* On Mac OS X, all modern locales use the UTF-8 encoding.
  902:            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
  903: # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
  904:         codeset = "UTF-8";
  905: # else
  906:         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
  907:            the empty string as denoting "the locale's character encoding",
  908:            thus GNU libiconv would call this function a second time.  */
  909:         if (codeset[0] == '\0')
  910:           codeset = "ASCII";
  911: # endif
  912:       }
  913:   }
  914: 
  915: #else
  916: 
  917:   /* On old systems which lack it, use setlocale or getenv.  */
  918:   const char *locale = NULL;
  919: 
  920:   /* But most old systems don't have a complete set of locales.  Some
  921:      (like DJGPP) have only the C locale.  Therefore we don't use setlocale
  922:      here; it would return "C" when it doesn't support the locale name the
  923:      user has set.  */
  924: # if 0
  925:   locale = setlocale (LC_CTYPE, NULL);
  926: # endif
  927:   if (locale == NULL || locale[0] == '\0')
  928:     {
  929:       locale = getenv ("LC_ALL");
  930:       if (locale == NULL || locale[0] == '\0')
  931:         {
  932:           locale = getenv ("LC_CTYPE");
  933:           if (locale == NULL || locale[0] == '\0')
  934:             locale = getenv ("LANG");
  935:             if (locale == NULL)
  936:               locale = "";
  937:         }
  938:     }
  939: 
  940:   /* Map locale name to canonical encoding name.  */
  941:   {
  942: # ifdef locale_table_defined
  943:     const struct table_entry * const table = locale_table;
  944:     size_t const table_size =
  945:       sizeof (locale_table) / sizeof (struct table_entry);
  946:     /* The table is sorted.  Perform a binary search.  */
  947:     size_t hi = table_size;
  948:     size_t lo = 0;
  949:     while (lo < hi)
  950:       {
  951:         /* Invariant:
  952:            for i < lo, strcmp (table[i].locale, locale) < 0,
  953:            for i >= hi, strcmp (table[i].locale, locale) > 0.  */
  954:         size_t mid = (hi + lo) >> 1; /* >= lo, < hi */
  955:         int cmp = strcmp (table[mid].locale, locale);
  956:         if (cmp < 0)
  957:           lo = mid + 1;
  958:         else if (cmp > 0)
  959:           hi = mid;
  960:         else
  961:           {
  962:             /* Found an i with
  963:                  strcmp (table[i].locale, locale) == 0.  */
  964:             codeset = table[mid].canonical;
  965:             goto done_table_lookup;
  966:           }
  967:       }
  968:     if (0)
  969:       done_table_lookup: ;
  970:     else
  971: # endif
  972:       {
  973:         /* Did not find it in the table.  */
  974:         /* On Mac OS X, all modern locales use the UTF-8 encoding.
  975:            BeOS and Haiku have a single locale, and it has UTF-8 encoding.  */
  976: # if (defined __APPLE__ && defined __MACH__) || defined __BEOS__ || defined __HAIKU__
  977:         codeset = "UTF-8";
  978: # else
  979:         /* The canonical name cannot be determined.  */
  980:         /* Don't return an empty string.  GNU libc and GNU libiconv interpret
  981:            the empty string as denoting "the locale's character encoding",
  982:            thus GNU libiconv would call this function a second time.  */
  983:         codeset = "ASCII";
  984: # endif
  985:       }
  986:   }
  987: 
  988: #endif
  989: 
  990: #ifdef DARWIN7
  991:   /* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
  992:      (the default codeset) does not work when MB_CUR_MAX is 1.  */
  993:   if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)
  994:     codeset = "ASCII";
  995: #endif
  996: 
  997:   return codeset;
  998: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>