Annotation of embedaddon/libiconv/NOTES, revision 1.1

1.1     ! misho       1: Q: Why does libiconv support encoding XXX? Why does libiconv not support
        !             2:    encoding ZZZ?
        !             3: 
        !             4: A: libiconv, as an internationalization library, supports those character
        !             5:    sets and encodings which are in wide-spread use in at least one territory
        !             6:    of the world.
        !             7: 
        !             8:    Hint1: On http://www.w3c.org/International/O-charset-lang.html you find a
        !             9:    page "Languages, countries, and the charsets typically used for them".
        !            10:    From this table, we can conclude that the following are in active use:
        !            11: 
        !            12:      ISO-8859-1, CP1252   Afrikaans, Albanian, Basque, Catalan, Danish, Dutch,
        !            13:                           English, Faroese, Finnish, French, Galician, German,
        !            14:                           Icelandic, Irish, Italian, Norwegian, Portuguese,
        !            15:                           Scottish, Spanish, Swedish
        !            16:      ISO-8859-2           Croatian, Czech, Hungarian, Polish, Romanian, Slovak,
        !            17:                           Slovenian
        !            18:      ISO-8859-3           Esperanto, Maltese
        !            19:      ISO-8859-5           Bulgarian, Byelorussian, Macedonian, Russian,
        !            20:                           Serbian, Ukrainian
        !            21:      ISO-8859-6           Arabic
        !            22:      ISO-8859-7           Greek
        !            23:      ISO-8859-8           Hebrew
        !            24:      ISO-8859-9, CP1254   Turkish
        !            25:      ISO-8859-10          Inuit, Lapp
        !            26:      ISO-8859-13          Latvian, Lithuanian
        !            27:      ISO-8859-15          Estonian
        !            28:      KOI8-R               Russian
        !            29:      SHIFT_JIS            Japanese
        !            30:      ISO-2022-JP          Japanese
        !            31:      EUC-JP               Japanese
        !            32: 
        !            33:    Ordered by frequency on the web (1997):
        !            34:      ISO-8859-1, CP1252   96%
        !            35:      SHIFT_JIS             1.6%
        !            36:      ISO-2022-JP           1.2%
        !            37:      EUC-JP                0.4%
        !            38:      CP1250                0.3%
        !            39:      CP1251                0.2%
        !            40:      CP850                 0.1%
        !            41:      MACINTOSH             0.1%
        !            42:      ISO-8859-5            0.1%
        !            43:      ISO-8859-2            0.0%
        !            44: 
        !            45:    Hint2: The character sets mentioned in the XFree86 4.0 locale.alias file.
        !            46: 
        !            47:      ISO-8859-1           Afrikaans, Basque, Breton, Catalan, Danish, Dutch,
        !            48:                           English, Estonian, Faroese, Finnish, French,
        !            49:                           Galician, German, Greenlandic, Icelandic,
        !            50:                           Indonesian, Irish, Italian, Lithuanian, Norwegian,
        !            51:                           Occitan, Portuguese, Scottish, Spanish, Swedish,
        !            52:                           Walloon, Welsh
        !            53:      ISO-8859-2           Albanian, Croatian, Czech, Hungarian, Polish,
        !            54:                           Romanian, Serbian, Slovak, Slovenian
        !            55:      ISO-8859-3           Esperanto
        !            56:      ISO-8859-4           Estonian, Latvian, Lithuanian
        !            57:      ISO-8859-5           Bulgarian, Byelorussian, Macedonian, Russian,
        !            58:                           Serbian, Ukrainian
        !            59:      ISO-8859-6           Arabic
        !            60:      ISO-8859-7           Greek
        !            61:      ISO-8859-8           Hebrew
        !            62:      ISO-8859-9           Turkish
        !            63:      ISO-8859-14          Breton, Irish, Scottish, Welsh
        !            64:      ISO-8859-15          Basque, Breton, Catalan, Danish, Dutch, Estonian,
        !            65:                           Faroese, Finnish, French, Galician, German,
        !            66:                           Greenlandic, Icelandic, Irish, Italian, Lithuanian,
        !            67:                           Norwegian, Occitan, Portuguese, Scottish, Spanish,
        !            68:                           Swedish, Walloon, Welsh
        !            69:      KOI8-R               Russian
        !            70:      KOI8-U               Russian, Ukrainian
        !            71:      EUC-JP (alias eucJP)      Japanese
        !            72:      ISO-2022-JP (alias JIS7)  Japanese
        !            73:      SHIFT_JIS (alias SJIS)    Japanese
        !            74:      U90                       Japanese
        !            75:      S90                       Japanese
        !            76:      EUC-CN (alias eucCN)      Chinese
        !            77:      EUC-TW (alias eucTW)      Chinese
        !            78:      BIG5                      Chinese
        !            79:      EUC-KR (alias eucKR)      Korean
        !            80:      ARMSCII-8                 Armenian
        !            81:      GEORGIAN-ACADEMY          Georgian
        !            82:      GEORGIAN-PS               Georgian
        !            83:      TIS-620 (alias TACTIS)    Thai
        !            84:      MULELAO-1                 Laothian
        !            85:      IBM-CP1133                Laothian
        !            86:      VISCII                    Vietnamese
        !            87:      TCVN                      Vietnamese
        !            88:      NUNACOM-8                 Inuktitut
        !            89: 
        !            90:    Hint3: The character sets supported by Netscape Communicator 4.
        !            91: 
        !            92:      Where is this documented? For the complete picture, I had to use
        !            93:      "strings netscape" and then a lot of guesswork. For a quick take,
        !            94:      look at the "View - Character set" menu of Netscape Communicator 4.6:
        !            95: 
        !            96:      ISO-8859-{1,2,5,7,9,15}
        !            97:      WINDOWS-{1250,1251,1253}
        !            98:      KOI8-R               Cyrillic
        !            99:      CP866                Cyrillic
        !           100:      Autodetect           Japanese  (EUC-JP, ISO-2022-JP, ISO-2022-JP-2, SJIS)
        !           101:      EUC-JP               Japanese
        !           102:      SHIFT_JIS            Japanese
        !           103:      GB2312               Chinese
        !           104:      BIG5                 Chinese
        !           105:      EUC-TW               Chinese
        !           106:      Autodetect           Korean    (EUC-KR, ISO-2022-KR, but not JOHAB)
        !           107: 
        !           108:      UTF-8
        !           109:      UTF-7
        !           110: 
        !           111:    Hint4: The character sets supported by Microsoft Internet Explorer 4.
        !           112: 
        !           113:      ISO-8859-{1,2,3,4,5,6,7,8,9}
        !           114:      WINDOWS-{1250,1251,1252,1253,1254,1255,1256,1257}
        !           115:      KOI8-R               Cyrillic
        !           116:      KOI8-RU              Ukrainian
        !           117:      ASMO-708             Arabic
        !           118:      EUC-JP               Japanese
        !           119:      ISO-2022-JP          Japanese
        !           120:      SHIFT_JIS            Japanese
        !           121:      GB2312               Chinese
        !           122:      HZ-GB-2312           Chinese
        !           123:      BIG5                 Chinese
        !           124:      EUC-KR               Korean
        !           125:      ISO-2022-KR          Korean
        !           126:      WINDOWS-874          Thai
        !           127:      WINDOWS-1258         Vietnamese
        !           128: 
        !           129:      UTF-8
        !           130:      UTF-7
        !           131:      UNICODE             actually UNICODE-LITTLE
        !           132:      UNICODEFEFF         actually UNICODE-BIG
        !           133: 
        !           134:      and various DOS character sets: DOS-720, DOS-862, IBM852, CP866.
        !           135: 
        !           136:    We take the union of all these four sets. The result is:
        !           137: 
        !           138:    European and Semitic languages
        !           139:      * ASCII.
        !           140:        We implement this because it is occasionally useful to know or to
        !           141:        check whether some text is entirely ASCII (i.e. if the conversion
        !           142:        ISO-8859-x -> UTF-8 is trivial).
        !           143:      * ISO-8859-{1,2,3,4,5,6,7,8,9,10}
        !           144:        We implement these because they are widely used. Except ISO-8859-4
        !           145:        which appears to have been superseded by ISO-8859-13 in the Baltic
        !           146:        countries. But it's an ISO standard anyway.
        !           147:      * ISO-8859-13
        !           148:        We implement this because it's a standard in Lithuania and Latvia.
        !           149:      * ISO-8859-14
        !           150:        We implement this because it's an ISO standard.
        !           151:      * ISO-8859-15
        !           152:        We implement this because it's increasingly used in Europe, because
        !           153:        of the Euro symbol.
        !           154:      * ISO-8859-16
        !           155:        We implement this because it's an ISO standard.
        !           156:      * KOI8-R, KOI8-U
        !           157:        We implement these because they appear to be the predominant encodings
        !           158:        on Unix in Russia and Ukraine, respectively.
        !           159:      * KOI8-RU
        !           160:        We implement this because MSIE4 supports it.
        !           161:      * KOI8-T
        !           162:        We implement this because it is the locale encoding in glibc's Tajik
        !           163:        locale.
        !           164:      * PT154
        !           165:        We implement this because it is the locale encoding in glibc's Kazakh
        !           166:        locale.
        !           167:      * RK1048
        !           168:        We implement this because it's a standard in Kazakhstan.
        !           169:      * CP{1250,1251,1252,1253,1254,1255,1256,1257}
        !           170:        We implement these because they are the predominant Windows encodings
        !           171:        in Europe.
        !           172:      * CP850
        !           173:        We implement this because it is mentioned as occurring in the web
        !           174:        in the aforementioned statistics.
        !           175:      * CP862
        !           176:        We implement this because Ron Aaron says it is sometimes used in web
        !           177:        pages and emails.
        !           178:      * CP866
        !           179:        We implement this because Netscape Communicator does.
        !           180:      * CP1131
        !           181:        We implement this because it is the locale encoding of a Belarusian
        !           182:        locale in FreeBSD and MacOS X.
        !           183:      * Mac{Roman,CentralEurope,Croatian,Romania,Cyrillic,Greek,Turkish} and
        !           184:        Mac{Hebrew,Arabic}
        !           185:        We implement these because the Sun JDK does, and because Mac users
        !           186:        don't deserve to be punished.
        !           187:      * Macintosh
        !           188:        We implement this because it is mentioned as occurring in the web
        !           189:        in the aforementioned statistics.
        !           190:    Japanese
        !           191:      * EUC-JP, SHIFT_JIS, ISO-2022-JP
        !           192:        We implement these because they are widely used. EUC-JP and SHIFT_JIS
        !           193:        are more used for files, whereas ISO-2022-JP is recommended for email.
        !           194:      * CP932
        !           195:        We implement this because it is the Microsoft variant of SHIFT_JIS,
        !           196:        used on Windows.
        !           197:      * ISO-2022-JP-2
        !           198:        We implement this because it's the common way to represent mails which
        !           199:        make use of JIS X 0212 characters.
        !           200:      * ISO-2022-JP-1
        !           201:        We implement this because it's in the RFCs, but I don't think it is
        !           202:        really used.
        !           203:      * U90, S90
        !           204:        We DON'T implement these because I have no information about what they
        !           205:        are or who uses them.
        !           206:    Simplified Chinese
        !           207:      * EUC-CN = GB2312
        !           208:        We implement this because it is the widely used representation
        !           209:        of simplified Chinese.
        !           210:      * GBK
        !           211:        We implement this because it appears to be used on Solaris and Windows.
        !           212:      * GB18030
        !           213:        We implement this because it is an official requirement in the
        !           214:        People's Republic of China.
        !           215:      * ISO-2022-CN
        !           216:        We implement this because it is in the RFCs, but I have no idea
        !           217:        whether it is really used.
        !           218:      * ISO-2022-CN-EXT
        !           219:        We implement this because it's in the RFCs, but I don't think it is
        !           220:        really used.
        !           221:      * HZ = HZ-GB-2312
        !           222:        We implement this because the RFCs recommend it for Usenet postings,
        !           223:        and because MSIE4 supports it.
        !           224:    Traditional Chinese
        !           225:      * EUC-TW
        !           226:        We implement it because it appears to be used on Unix.
        !           227:      * BIG5
        !           228:        We implement it because it is the de-facto standard for traditional
        !           229:        Chinese.
        !           230:      * CP950
        !           231:        We implement this because it is the Microsoft variant of BIG5, used
        !           232:        on Windows.
        !           233:      * BIG5+
        !           234:        We DON'T implement this because it doesn't appear to be in wide use.
        !           235:        Only the CWEX fonts use this encoding. Furthermore, the conversion
        !           236:        tables in the big5p package are not coherent: If you convert directly,
        !           237:        you get different results than when you convert via GBK.
        !           238:      * BIG5-HKSCS
        !           239:        We implement it because it is the de-facto standard for traditional
        !           240:        Chinese in Hong Kong.
        !           241:    Korean
        !           242:      * EUC-KR
        !           243:        We implement this because it appears to be the widely used
        !           244:        representation for Korean.
        !           245:      * CP949
        !           246:        We implement this because it is the Microsoft variant of EUC-KR, used
        !           247:        on Windows.
        !           248:      * ISO-2022-KR
        !           249:        We implement it because it is in the RFCs and because MSIE4 supports
        !           250:        it, but I have no idea whether it's really used.
        !           251:      * JOHAB
        !           252:        We implement this because it is apparently used on Windows as a locale
        !           253:        encoding (codepage 1361).
        !           254:      * ISO-646-KR
        !           255:        We DON'T implement this because although an old ASCII variant, its
        !           256:        glyph for 0x7E is not clear: RFC 1345 and unicode.org's JOHAB.TXT
        !           257:        say it's a tilde, but Ken Lunde's "CJKV information processing" says
        !           258:        it's an overline. And it is not ISO-IR registered.
        !           259:    Armenian
        !           260:      * ARMSCII-8
        !           261:        We implement it because XFree86 supports it.
        !           262:    Georgian
        !           263:      * Georgian-Academy, Georgian-PS
        !           264:        We implement these because they appear to be both used for Georgian;
        !           265:        XFree86 supports them.
        !           266:    Thai
        !           267:      * ISO-8859-11, TIS-620
        !           268:        We implement these because they seem to be standard for Thai.
        !           269:      * CP874
        !           270:        We implement this because MSIE4 supports it.
        !           271:      * MacThai
        !           272:        We implement this because the Sun JDK does, and because Mac users
        !           273:        don't deserve to be punished.
        !           274:    Laotian
        !           275:      * MuleLao-1, CP1133
        !           276:        We implement these because XFree86 supports them. I have no idea which
        !           277:        one is used more widely.
        !           278:    Vietnamese
        !           279:      * VISCII, TCVN
        !           280:        We implement these because XFree86 supports them.
        !           281:      * CP1258
        !           282:        We implement this because MSIE4 supports it.
        !           283:    Other languages
        !           284:      * NUNACOM-8 (Inuktitut)
        !           285:        We DON'T implement this because it isn't part of Unicode yet, and
        !           286:        therefore doesn't convert to anything except itself.
        !           287:    Platform specifics
        !           288:      * HP-ROMAN8, NEXTSTEP
        !           289:        We implement these because they were the native character set on HPs
        !           290:        and NeXTs for a long time, and libiconv is intended to be usable on
        !           291:        these old machines.
        !           292:    Full Unicode
        !           293:      * UTF-8, UCS-2, UCS-4
        !           294:        We implement these. Obviously.
        !           295:      * UCS-2BE, UCS-2LE, UCS-4BE, UCS-4LE
        !           296:        We implement these because they are the preferred internal
        !           297:        representation of strings in Unicode aware applications. These are
        !           298:        non-ambiguous names, known to glibc. (glibc doesn't have
        !           299:        UCS-2-INTERNAL and UCS-4-INTERNAL.)
        !           300:      * UTF-16, UTF-16BE, UTF-16LE
        !           301:        We implement these, because UTF-16 is still the favourite encoding of
        !           302:        the president of the Unicode Consortium (for political reasons), and
        !           303:        because they appear in RFC 2781.
        !           304:      * UTF-32, UTF-32BE, UTF-32LE
        !           305:        We implement these because they are part of Unicode 3.1.
        !           306:      * UTF-7
        !           307:        We implement this because it is essential functionality for mail
        !           308:        applications.
        !           309:      * C99
        !           310:        We implement it because it's used for C and C++ programs and because
        !           311:        it's a nice encoding for debugging.
        !           312:      * JAVA
        !           313:        We implement it because it's used for Java programs and because it's
        !           314:        a nice encoding for debugging.
        !           315:      * UNICODE (little endian), UNICODEFEFF (big endian)
        !           316:        We DON'T implement these because they are stupid and not standardized.
        !           317:    Full Unicode, in terms of `uint16_t' or `uint32_t'
        !           318:    (with machine dependent endianness and alignment)
        !           319:      * UCS-2-INTERNAL, UCS-4-INTERNAL
        !           320:        We implement these because they are the preferred internal
        !           321:        representation of strings in Unicode aware applications.
        !           322: 
        !           323: Q: Support encodings mentioned in RFC 1345 ?
        !           324: A: No, they are not in use any more. Supporting ISO-646 variants is pointless
        !           325:    since ISO-8859-* have been adopted.
        !           326: 
        !           327: Q: Support EBCDIC ?
        !           328: A: No!
        !           329: 
        !           330: Q: How do I add a new character set?
        !           331: A: 1. Explain the "why" in this file, above.
        !           332:    2. You need to have a conversion table from/to Unicode. Transform it into
        !           333:    the format used by the mapping tables found on ftp.unicode.org: each line
        !           334:    contains the character code, in hex, with 0x prefix, then whitespace,
        !           335:    then the Unicode code point, in hex, 4 hex digits, with 0x prefix. '#'
        !           336:    counts as a comment delimiter until end of line.
        !           337:    Please also send your table to Mark Leisher <mleisher@crl.nmsu.edu> so he
        !           338:    can include it in his collection.
        !           339:    3. If it's an 8-bit character set, use the '8bit_tab_to_h' program in the
        !           340:    tools directory to generate the C code for the conversion. You may tweak
        !           341:    the resulting C code if you are not satisfied with its quality, but this
        !           342:    is rarely needed.
        !           343:    If it's a two-dimensional character set (with rows and columns), use the
        !           344:    'cjk_tab_to_h' program in the tools directory to generate the C code for
        !           345:    the conversion. You will need to modify the main() function to recognize
        !           346:    the new character set name, with the proper dimensions, but that shouldn't
        !           347:    be too hard. This yields the CCS. The CES you have to write by hand.
        !           348:    4. Store the resulting C code file in the lib directory. Add a #include
        !           349:    directive to converters.h, and add an entry to the encodings.def file.
        !           350:    5. Compile the package, and test your new encoding using a program like
        !           351:    iconv(1) or clisp(1).
        !           352:    6. Augment the testsuite: Add a line to tests/Makefile.in. For a stateless
        !           353:    encoding, create the complete table as a TXT file. For a stateful encoding,
        !           354:    provide a text snippet encoded using your new encoding and its UTF-8
        !           355:    equivalent.
        !           356:    7. Update the README and man/iconv_open.3, to mention the new encoding.
        !           357:    Add a note in the NEWS file.
        !           358: 
        !           359: Q: What about bidirectional text? Should it be tagged or reversed when
        !           360:    converting from ISO-8859-8 or ISO-8859-6 to Unicode? Qt appears to do
        !           361:    this, see qt-2.0.1/src/tools/qrtlcodec.cpp.
        !           362: A: After reading RFC 1556: I don't think so. Support for ISO-8859-8-I and
        !           363:    ISO-8859-8-E remains to be implemented.
        !           364:    On the other hand, a page on www.w3c.org says that ISO-8859-8 in *email*
        !           365:    is visually encoded, ISO-8859-8 in *HTML* is logically encoded, i.e.
        !           366:    the same as ISO-8859-8-I. I'm confused.
        !           367: 
        !           368: Other character sets not implemented:
        !           369: "MNEMONIC" = "csMnemonic"
        !           370: "MNEM" = "csMnem"
        !           371: "ISO-10646-UCS-Basic" = "csUnicodeASCII"
        !           372: "ISO-10646-Unicode-Latin1" = "csUnicodeLatin1" = "ISO-10646"
        !           373: "ISO-10646-J-1"
        !           374: "UNICODE-1-1" = "csUnicode11"
        !           375: "csWindows31Latin5"
        !           376: 
        !           377: Other aliases not implemented (and not implemented in glibc-2.1 either):
        !           378:   From MSIE4:
        !           379:     ISO-8859-1: alias ISO8859-1
        !           380:     ISO-8859-2: alias ISO8859-2
        !           381:     KSC_5601: alias KS_C_5601
        !           382:     UTF-8: aliases UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8
        !           383: 
        !           384: 
        !           385: Q: How can I integrate libiconv into my package?
        !           386: A: Just copy the entire libiconv package into a subdirectory of your package.
        !           387:    At configuration time, call libiconv's configure script with the
        !           388:    appropriate --srcdir option and maybe --enable-static or --disable-shared.
        !           389:    Then "cd libiconv && make && make install-lib libdir=... includedir=...".
        !           390:    'install-lib' is a special (not GNU standardized) target which installs
        !           391:    only the include file - in $(includedir) - and the library - in $(libdir) -
        !           392:    and does not use other directory variables. After "installing" libiconv
        !           393:    in your package's build directory, building of your package can proceed.
        !           394: 
        !           395: Q: Why is the testsuite so big?
        !           396: A: Because some of the tests are very comprehensive.
        !           397:    If you don't feel like using the testsuite, you can simply remove the
        !           398:    tests/ directory.
        !           399: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>