Annotation of embedaddon/php/ext/json/utf8_decode.c, revision 1.1

1.1     ! misho       1: /* utf8_decode.c */
        !             2: 
        !             3: /* 2005-12-25 */
        !             4: 
        !             5: /*
        !             6: Copyright (c) 2005 JSON.org
        !             7: 
        !             8: Permission is hereby granted, free of charge, to any person obtaining a copy
        !             9: of this software and associated documentation files (the "Software"), to deal
        !            10: in the Software without restriction, including without limitation the rights
        !            11: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        !            12: copies of the Software, and to permit persons to whom the Software is
        !            13: furnished to do so, subject to the following conditions:
        !            14: 
        !            15: The above copyright notice and this permission notice shall be included in all
        !            16: copies or substantial portions of the Software.
        !            17: 
        !            18: The Software shall be used for Good, not Evil.
        !            19: 
        !            20: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
        !            21: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
        !            22: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
        !            23: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
        !            24: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
        !            25: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
        !            26: SOFTWARE.
        !            27: */
        !            28: 
        !            29: #include "utf8_decode.h"
        !            30: 
        !            31: /*
        !            32:     Very Strict UTF-8 Decoder
        !            33: 
        !            34:     UTF-8 is a multibyte character encoding of Unicode. A character can be
        !            35:     represented by 1-4 bytes. The bit pattern of the first byte indicates the
        !            36:     number of continuation bytes.
        !            37: 
        !            38:     Most UTF-8 decoders tend to be lenient, attempting to recover as much
        !            39:     information as possible, even from badly encoded input. This UTF-8
        !            40:     decoder is not lenient. It will reject input which does not include
        !            41:     proper continuation bytes. It will reject aliases (or suboptimal
        !            42:     codings). It will reject surrogates. (Surrogate encoding should only be
        !            43:     used with UTF-16.)
        !            44: 
        !            45:     Code    Continuation Minimum Maximum
        !            46:     0xxxxxxx           0       0     127
        !            47:     10xxxxxx       error
        !            48:     110xxxxx           1     128    2047
        !            49:     1110xxxx           2    2048   65535 excluding 55296 - 57343
        !            50:     11110xxx           3   65536 1114111
        !            51:     11111xxx       error
        !            52: */
        !            53: 
        !            54: 
        !            55: /*
        !            56:     Fetch the byte at the_index, masked to 0..255, and step the_index past
        !            57:     it. Yields UTF8_END once the_index is no longer below the_length.
        !            58: */
        !            59: static int
        !            60: get(json_utf8_decode *utf8)
        !            61: {
        !            62:     if (utf8->the_index < utf8->the_length) {
        !            63:         int octet = utf8->the_input[utf8->the_index] & 0xFF;
        !            64:         utf8->the_index += 1;
        !            65:         return octet;
        !            66:     }
        !            67:     return UTF8_END;
        !            68: }
        !            69: 
        !            70: 
        !            71: /* Payload (low six bits) of the next byte if it is 10xxxxxx, else UTF8_ERROR. */
        !            72: static int
        !            73: cont(json_utf8_decode *utf8)
        !            74: {
        !            75:     int octet = get(utf8);
        !            76:     if ((octet & 0xC0) != 0x80) {
        !            77:         return UTF8_ERROR;
        !            78:     }
        !            79:     return octet & 0x3F;
        !            80: }
        !            81: 
        !            82: 
        !            83: /*
        !            84:     Prepare *utf8 to decode the `length` bytes starting at `p`. The buffer
        !            85:     is not copied; only the pointer is stored.
        !            86: */
        !            87: void
        !            88: utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
        !            89: {
        !            90:     utf8->the_input = p;
        !            91:     utf8->the_length = length;
        !            92:     /* Read position, character count and last start offset all begin at 0. */
        !            93:     utf8->the_index = utf8->the_char = utf8->the_byte = 0;
        !            94: }
        !            95: 
        !            96: 
        !            97: /* Offset of the first byte of the character most recently started by
        !            98:    utf8_decode_next (0 right after init); handy for error reports. */
        !            99: int
        !           100: utf8_decode_at_byte(json_utf8_decode *utf8)
        !           101: {
        !           102:     int offset = utf8->the_byte;
        !           103:     return offset;
        !           104: }
        !           105: 
        !           106: 
        !           107: /* Zero-based index of the latest character (equals byte offset for ASCII). */
        !           108: int
        !           109: utf8_decode_at_character(json_utf8_decode *utf8)
        !           110: {
        !           111:     if (utf8->the_char <= 0) {
        !           112:         return 0;
        !           113:     }
        !           114:     return utf8->the_char - 1;
        !           115: }
        !           116: 
        !           117: 
        !           118: /*
        !           119:     Decode the character that starts at the current position.
        !           120:     Returns: the code point (between 0 and 0x10FFFF)
        !           121:          or  UTF8_END   (the position is exactly the end of the input)
        !           122:          or  UTF8_ERROR (bad lead or trailing byte, overlong form,
        !           123:                          surrogate, or value above 0x10FFFF)
        !           124: */
        !           125: int
        !           126: utf8_decode_next(json_utf8_decode *utf8)
        !           127: {
        !           128:     int lead;        /* first byte of the sequence */
        !           129:     int code;        /* assembled code point */
        !           130:     int t1, t2, t3;  /* payloads of the trailing bytes */
        !           131:     if (utf8->the_index == utf8->the_length) {
        !           132:         return UTF8_END;
        !           133:     }
        !           134:     if (utf8->the_index > utf8->the_length) {
        !           135:         return UTF8_ERROR;
        !           136:     }
        !           137:     /* Count this character and remember where it starts. */
        !           138:     utf8->the_char += 1;
        !           139:     utf8->the_byte = utf8->the_index;
        !           140:     lead = get(utf8);
        !           141:     /* 0xxxxxxx: single byte (0 to 0x7F). */
        !           142:     if (!(lead & 0x80)) {
        !           143:         return lead;
        !           144:     }
        !           145:     /* 110xxxxx: one trailing byte (0x80 to 0x7FF). */
        !           146:     if ((lead & 0xE0) == 0xC0) {
        !           147:         t1 = cont(utf8);
        !           148:         if (t1 < 0) {
        !           149:             return UTF8_ERROR;
        !           150:         }
        !           151:         code = ((lead & 0x1F) << 6) | t1;
        !           152:         return code < 0x80 ? UTF8_ERROR : code;
        !           153:     }
        !           154:     /* 1110xxxx: two trailing bytes (0x800 to 0xFFFF, no surrogates). */
        !           155:     if ((lead & 0xF0) == 0xE0) {
        !           156:         t1 = cont(utf8);
        !           157:         t2 = cont(utf8);
        !           158:         if (t1 < 0 || t2 < 0) {
        !           159:             return UTF8_ERROR;
        !           160:         }
        !           161:         code = ((lead & 0x0F) << 12) | (t1 << 6) | t2;
        !           162:         if (code < 0x800 || (code >= 0xD800 && code <= 0xDFFF)) {
        !           163:             return UTF8_ERROR;
        !           164:         }
        !           165:         return code;
        !           166:     }
        !           167:     /* 11110xxx: three trailing bytes (0x10000 to 0x10FFFF). */
        !           168:     if ((lead & 0xF8) == 0xF0) {
        !           169:         t1 = cont(utf8);
        !           170:         t2 = cont(utf8);
        !           171:         t3 = cont(utf8);
        !           172:         if (t1 < 0 || t2 < 0 || t3 < 0) {
        !           173:             return UTF8_ERROR;
        !           174:         }
        !           175:         code = ((lead & 0x07) << 18) | (t1 << 12) | (t2 << 6) | t3;
        !           176:         return (code < 0x10000 || code > 0x10FFFF) ? UTF8_ERROR : code;
        !           177:     }
        !           178:     return UTF8_ERROR;
        !           179: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>