Annotation of embedaddon/php/ext/json/utf8_decode.c, revision 1.1.1.1

1.1       misho       1: /* utf8_decode.c */
                      2: 
                      3: /* 2005-12-25 */
                      4: 
                      5: /*
                      6: Copyright (c) 2005 JSON.org
                      7: 
                      8: Permission is hereby granted, free of charge, to any person obtaining a copy
                      9: of this software and associated documentation files (the "Software"), to deal
                     10: in the Software without restriction, including without limitation the rights
                     11: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
                     12: copies of the Software, and to permit persons to whom the Software is
                     13: furnished to do so, subject to the following conditions:
                     14: 
                     15: The above copyright notice and this permission notice shall be included in all
                     16: copies or substantial portions of the Software.
                     17: 
                     18: The Software shall be used for Good, not Evil.
                     19: 
                     20: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                     21: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                     22: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                     23: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                     24: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
                     25: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
                     26: SOFTWARE.
                     27: */
                     28: 
                     29: #include "utf8_decode.h"
                     30: 
                     31: /*
                     32:     Very Strict UTF-8 Decoder
                     33: 
                     34:     UTF-8 is a multibyte character encoding of Unicode. A character can be
                     35:     represented by 1-4 bytes. The bit pattern of the first byte indicates the
                     36:     number of continuation bytes.
                     37: 
                     38:     Most UTF-8 decoders tend to be lenient, attempting to recover as much
                     39:     information as possible, even from badly encoded input. This UTF-8
                     40:     decoder is not lenient. It will reject input which does not include
                     41:     proper continuation bytes. It will reject aliases (or suboptimal
                     42:     codings). It will reject surrogates. (Surrogate encoding should only be
                     43:     used with UTF-16.)
                     44: 
                     45:     Code    Continuation Minimum Maximum
                     46:     0xxxxxxx           0       0     127
                     47:     10xxxxxx       error
                     48:     110xxxxx           1     128    2047
                     49:     1110xxxx           2    2048   65535 excluding 55296 - 57343
                     50:     11110xxx           3   65536 1114111
                     51:     11111xxx       error
                     52: */
                     53: 
                     54: 
                     55: /*
                     56:     Get the next byte. It returns UTF8_END if there are no more bytes.
                     57: */
                     58: static int 
                     59: get(json_utf8_decode *utf8)
                     60: {
                     61:     int c;
                     62:     if (utf8->the_index >= utf8->the_length) {
                     63:         return UTF8_END;
                     64:     }
                     65:     c = utf8->the_input[utf8->the_index] & 0xFF;
                     66:     utf8->the_index += 1;
                     67:     return c;
                     68: }
                     69: 
                     70: 
                     71: /*
                     72:     Get the 6-bit payload of the next continuation byte.
                     73:     Return UTF8_ERROR if it is not a contination byte.
                     74: */
                     75: static int 
                     76: cont(json_utf8_decode *utf8)
                     77: {
                     78:     int c = get(utf8);
                     79:     return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
                     80: }
                     81: 
                     82: 
                     83: /*
                     84:     Initialize the UTF-8 decoder. The decoder is not reentrant,
                     85: */
                     86: void 
                     87: utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
                     88: {
                     89:     utf8->the_index = 0;
                     90:     utf8->the_input = p;
                     91:     utf8->the_length = length;
                     92:     utf8->the_char = 0;
                     93:     utf8->the_byte = 0;
                     94: }
                     95: 
                     96: 
                     97: /*
                     98:     Get the current byte offset. This is generally used in error reporting.
                     99: */
                    100: int 
                    101: utf8_decode_at_byte(json_utf8_decode *utf8)
                    102: {
                    103:     return utf8->the_byte;
                    104: }
                    105: 
                    106: 
                    107: /*
                    108:     Get the current character offset. This is generally used in error reporting.
                    109:     The character offset matches the byte offset if the text is strictly ASCII.
                    110: */
                    111: int 
                    112: utf8_decode_at_character(json_utf8_decode *utf8)
                    113: {
                    114:     return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
                    115: }
                    116: 
                    117: 
                    118: /*
                    119:     Extract the next character.
                    120:     Returns: the character (between 0 and 1114111)
                    121:          or  UTF8_END   (the end)
                    122:          or  UTF8_ERROR (error)
                    123: */
                    124: int 
                    125: utf8_decode_next(json_utf8_decode *utf8)
                    126: {
                    127:     int c;  /* the first byte of the character */
                    128:     int r;  /* the result */
                    129: 
                    130:     if (utf8->the_index >= utf8->the_length) {
                    131:         return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR;
                    132:     }
                    133:     utf8->the_byte = utf8->the_index;
                    134:     utf8->the_char += 1;
                    135:     c = get(utf8);
                    136: /*
                    137:     Zero continuation (0 to 127)
                    138: */
                    139:     if ((c & 0x80) == 0) {
                    140:         return c;
                    141:     }
                    142: /*
                    143:     One contination (128 to 2047)
                    144: */
                    145:     if ((c & 0xE0) == 0xC0) {
                    146:         int c1 = cont(utf8);
                    147:         if (c1 < 0) {
                    148:             return UTF8_ERROR;
                    149:         }
                    150:         r = ((c & 0x1F) << 6) | c1;
                    151:         return r >= 128 ? r : UTF8_ERROR;
                    152:     }
                    153: /*
                    154:     Two continuation (2048 to 55295 and 57344 to 65535) 
                    155: */
                    156:     if ((c & 0xF0) == 0xE0) {
                    157:         int c1 = cont(utf8);
                    158:         int c2 = cont(utf8);
                    159:         if (c1 < 0 || c2 < 0) {
                    160:             return UTF8_ERROR;
                    161:         }
                    162:         r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
                    163:         return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
                    164:     }
                    165: /*
                    166:     Three continuation (65536 to 1114111)
                    167: */
                    168:     if ((c & 0xF8) == 0xF0) {
                    169:         int c1 = cont(utf8);
                    170:         int c2 = cont(utf8);
                    171:         int c3 = cont(utf8);
                    172:         if (c1 < 0 || c2 < 0 || c3 < 0) {
                    173:             return UTF8_ERROR;
                    174:         }
                    175:         r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3;
                    176:         return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
                    177:     }
                    178:     return UTF8_ERROR;
                    179: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>