Annotation of embedaddon/php/ext/json/utf8_decode.c, revision 1.1
1.1 ! misho 1: /* utf8_decode.c */
! 2:
! 3: /* 2005-12-25 */
! 4:
! 5: /*
! 6: Copyright (c) 2005 JSON.org
! 7:
! 8: Permission is hereby granted, free of charge, to any person obtaining a copy
! 9: of this software and associated documentation files (the "Software"), to deal
! 10: in the Software without restriction, including without limitation the rights
! 11: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
! 12: copies of the Software, and to permit persons to whom the Software is
! 13: furnished to do so, subject to the following conditions:
! 14:
! 15: The above copyright notice and this permission notice shall be included in all
! 16: copies or substantial portions of the Software.
! 17:
! 18: The Software shall be used for Good, not Evil.
! 19:
! 20: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! 21: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! 22: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
! 23: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! 24: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
! 25: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
! 26: SOFTWARE.
! 27: */
! 28:
! 29: #include "utf8_decode.h"
! 30:
! 31: /*
! 32: Very Strict UTF-8 Decoder
! 33:
! 34: UTF-8 is a multibyte character encoding of Unicode. A character can be
! 35: represented by 1-4 bytes. The bit pattern of the first byte indicates the
! 36: number of continuation bytes.
! 37:
! 38: Most UTF-8 decoders tend to be lenient, attempting to recover as much
! 39: information as possible, even from badly encoded input. This UTF-8
! 40: decoder is not lenient. It will reject input which does not include
! 41: proper continuation bytes. It will reject aliases (or suboptimal
! 42: codings). It will reject surrogates. (Surrogate encoding should only be
! 43: used with UTF-16.)
! 44:
! 45: Code Continuation Minimum Maximum
! 46: 0xxxxxxx 0 0 127
! 47: 10xxxxxx error
! 48: 110xxxxx 1 128 2047
! 49: 1110xxxx 2 2048 65535 excluding 55296 - 57343
! 50: 11110xxx 3 65536 1114111
! 51: 11111xxx error
! 52: */
! 53:
! 54:
! 55: /*
! 56: Get the next byte. It returns UTF8_END if there are no more bytes.
! 57: */
! 58: static int
! 59: get(json_utf8_decode *utf8)
! 60: {
! 61: int c;
! 62: if (utf8->the_index >= utf8->the_length) {
! 63: return UTF8_END;
! 64: }
! 65: c = utf8->the_input[utf8->the_index] & 0xFF;
! 66: utf8->the_index += 1;
! 67: return c;
! 68: }
! 69:
! 70:
! 71: /*
! 72: Get the 6-bit payload of the next continuation byte.
! 73: Return UTF8_ERROR if it is not a contination byte.
! 74: */
! 75: static int
! 76: cont(json_utf8_decode *utf8)
! 77: {
! 78: int c = get(utf8);
! 79: return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
! 80: }
! 81:
! 82:
! 83: /*
! 84: Initialize the UTF-8 decoder. The decoder is not reentrant,
! 85: */
! 86: void
! 87: utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
! 88: {
! 89: utf8->the_index = 0;
! 90: utf8->the_input = p;
! 91: utf8->the_length = length;
! 92: utf8->the_char = 0;
! 93: utf8->the_byte = 0;
! 94: }
! 95:
! 96:
! 97: /*
! 98: Get the current byte offset. This is generally used in error reporting.
! 99: */
! 100: int
! 101: utf8_decode_at_byte(json_utf8_decode *utf8)
! 102: {
! 103: return utf8->the_byte;
! 104: }
! 105:
! 106:
! 107: /*
! 108: Get the current character offset. This is generally used in error reporting.
! 109: The character offset matches the byte offset if the text is strictly ASCII.
! 110: */
! 111: int
! 112: utf8_decode_at_character(json_utf8_decode *utf8)
! 113: {
! 114: return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
! 115: }
! 116:
! 117:
! 118: /*
! 119: Extract the next character.
! 120: Returns: the character (between 0 and 1114111)
! 121: or UTF8_END (the end)
! 122: or UTF8_ERROR (error)
! 123: */
! 124: int
! 125: utf8_decode_next(json_utf8_decode *utf8)
! 126: {
! 127: int c; /* the first byte of the character */
! 128: int r; /* the result */
! 129:
! 130: if (utf8->the_index >= utf8->the_length) {
! 131: return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR;
! 132: }
! 133: utf8->the_byte = utf8->the_index;
! 134: utf8->the_char += 1;
! 135: c = get(utf8);
! 136: /*
! 137: Zero continuation (0 to 127)
! 138: */
! 139: if ((c & 0x80) == 0) {
! 140: return c;
! 141: }
! 142: /*
! 143: One contination (128 to 2047)
! 144: */
! 145: if ((c & 0xE0) == 0xC0) {
! 146: int c1 = cont(utf8);
! 147: if (c1 < 0) {
! 148: return UTF8_ERROR;
! 149: }
! 150: r = ((c & 0x1F) << 6) | c1;
! 151: return r >= 128 ? r : UTF8_ERROR;
! 152: }
! 153: /*
! 154: Two continuation (2048 to 55295 and 57344 to 65535)
! 155: */
! 156: if ((c & 0xF0) == 0xE0) {
! 157: int c1 = cont(utf8);
! 158: int c2 = cont(utf8);
! 159: if (c1 < 0 || c2 < 0) {
! 160: return UTF8_ERROR;
! 161: }
! 162: r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
! 163: return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
! 164: }
! 165: /*
! 166: Three continuation (65536 to 1114111)
! 167: */
! 168: if ((c & 0xF8) == 0xF0) {
! 169: int c1 = cont(utf8);
! 170: int c2 = cont(utf8);
! 171: int c3 = cont(utf8);
! 172: if (c1 < 0 || c2 < 0 || c3 < 0) {
! 173: return UTF8_ERROR;
! 174: }
! 175: r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3;
! 176: return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
! 177: }
! 178: return UTF8_ERROR;
! 179: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>