Return to utf8_decode.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / json |
1.1 ! misho 1: /* utf8_decode.c */ ! 2: ! 3: /* 2005-12-25 */ ! 4: ! 5: /* ! 6: Copyright (c) 2005 JSON.org ! 7: ! 8: Permission is hereby granted, free of charge, to any person obtaining a copy ! 9: of this software and associated documentation files (the "Software"), to deal ! 10: in the Software without restriction, including without limitation the rights ! 11: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ! 12: copies of the Software, and to permit persons to whom the Software is ! 13: furnished to do so, subject to the following conditions: ! 14: ! 15: The above copyright notice and this permission notice shall be included in all ! 16: copies or substantial portions of the Software. ! 17: ! 18: The Software shall be used for Good, not Evil. ! 19: ! 20: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ! 21: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ! 22: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ! 23: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ! 24: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ! 25: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ! 26: SOFTWARE. ! 27: */ ! 28: ! 29: #include "utf8_decode.h" ! 30: ! 31: /* ! 32: Very Strict UTF-8 Decoder ! 33: ! 34: UTF-8 is a multibyte character encoding of Unicode. A character can be ! 35: represented by 1-4 bytes. The bit pattern of the first byte indicates the ! 36: number of continuation bytes. ! 37: ! 38: Most UTF-8 decoders tend to be lenient, attempting to recover as much ! 39: information as possible, even from badly encoded input. This UTF-8 ! 40: decoder is not lenient. It will reject input which does not include ! 41: proper continuation bytes. It will reject aliases (or suboptimal ! 42: codings). It will reject surrogates. (Surrogate encoding should only be ! 43: used with UTF-16.) ! 44: ! 45: Code Contination Minimum Maximum ! 46: 0xxxxxxx 0 0 127 ! 47: 10xxxxxx error ! 48: 110xxxxx 1 128 2047 ! 49: 1110xxxx 2 2048 65535 excluding 55296 - 57343 ! 50: 11110xxx 3 65536 1114111 ! 51: 11111xxx error ! 52: */ ! 53: ! 54: ! 55: /* ! 56: Get the next byte. It returns UTF8_END if there are no more bytes. ! 57: */ ! 58: static int ! 59: get(json_utf8_decode *utf8) ! 60: { ! 61: int c; ! 62: if (utf8->the_index >= utf8->the_length) { ! 63: return UTF8_END; ! 64: } ! 65: c = utf8->the_input[utf8->the_index] & 0xFF; ! 66: utf8->the_index += 1; ! 67: return c; ! 68: } ! 69: ! 70: ! 71: /* ! 72: Get the 6-bit payload of the next continuation byte. ! 73: Return UTF8_ERROR if it is not a contination byte. ! 74: */ ! 75: static int ! 76: cont(json_utf8_decode *utf8) ! 77: { ! 78: int c = get(utf8); ! 79: return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR; ! 80: } ! 81: ! 82: ! 83: /* ! 84: Initialize the UTF-8 decoder. The decoder is not reentrant, ! 85: */ ! 86: void ! 87: utf8_decode_init(json_utf8_decode *utf8, char p[], int length) ! 88: { ! 89: utf8->the_index = 0; ! 90: utf8->the_input = p; ! 91: utf8->the_length = length; ! 92: utf8->the_char = 0; ! 93: utf8->the_byte = 0; ! 94: } ! 95: ! 96: ! 97: /* ! 98: Get the current byte offset. This is generally used in error reporting. ! 99: */ ! 100: int ! 101: utf8_decode_at_byte(json_utf8_decode *utf8) ! 102: { ! 103: return utf8->the_byte; ! 104: } ! 105: ! 106: ! 107: /* ! 108: Get the current character offset. This is generally used in error reporting. ! 109: The character offset matches the byte offset if the text is strictly ASCII. ! 110: */ ! 111: int ! 112: utf8_decode_at_character(json_utf8_decode *utf8) ! 113: { ! 114: return utf8->the_char > 0 ? utf8->the_char - 1 : 0; ! 115: } ! 116: ! 117: ! 118: /* ! 119: Extract the next character. ! 120: Returns: the character (between 0 and 1114111) ! 121: or UTF8_END (the end) ! 122: or UTF8_ERROR (error) ! 123: */ ! 124: int ! 125: utf8_decode_next(json_utf8_decode *utf8) ! 126: { ! 127: int c; /* the first byte of the character */ ! 128: int r; /* the result */ ! 129: ! 130: if (utf8->the_index >= utf8->the_length) { ! 131: return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR; ! 132: } ! 133: utf8->the_byte = utf8->the_index; ! 134: utf8->the_char += 1; ! 135: c = get(utf8); ! 136: /* ! 137: Zero continuation (0 to 127) ! 138: */ ! 139: if ((c & 0x80) == 0) { ! 140: return c; ! 141: } ! 142: /* ! 143: One contination (128 to 2047) ! 144: */ ! 145: if ((c & 0xE0) == 0xC0) { ! 146: int c1 = cont(utf8); ! 147: if (c1 < 0) { ! 148: return UTF8_ERROR; ! 149: } ! 150: r = ((c & 0x1F) << 6) | c1; ! 151: return r >= 128 ? r : UTF8_ERROR; ! 152: } ! 153: /* ! 154: Two continuation (2048 to 55295 and 57344 to 65535) ! 155: */ ! 156: if ((c & 0xF0) == 0xE0) { ! 157: int c1 = cont(utf8); ! 158: int c2 = cont(utf8); ! 159: if (c1 < 0 || c2 < 0) { ! 160: return UTF8_ERROR; ! 161: } ! 162: r = ((c & 0x0F) << 12) | (c1 << 6) | c2; ! 163: return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR; ! 164: } ! 165: /* ! 166: Three continuation (65536 to 1114111) ! 167: */ ! 168: if ((c & 0xF8) == 0xF0) { ! 169: int c1 = cont(utf8); ! 170: int c2 = cont(utf8); ! 171: int c3 = cont(utf8); ! 172: if (c1 < 0 || c2 < 0 || c3 < 0) { ! 173: return UTF8_ERROR; ! 174: } ! 175: r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3; ! 176: return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR; ! 177: } ! 178: return UTF8_ERROR; ! 179: }