Annotation of embedaddon/php/ext/json/utf8_decode.c, revision 1.1.1.1
1.1 misho 1: /* utf8_decode.c */
2:
3: /* 2005-12-25 */
4:
5: /*
6: Copyright (c) 2005 JSON.org
7:
8: Permission is hereby granted, free of charge, to any person obtaining a copy
9: of this software and associated documentation files (the "Software"), to deal
10: in the Software without restriction, including without limitation the rights
11: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12: copies of the Software, and to permit persons to whom the Software is
13: furnished to do so, subject to the following conditions:
14:
15: The above copyright notice and this permission notice shall be included in all
16: copies or substantial portions of the Software.
17:
18: The Software shall be used for Good, not Evil.
19:
20: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26: SOFTWARE.
27: */
28:
29: #include "utf8_decode.h"
30:
31: /*
32: Very Strict UTF-8 Decoder
33:
34: UTF-8 is a multibyte character encoding of Unicode. A character can be
35: represented by 1-4 bytes. The bit pattern of the first byte indicates the
36: number of continuation bytes.
37:
38: Most UTF-8 decoders tend to be lenient, attempting to recover as much
39: information as possible, even from badly encoded input. This UTF-8
40: decoder is not lenient. It will reject input which does not include
41: proper continuation bytes. It will reject aliases (or suboptimal
42: codings). It will reject surrogates. (Surrogate encoding should only be
43: used with UTF-16.)
44:
45: Code Continuation Minimum Maximum
46: 0xxxxxxx 0 0 127
47: 10xxxxxx error
48: 110xxxxx 1 128 2047
49: 1110xxxx 2 2048 65535 excluding 55296 - 57343
50: 11110xxx 3 65536 1114111
51: 11111xxx error
52: */
53:
54:
55: /*
56: Get the next byte. It returns UTF8_END if there are no more bytes.
57: */
58: static int
59: get(json_utf8_decode *utf8)
60: {
61: int c;
62: if (utf8->the_index >= utf8->the_length) {
63: return UTF8_END;
64: }
65: c = utf8->the_input[utf8->the_index] & 0xFF;
66: utf8->the_index += 1;
67: return c;
68: }
69:
70:
71: /*
72: Get the 6-bit payload of the next continuation byte.
73: Return UTF8_ERROR if it is not a contination byte.
74: */
75: static int
76: cont(json_utf8_decode *utf8)
77: {
78: int c = get(utf8);
79: return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
80: }
81:
82:
83: /*
84: Initialize the UTF-8 decoder. The decoder is not reentrant,
85: */
86: void
87: utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
88: {
89: utf8->the_index = 0;
90: utf8->the_input = p;
91: utf8->the_length = length;
92: utf8->the_char = 0;
93: utf8->the_byte = 0;
94: }
95:
96:
97: /*
98: Get the current byte offset. This is generally used in error reporting.
99: */
100: int
101: utf8_decode_at_byte(json_utf8_decode *utf8)
102: {
103: return utf8->the_byte;
104: }
105:
106:
107: /*
108: Get the current character offset. This is generally used in error reporting.
109: The character offset matches the byte offset if the text is strictly ASCII.
110: */
111: int
112: utf8_decode_at_character(json_utf8_decode *utf8)
113: {
114: return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
115: }
116:
117:
118: /*
119: Extract the next character.
120: Returns: the character (between 0 and 1114111)
121: or UTF8_END (the end)
122: or UTF8_ERROR (error)
123: */
124: int
125: utf8_decode_next(json_utf8_decode *utf8)
126: {
127: int c; /* the first byte of the character */
128: int r; /* the result */
129:
130: if (utf8->the_index >= utf8->the_length) {
131: return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR;
132: }
133: utf8->the_byte = utf8->the_index;
134: utf8->the_char += 1;
135: c = get(utf8);
136: /*
137: Zero continuation (0 to 127)
138: */
139: if ((c & 0x80) == 0) {
140: return c;
141: }
142: /*
143: One contination (128 to 2047)
144: */
145: if ((c & 0xE0) == 0xC0) {
146: int c1 = cont(utf8);
147: if (c1 < 0) {
148: return UTF8_ERROR;
149: }
150: r = ((c & 0x1F) << 6) | c1;
151: return r >= 128 ? r : UTF8_ERROR;
152: }
153: /*
154: Two continuation (2048 to 55295 and 57344 to 65535)
155: */
156: if ((c & 0xF0) == 0xE0) {
157: int c1 = cont(utf8);
158: int c2 = cont(utf8);
159: if (c1 < 0 || c2 < 0) {
160: return UTF8_ERROR;
161: }
162: r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
163: return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
164: }
165: /*
166: Three continuation (65536 to 1114111)
167: */
168: if ((c & 0xF8) == 0xF0) {
169: int c1 = cont(utf8);
170: int c2 = cont(utf8);
171: int c3 = cont(utf8);
172: if (c1 < 0 || c2 < 0 || c3 < 0) {
173: return UTF8_ERROR;
174: }
175: r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3;
176: return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
177: }
178: return UTF8_ERROR;
179: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>