Annotation of embedaddon/php/ext/mbstring/tests/illformed_utf_sequences.phpt, revision 1.1.1.2
1.1 misho 1: --TEST--
2: Unicode standard conformance test (ill-formed UTF sequences.)
3: --SKIPIF--
4: <?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
5: --FILE--
6: <?php
/**
 * Convert $str from $enc to UCS-4BE and compare the hex dump of the result
 * with an expected run of U+FFFD code points.
 *
 * The expected dump is $n copies of "0000fffd", preceded by "0000feff"
 * when $with_bom is true.
 *
 * @param string $str      Raw input bytes to convert.
 * @param int    $n        Number of "0000fffd" units expected in the dump.
 * @param string $enc      Source encoding name handed to mb_convert_encoding().
 * @param bool   $with_bom Whether the expected dump starts with "0000feff".
 *
 * @return string|false false when the converted dump equals the expected one,
 *                      otherwise the hex dump that was actually produced.
 */
function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
    $src = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    $dst = str_repeat("0000fffd", $n);
    if ($with_bom) {
        $dst = "0000feff" . $dst;
    }
    // Both operands are hex-dump strings; compare them strictly so PHP never
    // applies numeric-string juggling to the dumps.
    if ($dst === $src) {
        return false;
    }
    return $src;
}
! 19:
// Use U+FFFD as the mbstring substitution character for the whole script.
mb_substitute_character(0xfffd);

echo "UTF-8 redundancy\n";

// Each entry: [input bytes, number of U+FFFD units chk_enc() should expect].
// The order of the entries is the order of the dumped results.
$redundancy_cases = [
    // Single-byte baseline, then longer encodings of the same characters.
    ["\x31\x32\x33", 0],
    ["\x41\x42\x43", 0],
    ["\xc0\xb1\xc0\xb2\xc0\xb3", 6],
    ["\xc1\x81\xc1\x82\xc1\x83", 6],
    ["\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6],
    ["\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6],
    ["\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9],
    ["\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8],
    ["\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15],
    ["\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15],
    ["\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18],
    ["\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18],

    // Two-byte baseline, then longer encodings of the same characters.
    ["\xc2\xa2\xc2\xa3\xc2\xa5", 0],
    ["\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6],
    ["\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9],
    ["\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15],
    ["\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18],

    // Single sequences of increasing length, two to six bytes.
    ["\xc1\xbf", 2],
    ["\xc2\x80", 0],
    ["\xdf\xbf", 0],
    ["\xe0\x9f\xff", 2],
    ["\xe0\xa0\x80", 2],
    ["\xef\xbf\xbf", 0],
    ["\xf0\x8f\xbf\xbf", 3],
    ["\xf0\x90\x80\x80", 0],
    ["\xf7\xbf\xbf\xbf", 4],
    ["\xf8\x87\xbf\xbf\xbf", 5],
    ["\xf8\x88\x80\x80\x80", 5],
    ["\xfb\xbf\xbf\xbf\xbf", 5],
    ["\xfc\x83\xbf\xbf\xbf\xbf", 6],
    ["\xfc\x84\x80\x80\x80\x80", 6],
    ["\xfd\xaf\xbf\xbf\xbf\xbf", 6],
    ["\xfd\xbf\xbf\xbf\xbf\xbf", 6],
];

foreach ($redundancy_cases as $case) {
    var_dump(chk_enc($case[0], $case[1]));
}
1.1 misho 59:
echo "UTF-8 and surrogates area\n";
$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive, encoding each value as a three-byte
// UTF-8 sequence and expecting two U+FFFD units back from chk_enc().
foreach (range(0xd7ff, 0xe000) as $cp) {
    $lead = 0xe0 | ($cp >> 12);
    $mid  = 0x80 | (($cp >> 6) & 0x3f);
    $tail = 0x80 | ($cp & 0x3f);
    $dump = chk_enc(pack('C3', $lead, $mid, $tail), 2);
    if ($dump !== false) {
        // Conversion did not yield the expected substitutions; keep the dump.
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
var_dump($out);
1.1 misho 73:
echo "UTF-32 code range\n";

// Each entry: [input bytes, expected U+FFFD count, source encoding].
// The order of the entries is the order of the dumped results.
$range_cases = [
    ["\x00\x11\x00\x00", 1, "UTF-32BE"],
    ["\x00\x10\xff\xff", 0, "UTF-32BE"],
    ["\x00\x00\x11\x00", 1, "UTF-32LE"],
    ["\xff\xff\x10\x00", 0, "UTF-32LE"],
    ["\x00\x11\x00\x00", 1, "UTF-32"],
    ["\x00\x10\xff\xff", 0, "UTF-32"],
    // The remaining inputs start with a byte-order mark.
    ["\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"],
    ["\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"],
    ["\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"],
    ["\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"],
];

foreach ($range_cases as $case) {
    var_dump(chk_enc($case[0], $case[1], $case[2]));
}
1.1 misho 85:
echo "UTF-32 and surrogates area\n";
$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive as big-endian 32-bit units.
foreach (range(0xd7ff, 0xe000) as $cp) {
    // 'N' packs an unsigned 32-bit value, most significant byte first.
    $dump = chk_enc(pack('N', $cp), 1, "UTF-32BE");
    if ($dump !== false) {
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
var_dump($out);
1.1 misho 99:
$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive as little-endian 32-bit units.
foreach (range(0xd7ff, 0xe000) as $cp) {
    // 'V' packs an unsigned 32-bit value, least significant byte first.
    $dump = chk_enc(pack('V', $cp), 1, "UTF-32LE");
    if ($dump !== false) {
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
var_dump($out);
1.1 misho 112:
$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive as big-endian 32-bit units, decoded with
// the byte-order-agnostic "UTF-32" encoding name and no leading BOM.
foreach (range(0xd7ff, 0xe000) as $cp) {
    $dump = chk_enc(pack('N', $cp), 1, "UTF-32");
    if ($dump !== false) {
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
var_dump($out);
! 125:
echo "UTF-32 and surrogates area with BOM\n";

$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive, each value prefixed with the bytes
// 00 00 FE FF and packed most significant byte first.
foreach (range(0xd7ff, 0xe000) as $cp) {
    $input = "\x00\x00\xfe\xff" . pack('N', $cp);
    $dump = chk_enc($input, 1, "UTF-32", true);
    if ($dump !== false) {
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
// Drop the "0000feff" units so only the remaining code points are shown.
var_dump(str_replace("0000feff","",$out));
1.1 misho 141:
$cnt = 0;
$out = '';
// Sweep 0xd7ff..0xe000 inclusive, each value prefixed with the bytes
// FF FE 00 00 and packed least significant byte first.
foreach (range(0xd7ff, 0xe000) as $cp) {
    $input = "\xff\xfe\x00\x00" . pack('V', $cp);
    $dump = chk_enc($input, 1, "UTF-32", true);
    if ($dump !== false) {
        $out .= $dump;
        continue;
    }
    $cnt++;
}
var_dump($cnt);
// Drop the "0000feff" units so only the remaining code points are shown.
var_dump(str_replace("0000feff","",$out));
! 155:
1.1 misho 156: ?>
157: --EXPECT--
158: UTF-8 redundancy
159: string(24) "000000310000003200000033"
160: string(24) "000000410000004200000043"
1.1.1.2 ! misho 161: bool(false)
! 162: bool(false)
! 163: bool(false)
! 164: bool(false)
! 165: bool(false)
! 166: bool(false)
! 167: bool(false)
! 168: bool(false)
! 169: bool(false)
! 170: bool(false)
1.1 misho 171: string(24) "000000a2000000a3000000a5"
1.1.1.2 ! misho 172: bool(false)
! 173: bool(false)
! 174: bool(false)
! 175: bool(false)
! 176: bool(false)
1.1 misho 177: string(8) "00000080"
178: string(8) "000007ff"
1.1.1.2 ! misho 179: bool(false)
1.1 misho 180: string(8) "00000800"
181: string(8) "0000ffff"
1.1.1.2 ! misho 182: bool(false)
1.1 misho 183: string(8) "00010000"
1.1.1.2 ! misho 184: bool(false)
! 185: bool(false)
! 186: bool(false)
! 187: bool(false)
! 188: bool(false)
! 189: bool(false)
! 190: bool(false)
! 191: bool(false)
1.1 misho 192: UTF-8 and surrogates area
1.1.1.2 ! misho 193: int(2048)
1.1 misho 194: string(16) "0000d7ff0000e000"
195: UTF-32 code range
1.1.1.2 ! misho 196: bool(false)
1.1 misho 197: string(8) "0010ffff"
1.1.1.2 ! misho 198: bool(false)
1.1 misho 199: string(8) "0010ffff"
1.1.1.2 ! misho 200: bool(false)
1.1 misho 201: string(8) "0010ffff"
1.1.1.2 ! misho 202: string(16) "0000feff0000fffd"
1.1 misho 203: string(16) "0000feff0010ffff"
1.1.1.2 ! misho 204: string(16) "0000feff0000fffd"
1.1 misho 205: string(16) "0000feff0010ffff"
206: UTF-32 and surrogates area
1.1.1.2 ! misho 207: int(2048)
1.1 misho 208: string(16) "0000d7ff0000e000"
1.1.1.2 ! misho 209: int(2048)
1.1 misho 210: string(16) "0000d7ff0000e000"
1.1.1.2 ! misho 211: int(2048)
1.1 misho 212: string(16) "0000d7ff0000e000"
1.1.1.2 ! misho 213: UTF-32 and surrogates area with BOM
! 214: int(2048)
1.1 misho 215: string(16) "0000d7ff0000e000"
1.1.1.2 ! misho 216: int(2048)
1.1 misho 217: string(16) "0000d7ff0000e000"
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>