Annotation of embedaddon/php/ext/mbstring/tests/illformed_utf_sequences.phpt, revision 1.1.1.3

1.1       misho       1: --TEST--
                      2: Unicode standard conformance test (ill-formed UTF sequences.)
                      3: --SKIPIF--
                      4: <?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
                      5: --FILE--
                      6: <?php
/**
 * Convert $str from $enc to UCS-4BE and check the result against the
 * "all replaced" pattern expected for ill-formed input.
 *
 * The expected pattern is $n copies of "0000fffd" (U+FFFD in UCS-4BE hex),
 * preceded by "0000feff" (U+FEFF) when $with_bom is true. The function does
 * not set the substitution character itself; it assumes the caller has
 * configured mbstring to substitute U+FFFD.
 *
 * @param string $str      Raw bytes to convert.
 * @param int    $n        Number of U+FFFD characters expected in the output.
 * @param string $enc      Source encoding handed to mb_convert_encoding().
 * @param bool   $with_bom Whether the expected output starts with U+FEFF.
 *
 * @return string|false false when the converted output equals the expected
 *                      pattern, otherwise the hex dump of the actual output.
 */
function chk_enc($str, $n, $enc = "UTF-8", $with_bom = false) {
    $actual = bin2hex(mb_convert_encoding($str, "UCS-4BE", $enc));
    $expected = ($with_bom ? "0000feff" : "") . str_repeat("0000fffd", $n);

    // Strict comparison: both operands are hex dumps that must match byte for
    // byte, so PHP's numeric-string juggling performed by "==" is unwanted.
    return ($actual === $expected) ? false : $actual;
}
                     19: 
                     20: mb_substitute_character(0xfffd);
                     21: 
                     22: 
1.1       misho      23: echo "UTF-8 redundancy\n";
1.1.1.2   misho      24: var_dump(chk_enc("\x31\x32\x33", 0));
                     25: var_dump(chk_enc("\x41\x42\x43", 0));
                     26: var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
                     27: var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
1.1.1.3 ! misho      28: var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
        !            29: var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
        !            30: var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
        !            31: var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
1.1.1.2   misho      32: var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
                     33: var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
                     34: var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
                     35: var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
                     36: 
                     37: var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
1.1.1.3 ! misho      38: var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
        !            39: var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
1.1.1.2   misho      40: var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
                     41: var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
                     42: 
                     43: var_dump(chk_enc("\xc1\xbf", 2));
                     44: var_dump(chk_enc("\xc2\x80", 0));
                     45: var_dump(chk_enc("\xdf\xbf", 0));
1.1.1.3 ! misho      46: var_dump(chk_enc("\xe0\x9f\xff", 3));
1.1.1.2   misho      47: var_dump(chk_enc("\xe0\xa0\x80", 2));
                     48: var_dump(chk_enc("\xef\xbf\xbf", 0));
1.1.1.3 ! misho      49: var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
1.1.1.2   misho      50: var_dump(chk_enc("\xf0\x90\x80\x80", 0));
                     51: var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
                     52: var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
                     53: var_dump(chk_enc("\xf8\x88\x80\x80\x80", 5));
                     54: var_dump(chk_enc("\xfb\xbf\xbf\xbf\xbf", 5));
                     55: var_dump(chk_enc("\xfc\x83\xbf\xbf\xbf\xbf", 6));
                     56: var_dump(chk_enc("\xfc\x84\x80\x80\x80\x80", 6));
                     57: var_dump(chk_enc("\xfd\xaf\xbf\xbf\xbf\xbf", 6));
                     58: var_dump(chk_enc("\xfd\xbf\xbf\xbf\xbf\xbf", 6));
1.1       misho      59: 
                     60: echo "UTF-8 and surrogates area\n";
                     61: $out = '';
1.1.1.2   misho      62: $cnt = 0;
1.1       misho      63: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.3 ! misho      64:        $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
1.1.1.2   misho      65:        if ($s === false) {
                     66:                $cnt++;
                     67:        } else {
                     68:                $out .= $s;
                     69:        }
1.1       misho      70: }
1.1.1.2   misho      71: var_dump($cnt);
                     72: var_dump($out);
1.1       misho      73: 
                     74: echo "UTF-32 code range\n";
1.1.1.2   misho      75: var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32BE"));
                     76: var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32BE"));
                     77: var_dump(chk_enc("\x00\x00\x11\x00", 1, "UTF-32LE"));
                     78: var_dump(chk_enc("\xff\xff\x10\x00", 0, "UTF-32LE"));
                     79: var_dump(chk_enc("\x00\x11\x00\x00", 1, "UTF-32"));
                     80: var_dump(chk_enc("\x00\x10\xff\xff", 0, "UTF-32"));
                     81: var_dump(chk_enc("\x00\x00\xfe\xff\x00\x11\x00\x00", 0, "UTF-32"));
                     82: var_dump(chk_enc("\x00\x00\xfe\xff\x00\x10\xff\xff", 0, "UTF-32"));
                     83: var_dump(chk_enc("\xff\xfe\x00\x00\x00\x00\x11\x00", 0, "UTF-32"));
                     84: var_dump(chk_enc("\xff\xfe\x00\x00\xff\xff\x10\x00", 0, "UTF-32"));
1.1       misho      85: 
                     86: echo "UTF-32 and surrogates area\n";
                     87: $out = '';
1.1.1.2   misho      88: $cnt = 0;
1.1       misho      89: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.2   misho      90:     $s = chk_enc(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), 1, "UTF-32BE");
                     91:        if ($s === false) {
                     92:                $cnt++;
                     93:        } else {
                     94:                $out .= $s;
                     95:        }
1.1       misho      96: }
1.1.1.2   misho      97: var_dump($cnt);
                     98: var_dump($out);
1.1       misho      99: 
                    100: $out = '';
1.1.1.2   misho     101: $cnt = 0;
1.1       misho     102: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.2   misho     103:     $s = chk_enc(pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), 1, "UTF-32LE");
                    104:        if ($s === false) {
                    105:                $cnt++;
                    106:        } else {
                    107:                $out .= $s;
                    108:        }
1.1       misho     109: }
1.1.1.2   misho     110: var_dump($cnt);
                    111: var_dump($out);
1.1       misho     112: 
                    113: $out = '';
1.1.1.2   misho     114: $cnt = 0;
1.1       misho     115: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.2   misho     116:     $s = chk_enc(pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), 1, "UTF-32");
                    117:        if ($s === false) {
                    118:                $cnt++;
                    119:        } else {
                    120:                $out .= $s;
                    121:        }
1.1       misho     122: }
1.1.1.2   misho     123: var_dump($cnt);
                    124: var_dump($out);
                    125: 
                    126: echo "UTF-32 and surrogates area with BOM\n";
1.1       misho     127: 
                    128: $out = '';
1.1.1.2   misho     129: $cnt = 0;
1.1       misho     130: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.2   misho     131:     $s = chk_enc("\x00\x00\xfe\xff". pack('C4', $i >> 24, ($i >> 16) & 0xff, ($i >> 8) & 0xff, $i & 0xff), 
                    132:                                 1, "UTF-32", true);
                    133:        if ($s === false) {
                    134:                $cnt++;
                    135:        } else {
                    136:                $out .= $s;
                    137:        }
1.1       misho     138: }
1.1.1.2   misho     139: var_dump($cnt);
                    140: var_dump(str_replace("0000feff","",$out));
1.1       misho     141: 
                    142: $out = '';
1.1.1.2   misho     143: $cnt = 0;
1.1       misho     144: for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
1.1.1.2   misho     145:     $s = chk_enc("\xff\xfe\x00\x00". pack('C4', $i & 0xff, ($i >> 8) & 0xff, ($i >> 16) & 0xff, ($i >> 24) & 0xff), 
                    146:                                 1, "UTF-32", true);
                    147:        if ($s === false) {
                    148:                $cnt++;
                    149:        } else {
                    150:                $out .= $s;
                    151:        }
1.1       misho     152: }
1.1.1.2   misho     153: var_dump($cnt);
                    154: var_dump(str_replace("0000feff","",$out));
                    155: 
1.1       misho     156: ?>
                    157: --EXPECT--
                    158: UTF-8 redundancy
                    159: string(24) "000000310000003200000033"
                    160: string(24) "000000410000004200000043"
1.1.1.2   misho     161: bool(false)
                    162: bool(false)
                    163: bool(false)
                    164: bool(false)
                    165: bool(false)
                    166: bool(false)
                    167: bool(false)
                    168: bool(false)
                    169: bool(false)
                    170: bool(false)
1.1       misho     171: string(24) "000000a2000000a3000000a5"
1.1.1.2   misho     172: bool(false)
                    173: bool(false)
                    174: bool(false)
                    175: bool(false)
                    176: bool(false)
1.1       misho     177: string(8) "00000080"
                    178: string(8) "000007ff"
1.1.1.2   misho     179: bool(false)
1.1       misho     180: string(8) "00000800"
                    181: string(8) "0000ffff"
1.1.1.2   misho     182: bool(false)
1.1       misho     183: string(8) "00010000"
1.1.1.2   misho     184: bool(false)
                    185: bool(false)
                    186: bool(false)
                    187: bool(false)
                    188: bool(false)
                    189: bool(false)
                    190: bool(false)
                    191: bool(false)
1.1       misho     192: UTF-8 and surrogates area
1.1.1.2   misho     193: int(2048)
1.1       misho     194: string(16) "0000d7ff0000e000"
                    195: UTF-32 code range
1.1.1.2   misho     196: bool(false)
1.1       misho     197: string(8) "0010ffff"
1.1.1.2   misho     198: bool(false)
1.1       misho     199: string(8) "0010ffff"
1.1.1.2   misho     200: bool(false)
1.1       misho     201: string(8) "0010ffff"
1.1.1.2   misho     202: string(16) "0000feff0000fffd"
1.1       misho     203: string(16) "0000feff0010ffff"
1.1.1.2   misho     204: string(16) "0000feff0000fffd"
1.1       misho     205: string(16) "0000feff0010ffff"
                    206: UTF-32 and surrogates area
1.1.1.2   misho     207: int(2048)
1.1       misho     208: string(16) "0000d7ff0000e000"
1.1.1.2   misho     209: int(2048)
1.1       misho     210: string(16) "0000d7ff0000e000"
1.1.1.2   misho     211: int(2048)
1.1       misho     212: string(16) "0000d7ff0000e000"
1.1.1.2   misho     213: UTF-32 and surrogates area with BOM
                    214: int(2048)
1.1       misho     215: string(16) "0000d7ff0000e000"
1.1.1.2   misho     216: int(2048)
1.1       misho     217: string(16) "0000d7ff0000e000"

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>