Annotation of embedaddon/php/ext/mbstring/oniguruma/enc/sjis.c, revision 1.1
1.1 ! misho 1: /**********************************************************************
! 2: sjis.c - Oniguruma (regular expression library)
! 3: **********************************************************************/
! 4: /*-
! 5: * Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
! 6: * All rights reserved.
! 7: *
! 8: * Redistribution and use in source and binary forms, with or without
! 9: * modification, are permitted provided that the following conditions
! 10: * are met:
! 11: * 1. Redistributions of source code must retain the above copyright
! 12: * notice, this list of conditions and the following disclaimer.
! 13: * 2. Redistributions in binary form must reproduce the above copyright
! 14: * notice, this list of conditions and the following disclaimer in the
! 15: * documentation and/or other materials provided with the distribution.
! 16: *
! 17: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
! 18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 20: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
! 21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 27: * SUCH DAMAGE.
! 28: */
! 29:
! 30: #include "regenc.h"
! 31:
/*
 * Byte length of a Shift_JIS character, indexed by its first byte.
 * Entries 0x81-0x9f and 0xe0-0xfc are 2 (lead byte of a two-byte
 * character); every other entry is 1.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
! 50:
/*
 * Non-zero for every byte value that may appear as the second byte of a
 * two-byte character: 0x40-0x7e and 0x80-0xfc.  All other entries are 0.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
! 69:
/* True when `byte` starts a multi-byte character (its table length exceeds 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Non-zero when `byte` is a value that can occur as a trailing byte. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
! 72:
! 73: static int
! 74: sjis_mbc_enc_len(const UChar* p)
! 75: {
! 76: return EncLen_SJIS[*p];
! 77: }
! 78:
! 79: static int
! 80: sjis_code_to_mbclen(OnigCodePoint code)
! 81: {
! 82: if (code < 256) {
! 83: if (EncLen_SJIS[(int )code] == 1)
! 84: return 1;
! 85: else
! 86: return 0;
! 87: }
! 88: else if (code <= 0xffff) {
! 89: return 2;
! 90: }
! 91: else
! 92: return 0;
! 93: }
! 94:
! 95: static OnigCodePoint
! 96: sjis_mbc_to_code(const UChar* p, const UChar* end)
! 97: {
! 98: int c, i, len;
! 99: OnigCodePoint n;
! 100:
! 101: len = enc_len(ONIG_ENCODING_SJIS, p);
! 102: c = *p++;
! 103: n = c;
! 104: if (len == 1) return n;
! 105:
! 106: for (i = 1; i < len; i++) {
! 107: if (p >= end) break;
! 108: c = *p++;
! 109: n <<= 8; n += c;
! 110: }
! 111: return n;
! 112: }
! 113:
! 114: static int
! 115: sjis_code_to_mbc(OnigCodePoint code, UChar *buf)
! 116: {
! 117: UChar *p = buf;
! 118:
! 119: if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
! 120: *p++ = (UChar )(code & 0xff);
! 121:
! 122: #if 0
! 123: if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf))
! 124: return REGERR_INVALID_WIDE_CHAR_VALUE;
! 125: #endif
! 126: return p - buf;
! 127: }
! 128:
! 129: static int
! 130: sjis_mbc_to_normalize(OnigAmbigType flag,
! 131: const UChar** pp, const UChar* end, UChar* lower)
! 132: {
! 133: const UChar* p = *pp;
! 134:
! 135: if (ONIGENC_IS_MBC_ASCII(p)) {
! 136: if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
! 137: *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
! 138: }
! 139: else {
! 140: *lower = *p;
! 141: }
! 142:
! 143: (*pp)++;
! 144: return 1;
! 145: }
! 146: else {
! 147: int len = enc_len(ONIG_ENCODING_SJIS, p);
! 148:
! 149: if (lower != p) {
! 150: int i;
! 151: for (i = 0; i < len; i++) {
! 152: *lower++ = *p++;
! 153: }
! 154: }
! 155: (*pp) += len;
! 156: return len; /* return byte length of converted char to lower */
! 157: }
! 158: }
! 159:
! 160: static int
! 161: sjis_is_mbc_ambiguous(OnigAmbigType flag, const UChar** pp, const UChar* end)
! 162: {
! 163: return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end);
! 164:
! 165: }
! 166:
! 167: static int
! 168: sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype)
! 169: {
! 170: if (code < 128)
! 171: return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
! 172: else {
! 173: if ((ctype & (ONIGENC_CTYPE_WORD |
! 174: ONIGENC_CTYPE_GRAPH | ONIGENC_CTYPE_PRINT)) != 0) {
! 175: return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE);
! 176: }
! 177: }
! 178:
! 179: return FALSE;
! 180: }
! 181:
! 182: static UChar*
! 183: sjis_left_adjust_char_head(const UChar* start, const UChar* s)
! 184: {
! 185: const UChar *p;
! 186: int len;
! 187:
! 188: if (s <= start) return (UChar* )s;
! 189: p = s;
! 190:
! 191: if (SJIS_ISMB_TRAIL(*p)) {
! 192: while (p > start) {
! 193: if (! SJIS_ISMB_FIRST(*--p)) {
! 194: p++;
! 195: break;
! 196: }
! 197: }
! 198: }
! 199: len = enc_len(ONIG_ENCODING_SJIS, p);
! 200: if (p + len > s) return (UChar* )p;
! 201: p += len;
! 202: return (UChar* )(p + ((s - p) & ~1));
! 203: }
! 204:
! 205: static int
! 206: sjis_is_allowed_reverse_match(const UChar* s, const UChar* end)
! 207: {
! 208: const UChar c = *s;
! 209: return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
! 210: }
! 211:
! 212: OnigEncodingType OnigEncodingSJIS = {
! 213: sjis_mbc_enc_len,
! 214: "Shift_JIS", /* name */
! 215: 2, /* max byte length */
! 216: 1, /* min byte length */
! 217: ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
! 218: {
! 219: (OnigCodePoint )'\\' /* esc */
! 220: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
! 221: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
! 222: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
! 223: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
! 224: , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
! 225: },
! 226: onigenc_is_mbc_newline_0x0a,
! 227: sjis_mbc_to_code,
! 228: sjis_code_to_mbclen,
! 229: sjis_code_to_mbc,
! 230: sjis_mbc_to_normalize,
! 231: sjis_is_mbc_ambiguous,
! 232: onigenc_ascii_get_all_pair_ambig_codes,
! 233: onigenc_nothing_get_all_comp_ambig_codes,
! 234: sjis_is_code_ctype,
! 235: onigenc_not_support_get_ctype_code_range,
! 236: sjis_left_adjust_char_head,
! 237: sjis_is_allowed_reverse_match
! 238: };
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>