Return to html.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / standard |
1.1 ! misho 1: /* ! 2: +----------------------------------------------------------------------+ ! 3: | PHP Version 5 | ! 4: +----------------------------------------------------------------------+ ! 5: | Copyright (c) 1997-2012 The PHP Group | ! 6: +----------------------------------------------------------------------+ ! 7: | This source file is subject to version 3.01 of the PHP license, | ! 8: | that is bundled with this package in the file LICENSE, and is | ! 9: | available through the world-wide-web at the following url: | ! 10: | http://www.php.net/license/3_01.txt | ! 11: | If you did not receive a copy of the PHP license and are unable to | ! 12: | obtain it through the world-wide-web, please send a note to | ! 13: | license@php.net so we can mail you a copy immediately. | ! 14: +----------------------------------------------------------------------+ ! 15: | Authors: Rasmus Lerdorf <rasmus@php.net> | ! 16: | Jaakko Hyvätti <jaakko.hyvatti@iki.fi> | ! 17: | Wez Furlong <wez@thebrainroom.com> | ! 18: +----------------------------------------------------------------------+ ! 19: */ ! 20: ! 21: /* $Id: html.c 321634 2012-01-01 13:15:04Z felipe $ */ ! 22: ! 23: /* ! 24: * HTML entity resources: ! 25: * ! 26: * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp ! 27: * http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp ! 28: * http://www.unicode.org/Public/MAPPINGS/OBSOLETE/UNI2SGML.TXT ! 29: * ! 30: * http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#h-A2 ! 31: * ! 32: */ ! 33: ! 34: #include "php.h" ! 35: #if PHP_WIN32 ! 36: #include "config.w32.h" ! 37: #else ! 38: #include <php_config.h> ! 39: #endif ! 40: #include "html.h" ! 41: #include "php_string.h" ! 42: #include "SAPI.h" ! 43: #if HAVE_LOCALE_H ! 44: #include <locale.h> ! 45: #endif ! 46: #if HAVE_LANGINFO_H ! 47: #include <langinfo.h> ! 48: #endif ! 49: ! 50: #if HAVE_MBSTRING ! 51: # include "ext/mbstring/mbstring.h" ! 52: ZEND_EXTERN_MODULE_GLOBALS(mbstring) ! 53: #endif ! 54: ! 55: enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, ! 56: cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, ! 57: cs_big5hkscs, cs_sjis, cs_eucjp, cs_koi8r, ! 58: cs_cp1251, cs_8859_5, cs_cp866, cs_macroman ! 59: }; ! 60: typedef const char *const entity_table_t; ! 61: ! 62: /* codepage 1252 is a Windows extension to iso-8859-1. */ ! 63: static entity_table_t ent_cp_1252[] = { ! 64: "euro", NULL, "sbquo", "fnof", "bdquo", "hellip", "dagger", ! 65: "Dagger", "circ", "permil", "Scaron", "lsaquo", "OElig", ! 66: NULL, NULL, NULL, NULL, "lsquo", "rsquo", "ldquo", "rdquo", ! 67: "bull", "ndash", "mdash", "tilde", "trade", "scaron", "rsaquo", ! 68: "oelig", NULL, NULL, "Yuml" ! 69: }; ! 70: ! 71: static entity_table_t ent_iso_8859_1[] = { ! 72: "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", ! 73: "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", ! 74: "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", ! 75: "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", ! 76: "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", ! 77: "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", ! 78: "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", ! 79: "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", ! 80: "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", ! 81: "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", ! 82: "atilde", "auml", "aring", "aelig", "ccedil", "egrave", ! 83: "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", ! 84: "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", ! 85: "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", ! 86: "uuml", "yacute", "thorn", "yuml" ! 87: }; ! 88: ! 89: static entity_table_t ent_iso_8859_15[] = { ! 90: "nbsp", "iexcl", "cent", "pound", "euro", "yen", "Scaron", ! 91: "sect", "scaron", "copy", "ordf", "laquo", "not", "shy", "reg", ! 92: "macr", "deg", "plusmn", "sup2", "sup3", NULL, /* Zcaron */ ! 93: "micro", "para", "middot", NULL, /* zcaron */ "sup1", "ordm", ! 94: "raquo", "OElig", "oelig", "Yuml", "iquest", "Agrave", "Aacute", ! 95: "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", ! 96: "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", ! 97: "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", ! 98: "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", ! 99: "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", ! 100: "atilde", "auml", "aring", "aelig", "ccedil", "egrave", ! 101: "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", ! 102: "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", ! 103: "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", ! 104: "uuml", "yacute", "thorn", "yuml" ! 105: }; ! 106: ! 107: static entity_table_t ent_uni_338_402[] = { ! 108: /* 338 (0x0152) */ ! 109: "OElig", "oelig", NULL, NULL, NULL, NULL, ! 110: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 111: /* 352 (0x0160) */ ! 112: "Scaron", "scaron", NULL, NULL, NULL, NULL, NULL, NULL, ! 113: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 114: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 115: /* 376 (0x0178) */ ! 116: "Yuml", NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 117: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 118: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 119: /* 400 (0x0190) */ ! 120: NULL, NULL, "fnof" ! 121: }; ! 122: ! 123: static entity_table_t ent_uni_spacing[] = { ! 124: /* 710 */ ! 125: "circ", ! 126: /* 711 - 730 */ ! 127: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 128: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 129: /* 731 - 732 */ ! 130: NULL, "tilde" ! 131: }; ! 132: ! 133: static entity_table_t ent_uni_greek[] = { ! 134: /* 913 */ ! 135: "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta", ! 136: "Iota", "Kappa", "Lambda", "Mu", "Nu", "Xi", "Omicron", "Pi", "Rho", ! 137: NULL, "Sigma", "Tau", "Upsilon", "Phi", "Chi", "Psi", "Omega", ! 138: /* 938 - 944 are not mapped */ ! 139: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 140: "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", ! 141: "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", ! 142: "sigmaf", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega", ! 143: /* 970 - 976 are not mapped */ ! 144: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 145: "thetasym", "upsih", ! 146: NULL, NULL, NULL, ! 147: "piv" ! 148: }; ! 149: ! 150: static entity_table_t ent_uni_punct[] = { ! 151: /* 8194 */ ! 152: "ensp", "emsp", NULL, NULL, NULL, NULL, NULL, ! 153: "thinsp", NULL, NULL, "zwnj", "zwj", "lrm", "rlm", ! 154: NULL, NULL, NULL, "ndash", "mdash", NULL, NULL, NULL, ! 155: /* 8216 */ ! 156: "lsquo", "rsquo", "sbquo", NULL, "ldquo", "rdquo", "bdquo", NULL, ! 157: "dagger", "Dagger", "bull", NULL, NULL, NULL, "hellip", ! 158: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "permil", NULL, ! 159: /* 8242 */ ! 160: "prime", "Prime", NULL, NULL, NULL, NULL, NULL, "lsaquo", "rsaquo", NULL, ! 161: NULL, NULL, "oline", NULL, NULL, NULL, NULL, NULL, ! 162: "frasl" ! 163: }; ! 164: ! 165: static entity_table_t ent_uni_euro[] = { ! 166: "euro" ! 167: }; ! 168: ! 169: static entity_table_t ent_uni_8465_8501[] = { ! 170: /* 8465 */ ! 171: "image", NULL, NULL, NULL, NULL, NULL, NULL, ! 172: /* 8472 */ ! 173: "weierp", NULL, NULL, NULL, ! 174: /* 8476 */ ! 175: "real", NULL, NULL, NULL, NULL, NULL, ! 176: /* 8482 */ ! 177: "trade", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 178: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 179: /* 8501 */ ! 180: "alefsym", ! 181: }; ! 182: ! 183: static entity_table_t ent_uni_8592_9002[] = { ! 184: /* 8592 (0x2190) */ ! 185: "larr", "uarr", "rarr", "darr", "harr", NULL, NULL, NULL, ! 186: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 187: /* 8608 (0x21a0) */ ! 188: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 189: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 190: /* 8624 (0x21b0) */ ! 191: NULL, NULL, NULL, NULL, NULL, "crarr", NULL, NULL, ! 192: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 193: /* 8640 (0x21c0) */ ! 194: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 195: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 196: /* 8656 (0x21d0) */ ! 197: "lArr", "uArr", "rArr", "dArr", "hArr", NULL, NULL, NULL, ! 198: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 199: /* 8672 (0x21e0) */ ! 200: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 201: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 202: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 203: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 204: /* 8704 (0x2200) */ ! 205: "forall", NULL, "part", "exist", NULL, "empty", NULL, "nabla", ! 206: "isin", "notin", NULL, "ni", NULL, NULL, NULL, "prod", ! 207: /* 8720 (0x2210) */ ! 208: NULL, "sum", "minus", NULL, NULL, NULL, NULL, "lowast", ! 209: NULL, NULL, "radic", NULL, NULL, "prop", "infin", NULL, ! 210: /* 8736 (0x2220) */ ! 211: "ang", NULL, NULL, NULL, NULL, NULL, NULL, "and", ! 212: "or", "cap", "cup", "int", NULL, NULL, NULL, NULL, ! 213: /* 8752 (0x2230) */ ! 214: NULL, NULL, NULL, NULL, "there4", NULL, NULL, NULL, ! 215: NULL, NULL, NULL, NULL, "sim", NULL, NULL, NULL, ! 216: /* 8768 (0x2240) */ ! 217: NULL, NULL, NULL, NULL, NULL, "cong", NULL, NULL, ! 218: "asymp", NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 219: /* 8784 (0x2250) */ ! 220: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 221: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 222: /* 8800 (0x2260) */ ! 223: "ne", "equiv", NULL, NULL, "le", "ge", NULL, NULL, ! 224: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 225: /* 8816 (0x2270) */ ! 226: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 227: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 228: /* 8832 (0x2280) */ ! 229: NULL, NULL, "sub", "sup", "nsub", NULL, "sube", "supe", ! 230: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 231: /* 8848 (0x2290) */ ! 232: NULL, NULL, NULL, NULL, NULL, "oplus", NULL, "otimes", ! 233: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 234: /* 8864 (0x22a0) */ ! 235: NULL, NULL, NULL, NULL, NULL, "perp", NULL, NULL, ! 236: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 237: /* 8880 (0x22b0) */ ! 238: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 239: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 240: /* 8896 (0x22c0) */ ! 241: NULL, NULL, NULL, NULL, NULL, "sdot", NULL, NULL, ! 242: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 243: /* 8912 (0x22d0) */ ! 244: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 245: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 246: /* 8928 (0x22e0) */ ! 247: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 248: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 249: /* 8944 (0x22f0) */ ! 250: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 251: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 252: /* 8960 (0x2300) */ ! 253: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 254: "lceil", "rceil", "lfloor", "rfloor", NULL, NULL, NULL, NULL, ! 255: /* 8976 (0x2310) */ ! 256: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 257: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 258: /* 8992 (0x2320) */ ! 259: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 260: NULL, "lang", "rang" ! 261: }; ! 262: ! 263: static entity_table_t ent_uni_9674[] = { ! 264: /* 9674 */ ! 265: "loz" ! 266: }; ! 267: ! 268: static entity_table_t ent_uni_9824_9830[] = { ! 269: /* 9824 */ ! 270: "spades", NULL, NULL, "clubs", NULL, "hearts", "diams" ! 271: }; ! 272: ! 273: static entity_table_t ent_koi8r[] = { ! 274: "#1105", /* "jo "*/ ! 275: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 276: NULL, NULL, NULL, NULL, NULL, "#1025", /* "JO" */ ! 277: NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 278: "#1102", "#1072", "#1073", "#1094", "#1076", "#1077", "#1092", ! 279: "#1075", "#1093", "#1080", "#1081", "#1082", "#1083", "#1084", ! 280: "#1085", "#1086", "#1087", "#1103", "#1088", "#1089", "#1090", ! 281: "#1091", "#1078", "#1074", "#1100", "#1099", "#1079", "#1096", ! 282: "#1101", "#1097", "#1095", "#1098", "#1070", "#1040", "#1041", ! 283: "#1062", "#1044", "#1045", "#1060", "#1043", "#1061", "#1048", ! 284: "#1049", "#1050", "#1051", "#1052", "#1053", "#1054", "#1055", ! 285: "#1071", "#1056", "#1057", "#1058", "#1059", "#1046", "#1042", ! 286: "#1068", "#1067", "#1047", "#1064", "#1069", "#1065", "#1063", ! 287: "#1066" ! 288: }; ! 289: ! 290: static entity_table_t ent_cp_1251[] = { ! 291: "#1026", "#1027", "#8218", "#1107", "#8222", "hellip", "dagger", ! 292: "Dagger", "euro", "permil", "#1033", "#8249", "#1034", "#1036", ! 293: "#1035", "#1039", "#1106", "#8216", "#8217", "#8219", "#8220", ! 294: "bull", "ndash", "mdash", NULL, "trade", "#1113", "#8250", ! 295: "#1114", "#1116", "#1115", "#1119", "nbsp", "#1038", "#1118", ! 296: "#1032", "curren", "#1168", "brvbar", "sect", "#1025", "copy", ! 297: "#1028", "laquo", "not", "shy", "reg", "#1031", "deg", "plusmn", ! 298: "#1030", "#1110", "#1169", "micro", "para", "middot", "#1105", ! 299: "#8470", "#1108", "raquo", "#1112", "#1029", "#1109", "#1111", ! 300: "#1040", "#1041", "#1042", "#1043", "#1044", "#1045", "#1046", ! 301: "#1047", "#1048", "#1049", "#1050", "#1051", "#1052", "#1053", ! 302: "#1054", "#1055", "#1056", "#1057", "#1058", "#1059", "#1060", ! 303: "#1061", "#1062", "#1063", "#1064", "#1065", "#1066", "#1067", ! 304: "#1068", "#1069", "#1070", "#1071", "#1072", "#1073", "#1074", ! 305: "#1075", "#1076", "#1077", "#1078", "#1079", "#1080", "#1081", ! 306: "#1082", "#1083", "#1084", "#1085", "#1086", "#1087", "#1088", ! 307: "#1089", "#1090", "#1091", "#1092", "#1093", "#1094", "#1095", ! 308: "#1096", "#1097", "#1098", "#1099", "#1100", "#1101", "#1102", ! 309: "#1103" ! 310: }; ! 311: ! 312: static entity_table_t ent_iso_8859_5[] = { ! 313: "#1056", "#1057", "#1058", "#1059", "#1060", "#1061", "#1062", ! 314: "#1063", "#1064", "#1065", "#1066", "#1067", "#1068", "#1069", ! 315: "#1070", "#1071", "#1072", "#1073", "#1074", "#1075", "#1076", ! 316: "#1077", "#1078", "#1079", "#1080", "#1081", "#1082", "#1083", ! 317: "#1084", "#1085", "#1086", "#1087", "#1088", "#1089", "#1090", ! 318: "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", ! 319: "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1104", ! 320: "#1105", "#1106", "#1107", "#1108", "#1109", "#1110", "#1111", ! 321: "#1112", "#1113", "#1114", "#1115", "#1116", "#1117", "#1118", ! 322: "#1119" ! 323: }; ! 324: ! 325: static entity_table_t ent_cp_866[] = { ! 326: ! 327: "#9492", "#9524", "#9516", "#9500", "#9472", "#9532", "#9566", ! 328: "#9567", "#9562", "#9556", "#9577", "#9574", "#9568", "#9552", ! 329: "#9580", "#9575", "#9576", "#9572", "#9573", "#9561", "#9560", ! 330: "#9554", "#9555", "#9579", "#9578", "#9496", "#9484", "#9608", ! 331: "#9604", "#9612", "#9616", "#9600", "#1088", "#1089", "#1090", ! 332: "#1091", "#1092", "#1093", "#1094", "#1095", "#1096", "#1097", ! 333: "#1098", "#1099", "#1100", "#1101", "#1102", "#1103", "#1025", ! 334: "#1105", "#1028", "#1108", "#1031", "#1111", "#1038", "#1118", ! 335: "#176", "#8729", "#183", "#8730", "#8470", "#164", "#9632", ! 336: "#160" ! 337: }; ! 338: ! 339: /* MacRoman has a couple of low-ascii chars that need mapping too */ ! 340: /* Vertical tab (ASCII 11) is often used to store line breaks inside */ ! 341: /* DB exports, this mapping changes it to a space */ ! 342: static entity_table_t ent_macroman[] = { ! 343: "sp", NULL, NULL, NULL, ! 344: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 345: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 346: NULL, NULL, NULL, NULL, NULL, "quot", NULL, ! 347: NULL, NULL, "amp", NULL, NULL, NULL, NULL, ! 348: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 349: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 350: NULL, NULL, NULL, "lt", NULL, "gt", NULL, ! 351: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 352: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 353: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 354: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 355: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 356: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 357: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 358: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 359: NULL, NULL, NULL, NULL, NULL, NULL, NULL, ! 360: NULL, "Auml", "Aring", "Ccedil", "Eacute", "Ntilde", "Ouml", ! 361: "Uuml", "aacute", "agrave", "acirc", "auml", "atilde", "aring", ! 362: "ccedil", "eacute", "egrave", "ecirc", "euml", "iacute", "igrave", ! 363: "icirc", "iuml", "ntilde", "oacute", "ograve", "ocirc", "ouml", ! 364: "otilde", "uacute", "ugrave", "ucirc", "uuml", "dagger", "deg", ! 365: "cent", "pound", "sect", "bull", "para", "szlig", "reg", ! 366: "copy", "trade", "acute", "uml", "ne", "AElig", "Oslash", ! 367: "infin", "plusmn", "le", "ge", "yen", "micro", "part", ! 368: "sum", "prod", "pi", "int", "ordf", "ordm", "Omega", ! 369: "aelig", "oslash", "iquest", "iexcl", "not", "radic", "fnof", ! 370: "asymp", "#8710", "laquo", "raquo", "hellip", "nbsp", "Agrave", ! 371: "Atilde", "Otilde", "OElig", "oelig", "ndash", "mdash", "ldquo", ! 372: "rdquo", "lsquo", "rsquo", "divide", "loz", "yuml", "Yuml", ! 373: "frasl", "euro", "lsaquo", "rsaquo", "#xFB01", "#xFB02", "Dagger", ! 374: "middot", "sbquo", "bdquo", "permil", "Acirc", "Ecirc", "Aacute", ! 375: "Euml", "Egrave", "Iacute", "Icirc", "Iuml", "Igrave", "Oacute", ! 376: "Ocirc", "#xF8FF", "Ograve", "Uacute", "Ucirc", "Ugrave", "#305", ! 377: "circ", "tilde", "macr", "#728", "#729", "#730", "cedil", ! 378: "#733", "#731", "#711" ! 379: }; ! 380: ! 381: struct html_entity_map { ! 382: enum entity_charset charset; /* charset identifier */ ! 383: unsigned int basechar; /* char code at start of table */ ! 384: unsigned int endchar; /* last char code in the table */ ! 385: entity_table_t *table; /* the table of mappings */ ! 386: }; ! 387: ! 388: static const struct html_entity_map entity_map[] = { ! 389: { cs_cp1252, 0x80, 0x9f, ent_cp_1252 }, ! 390: { cs_cp1252, 0xa0, 0xff, ent_iso_8859_1 }, ! 391: { cs_8859_1, 0xa0, 0xff, ent_iso_8859_1 }, ! 392: { cs_8859_15, 0xa0, 0xff, ent_iso_8859_15 }, ! 393: { cs_utf_8, 0xa0, 0xff, ent_iso_8859_1 }, ! 394: { cs_utf_8, 338, 402, ent_uni_338_402 }, ! 395: { cs_utf_8, 710, 732, ent_uni_spacing }, ! 396: { cs_utf_8, 913, 982, ent_uni_greek }, ! 397: { cs_utf_8, 8194, 8260, ent_uni_punct }, ! 398: { cs_utf_8, 8364, 8364, ent_uni_euro }, ! 399: { cs_utf_8, 8465, 8501, ent_uni_8465_8501 }, ! 400: { cs_utf_8, 8592, 9002, ent_uni_8592_9002 }, ! 401: { cs_utf_8, 9674, 9674, ent_uni_9674 }, ! 402: { cs_utf_8, 9824, 9830, ent_uni_9824_9830 }, ! 403: { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, ! 404: { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, ! 405: { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, ! 406: { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, ! 407: { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, ! 408: { cs_koi8r, 0xa3, 0xff, ent_koi8r }, ! 409: { cs_cp1251, 0x80, 0xff, ent_cp_1251 }, ! 410: { cs_8859_5, 0xc0, 0xff, ent_iso_8859_5 }, ! 411: { cs_cp866, 0xc0, 0xff, ent_cp_866 }, ! 412: { cs_macroman, 0x0b, 0xff, ent_macroman }, ! 413: { cs_terminator } ! 414: }; ! 415: ! 416: static const struct { ! 417: const char *codeset; ! 418: enum entity_charset charset; ! 419: } charset_map[] = { ! 420: { "ISO-8859-1", cs_8859_1 }, ! 421: { "ISO8859-1", cs_8859_1 }, ! 422: { "ISO-8859-15", cs_8859_15 }, ! 423: { "ISO8859-15", cs_8859_15 }, ! 424: { "utf-8", cs_utf_8 }, ! 425: { "cp1252", cs_cp1252 }, ! 426: { "Windows-1252", cs_cp1252 }, ! 427: { "1252", cs_cp1252 }, ! 428: { "BIG5", cs_big5 }, ! 429: { "950", cs_big5 }, ! 430: { "GB2312", cs_gb2312 }, ! 431: { "936", cs_gb2312 }, ! 432: { "BIG5-HKSCS", cs_big5hkscs }, ! 433: { "Shift_JIS", cs_sjis }, ! 434: { "SJIS", cs_sjis }, ! 435: { "932", cs_sjis }, ! 436: { "EUCJP", cs_eucjp }, ! 437: { "EUC-JP", cs_eucjp }, ! 438: { "KOI8-R", cs_koi8r }, ! 439: { "koi8-ru", cs_koi8r }, ! 440: { "koi8r", cs_koi8r }, ! 441: { "cp1251", cs_cp1251 }, ! 442: { "Windows-1251", cs_cp1251 }, ! 443: { "win-1251", cs_cp1251 }, ! 444: { "iso8859-5", cs_8859_5 }, ! 445: { "iso-8859-5", cs_8859_5 }, ! 446: { "cp866", cs_cp866 }, ! 447: { "866", cs_cp866 }, ! 448: { "ibm866", cs_cp866 }, ! 449: { "MacRoman", cs_macroman }, ! 450: { NULL } ! 451: }; ! 452: ! 453: static const struct { ! 454: unsigned short charcode; ! 455: char *entity; ! 456: int entitylen; ! 457: int flags; ! 458: } basic_entities[] = { ! 459: { '"', """, 6, ENT_HTML_QUOTE_DOUBLE }, ! 460: { '\'', "'", 6, ENT_HTML_QUOTE_SINGLE }, ! 461: { '\'', "'", 5, ENT_HTML_QUOTE_SINGLE }, ! 462: { '<', "<", 4, 0 }, ! 463: { '>', ">", 4, 0 }, ! 464: { 0, NULL, 0, 0 } ! 465: }; ! 466: ! 467: struct basic_entities_dec { ! 468: unsigned short charcode; ! 469: char entity[8]; ! 470: int entitylen; ! 471: }; ! 472: ! 473: #define MB_RETURN { \ ! 474: *newpos = pos; \ ! 475: mbseq[mbpos] = '\0'; \ ! 476: *mbseqlen = mbpos; \ ! 477: return this_char; } ! 478: ! 479: #define MB_WRITE(mbchar) { \ ! 480: mbspace--; \ ! 481: if (mbspace == 0) { \ ! 482: MB_RETURN; \ ! 483: } \ ! 484: mbseq[mbpos++] = (mbchar); } ! 485: ! 486: /* skip one byte and return */ ! 487: #define MB_FAILURE(pos) do { \ ! 488: *newpos = pos + 1; \ ! 489: *status = FAILURE; \ ! 490: return 0; \ ! 491: } while (0) ! 492: ! 493: #define CHECK_LEN(pos, chars_need) \ ! 494: if (chars_need < 1) { \ ! 495: if((str_len - (pos)) < chars_need) { \ ! 496: *newpos = pos; \ ! 497: *status = FAILURE; \ ! 498: return 0; \ ! 499: } \ ! 500: } else { \ ! 501: if((str_len - (pos)) < chars_need) { \ ! 502: *newpos = pos + 1; \ ! 503: *status = FAILURE; \ ! 504: return 0; \ ! 505: } \ ! 506: } ! 507: ! 508: /* {{{ get_next_char ! 509: */ ! 510: inline static unsigned int get_next_char(enum entity_charset charset, ! 511: unsigned char * str, ! 512: int str_len, ! 513: int * newpos, ! 514: unsigned char * mbseq, ! 515: int * mbseqlen, ! 516: int *status) ! 517: { ! 518: int pos = *newpos; ! 519: int mbpos = 0; ! 520: int mbspace = *mbseqlen; ! 521: unsigned int this_char = 0; ! 522: unsigned char next_char; ! 523: ! 524: *status = SUCCESS; ! 525: ! 526: if (mbspace <= 0) { ! 527: *mbseqlen = 0; ! 528: CHECK_LEN(pos, 1); ! 529: *newpos = pos + 1; ! 530: return str[pos]; ! 531: } ! 532: ! 533: switch (charset) { ! 534: case cs_utf_8: ! 535: { ! 536: unsigned char c; ! 537: CHECK_LEN(pos, 1); ! 538: c = str[pos]; ! 539: if (c < 0x80) { ! 540: MB_WRITE(c); ! 541: this_char = c; ! 542: pos++; ! 543: } else if (c < 0xc2) { ! 544: MB_FAILURE(pos); ! 545: } else if (c < 0xe0) { ! 546: CHECK_LEN(pos, 2); ! 547: if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { ! 548: MB_FAILURE(pos); ! 549: } ! 550: this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f); ! 551: if (this_char < 0x80) { ! 552: MB_FAILURE(pos); ! 553: } ! 554: MB_WRITE((unsigned char)c); ! 555: MB_WRITE((unsigned char)str[pos + 1]); ! 556: pos += 2; ! 557: } else if (c < 0xf0) { ! 558: CHECK_LEN(pos, 3); ! 559: if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { ! 560: MB_FAILURE(pos); ! 561: } ! 562: if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { ! 563: MB_FAILURE(pos); ! 564: } ! 565: this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f); ! 566: if (this_char < 0x800) { ! 567: MB_FAILURE(pos); ! 568: } else if (this_char >= 0xd800 && this_char <= 0xdfff) { ! 569: MB_FAILURE(pos); ! 570: } ! 571: MB_WRITE((unsigned char)c); ! 572: MB_WRITE((unsigned char)str[pos + 1]); ! 573: MB_WRITE((unsigned char)str[pos + 2]); ! 574: pos += 3; ! 575: } else if (c < 0xf5) { ! 576: CHECK_LEN(pos, 4); ! 577: if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { ! 578: MB_FAILURE(pos); ! 579: } ! 580: if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { ! 581: MB_FAILURE(pos); ! 582: } ! 583: if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) { ! 584: MB_FAILURE(pos); ! 585: } ! 586: this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); ! 587: if (this_char < 0x10000 || this_char > 0x10FFFF) { ! 588: MB_FAILURE(pos); ! 589: } ! 590: MB_WRITE((unsigned char)c); ! 591: MB_WRITE((unsigned char)str[pos + 1]); ! 592: MB_WRITE((unsigned char)str[pos + 2]); ! 593: MB_WRITE((unsigned char)str[pos + 3]); ! 594: pos += 4; ! 595: } else { ! 596: MB_FAILURE(pos); ! 597: } ! 598: } ! 599: break; ! 600: case cs_big5: ! 601: case cs_gb2312: ! 602: case cs_big5hkscs: ! 603: { ! 604: CHECK_LEN(pos, 1); ! 605: this_char = str[pos++]; ! 606: /* check if this is the first of a 2-byte sequence */ ! 607: if (this_char >= 0x81 && this_char <= 0xfe) { ! 608: /* peek at the next char */ ! 609: CHECK_LEN(pos, 1); ! 610: next_char = str[pos++]; ! 611: if ((next_char >= 0x40 && next_char <= 0x7e) || ! 612: (next_char >= 0xa1 && next_char <= 0xfe)) { ! 613: /* yes, this a wide char */ ! 614: MB_WRITE(this_char); ! 615: MB_WRITE(next_char); ! 616: this_char = (this_char << 8) | next_char; ! 617: } else { ! 618: MB_FAILURE(pos); ! 619: } ! 620: } else { ! 621: MB_WRITE(this_char); ! 622: } ! 623: } ! 624: break; ! 625: case cs_sjis: ! 626: { ! 627: CHECK_LEN(pos, 1); ! 628: this_char = str[pos++]; ! 629: /* check if this is the first of a 2-byte sequence */ ! 630: if ((this_char >= 0x81 && this_char <= 0x9f) || ! 631: (this_char >= 0xe0 && this_char <= 0xfc)) { ! 632: /* peek at the next char */ ! 633: CHECK_LEN(pos, 1); ! 634: next_char = str[pos++]; ! 635: if ((next_char >= 0x40 && next_char <= 0x7e) || ! 636: (next_char >= 0x80 && next_char <= 0xfc)) ! 637: { ! 638: /* yes, this a wide char */ ! 639: MB_WRITE(this_char); ! 640: MB_WRITE(next_char); ! 641: this_char = (this_char << 8) | next_char; ! 642: } else { ! 643: MB_FAILURE(pos); ! 644: } ! 645: } else { ! 646: MB_WRITE(this_char); ! 647: } ! 648: break; ! 649: } ! 650: case cs_eucjp: ! 651: { ! 652: CHECK_LEN(pos, 1); ! 653: this_char = str[pos++]; ! 654: /* check if this is the first of a multi-byte sequence */ ! 655: if (this_char >= 0xa1 && this_char <= 0xfe) { ! 656: /* peek at the next char */ ! 657: CHECK_LEN(pos, 1); ! 658: next_char = str[pos++]; ! 659: if (next_char >= 0xa1 && next_char <= 0xfe) { ! 660: /* yes, this a jis kanji char */ ! 661: MB_WRITE(this_char); ! 662: MB_WRITE(next_char); ! 663: this_char = (this_char << 8) | next_char; ! 664: } else { ! 665: MB_FAILURE(pos); ! 666: } ! 667: } else if (this_char == 0x8e) { ! 668: /* peek at the next char */ ! 669: CHECK_LEN(pos, 1); ! 670: next_char = str[pos++]; ! 671: if (next_char >= 0xa1 && next_char <= 0xdf) { ! 672: /* JIS X 0201 kana */ ! 673: MB_WRITE(this_char); ! 674: MB_WRITE(next_char); ! 675: this_char = (this_char << 8) | next_char; ! 676: } else { ! 677: MB_FAILURE(pos); ! 678: } ! 679: } else if (this_char == 0x8f) { ! 680: /* peek at the next two char */ ! 681: unsigned char next2_char; ! 682: CHECK_LEN(pos, 2); ! 683: next_char = str[pos]; ! 684: next2_char = str[pos + 1]; ! 685: pos += 2; ! 686: if ((next_char >= 0xa1 && next_char <= 0xfe) && ! 687: (next2_char >= 0xa1 && next2_char <= 0xfe)) { ! 688: /* JIS X 0212 hojo-kanji */ ! 689: MB_WRITE(this_char); ! 690: MB_WRITE(next_char); ! 691: MB_WRITE(next2_char); ! 692: this_char = (this_char << 16) | (next_char << 8) | next2_char; ! 693: } else { ! 694: MB_FAILURE(pos); ! 695: } ! 696: } else { ! 697: MB_WRITE(this_char); ! 698: } ! 699: break; ! 700: } ! 701: default: ! 702: /* single-byte charsets */ ! 703: CHECK_LEN(pos, 1); ! 704: this_char = str[pos++]; ! 705: MB_WRITE(this_char); ! 706: break; ! 707: } ! 708: MB_RETURN; ! 709: } ! 710: /* }}} */ ! 711: ! 712: /* {{{ entity_charset determine_charset ! 713: * returns the charset identifier based on current locale or a hint. ! 714: * defaults to iso-8859-1 */ ! 715: static enum entity_charset determine_charset(char *charset_hint TSRMLS_DC) ! 716: { ! 717: int i; ! 718: enum entity_charset charset = cs_8859_1; ! 719: int len = 0; ! 720: zval *uf_result = NULL; ! 721: ! 722: /* Guarantee default behaviour for backwards compatibility */ ! 723: if (charset_hint == NULL) ! 724: return cs_8859_1; ! 725: ! 726: if ((len = strlen(charset_hint)) != 0) { ! 727: goto det_charset; ! 728: } ! 729: #if HAVE_MBSTRING ! 730: #if !defined(COMPILE_DL_MBSTRING) ! 731: /* XXX: Ugly things. Why don't we look for a more sophisticated way? */ ! 732: switch (MBSTRG(current_internal_encoding)) { ! 733: case mbfl_no_encoding_8859_1: ! 734: return cs_8859_1; ! 735: ! 736: case mbfl_no_encoding_utf8: ! 737: return cs_utf_8; ! 738: ! 739: case mbfl_no_encoding_euc_jp: ! 740: case mbfl_no_encoding_eucjp_win: ! 741: return cs_eucjp; ! 742: ! 743: case mbfl_no_encoding_sjis: ! 744: case mbfl_no_encoding_sjis_open: ! 745: case mbfl_no_encoding_cp932: ! 746: return cs_sjis; ! 747: ! 748: case mbfl_no_encoding_cp1252: ! 749: return cs_cp1252; ! 750: ! 751: case mbfl_no_encoding_8859_15: ! 752: return cs_8859_15; ! 753: ! 754: case mbfl_no_encoding_big5: ! 755: return cs_big5; ! 756: ! 757: case mbfl_no_encoding_euc_cn: ! 758: case mbfl_no_encoding_hz: ! 759: case mbfl_no_encoding_cp936: ! 760: return cs_gb2312; ! 761: ! 762: case mbfl_no_encoding_koi8r: ! 763: return cs_koi8r; ! 764: ! 765: case mbfl_no_encoding_cp866: ! 766: return cs_cp866; ! 767: ! 768: case mbfl_no_encoding_cp1251: ! 769: return cs_cp1251; ! 770: ! 771: case mbfl_no_encoding_8859_5: ! 772: return cs_8859_5; ! 773: ! 774: default: ! 775: ; ! 776: } ! 777: #else ! 778: { ! 779: zval nm_mb_internal_encoding; ! 780: ! 781: ZVAL_STRING(&nm_mb_internal_encoding, "mb_internal_encoding", 0); ! 782: ! 783: if (call_user_function_ex(CG(function_table), NULL, &nm_mb_internal_encoding, &uf_result, 0, NULL, 1, NULL TSRMLS_CC) != FAILURE) { ! 784: ! 785: charset_hint = Z_STRVAL_P(uf_result); ! 786: len = Z_STRLEN_P(uf_result); ! 787: ! 788: if (len == 4) { /* sizeof(none|auto|pass)-1 */ ! 789: if (!memcmp("pass", charset_hint, sizeof("pass") - 1) || ! 790: !memcmp("auto", charset_hint, sizeof("auto") - 1) || ! 791: !memcmp("none", charset_hint, sizeof("none") - 1)) { ! 792: ! 793: charset_hint = NULL; ! 794: len = 0; ! 795: } ! 796: } ! 797: goto det_charset; ! 798: } ! 799: } ! 800: #endif ! 801: #endif ! 802: ! 803: charset_hint = SG(default_charset); ! 804: if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { ! 805: goto det_charset; ! 806: } ! 807: ! 808: /* try to detect the charset for the locale */ ! 809: #if HAVE_NL_LANGINFO && HAVE_LOCALE_H && defined(CODESET) ! 810: charset_hint = nl_langinfo(CODESET); ! 811: if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) { ! 812: goto det_charset; ! 813: } ! 814: #endif ! 815: ! 816: #if HAVE_LOCALE_H ! 817: /* try to figure out the charset from the locale */ ! 818: { ! 819: char *localename; ! 820: char *dot, *at; ! 821: ! 822: /* lang[_territory][.codeset][@modifier] */ ! 823: localename = setlocale(LC_CTYPE, NULL); ! 824: ! 825: dot = strchr(localename, '.'); ! 826: if (dot) { ! 827: dot++; ! 828: /* locale specifies a codeset */ ! 829: at = strchr(dot, '@'); ! 830: if (at) ! 831: len = at - dot; ! 832: else ! 833: len = strlen(dot); ! 834: charset_hint = dot; ! 835: } else { ! 836: /* no explicit name; see if the name itself ! 837: * is the charset */ ! 838: charset_hint = localename; ! 839: len = strlen(charset_hint); ! 840: } ! 841: } ! 842: #endif ! 843: ! 844: det_charset: ! 845: ! 846: if (charset_hint) { ! 847: int found = 0; ! 848: ! 849: /* now walk the charset map and look for the codeset */ ! 850: for (i = 0; charset_map[i].codeset; i++) { ! 851: if (len == strlen(charset_map[i].codeset) && strncasecmp(charset_hint, charset_map[i].codeset, len) == 0) { ! 852: charset = charset_map[i].charset; ! 853: found = 1; ! 854: break; ! 855: } ! 856: } ! 857: if (!found) { ! 858: php_error_docref(NULL TSRMLS_CC, E_WARNING, "charset `%s' not supported, assuming iso-8859-1", ! 859: charset_hint); ! 860: } ! 861: } ! 862: if (uf_result != NULL) { ! 863: zval_ptr_dtor(&uf_result); ! 864: } ! 865: return charset; ! 866: } ! 867: /* }}} */ ! 868: ! 869: /* {{{ php_utf32_utf8 */ ! 870: size_t php_utf32_utf8(unsigned char *buf, unsigned k) ! 871: { ! 872: size_t retval = 0; ! 873: ! 874: if (k < 0x80) { ! 875: buf[0] = k; ! 876: retval = 1; ! 877: } else if (k < 0x800) { ! 878: buf[0] = 0xc0 | (k >> 6); ! 879: buf[1] = 0x80 | (k & 0x3f); ! 880: retval = 2; ! 881: } else if (k < 0x10000) { ! 882: buf[0] = 0xe0 | (k >> 12); ! 883: buf[1] = 0x80 | ((k >> 6) & 0x3f); ! 884: buf[2] = 0x80 | (k & 0x3f); ! 885: retval = 3; ! 886: } else if (k < 0x200000) { ! 887: buf[0] = 0xf0 | (k >> 18); ! 888: buf[1] = 0x80 | ((k >> 12) & 0x3f); ! 889: buf[2] = 0x80 | ((k >> 6) & 0x3f); ! 890: buf[3] = 0x80 | (k & 0x3f); ! 891: retval = 4; ! 892: } else if (k < 0x4000000) { ! 893: buf[0] = 0xf8 | (k >> 24); ! 894: buf[1] = 0x80 | ((k >> 18) & 0x3f); ! 895: buf[2] = 0x80 | ((k >> 12) & 0x3f); ! 896: buf[3] = 0x80 | ((k >> 6) & 0x3f); ! 897: buf[4] = 0x80 | (k & 0x3f); ! 898: retval = 5; ! 899: } else { ! 900: buf[0] = 0xfc | (k >> 30); ! 901: buf[1] = 0x80 | ((k >> 24) & 0x3f); ! 902: buf[2] = 0x80 | ((k >> 18) & 0x3f); ! 903: buf[3] = 0x80 | ((k >> 12) & 0x3f); ! 904: buf[4] = 0x80 | ((k >> 6) & 0x3f); ! 905: buf[5] = 0x80 | (k & 0x3f); ! 906: retval = 6; ! 907: } ! 908: buf[retval] = '\0'; ! 909: ! 910: return retval; ! 911: } ! 912: /* }}} */ ! 913: ! 914: /* {{{ php_unescape_html_entities ! 915: */ ! 916: PHPAPI char *php_unescape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) ! 917: { ! 918: int retlen; ! 919: int j, k; ! 920: char *replaced, *ret, *p, *q, *lim, *next; ! 921: enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); ! 922: unsigned char replacement[15]; ! 923: int replacement_len; ! 924: ! 925: ret = estrndup(old, oldlen); ! 926: retlen = oldlen; ! 927: if (!retlen) { ! 928: goto empty_source; ! 929: } ! 930: ! 931: if (all) { ! 932: /* look for a match in the maps for this charset */ ! 933: for (j = 0; entity_map[j].charset != cs_terminator; j++) { ! 934: if (entity_map[j].charset != charset) ! 935: continue; ! 936: ! 937: for (k = entity_map[j].basechar; k <= entity_map[j].endchar; k++) { ! 938: unsigned char entity[32]; ! 939: int entity_length = 0; ! 940: ! 941: if (entity_map[j].table[k - entity_map[j].basechar] == NULL) ! 942: continue; ! 943: ! 944: entity_length = slprintf(entity, sizeof(entity), "&%s;", entity_map[j].table[k - entity_map[j].basechar]); ! 945: if (entity_length >= sizeof(entity)) { ! 946: continue; ! 947: } ! 948: ! 949: /* When we have MBCS entities in the tables above, this will need to handle it */ ! 950: replacement_len = 0; ! 951: switch (charset) { ! 952: case cs_8859_1: ! 953: case cs_cp1252: ! 954: case cs_8859_15: ! 955: case cs_cp1251: ! 956: case cs_8859_5: ! 957: case cs_cp866: ! 958: case cs_koi8r: ! 959: replacement[0] = k; ! 960: replacement[1] = '\0'; ! 961: replacement_len = 1; ! 962: break; ! 963: ! 964: case cs_big5: ! 965: case cs_gb2312: ! 966: case cs_big5hkscs: ! 967: case cs_sjis: ! 968: case cs_eucjp: ! 969: /* we cannot properly handle those multibyte encodings ! 970: * with php_str_to_str. skip it. */ ! 971: continue; ! 972: ! 973: case cs_utf_8: ! 974: replacement_len = php_utf32_utf8(replacement, k); ! 975: break; ! 976: ! 977: default: ! 978: php_error_docref(NULL TSRMLS_CC, E_WARNING, "cannot yet handle MBCS!"); ! 979: efree(ret); ! 980: return NULL; ! 981: } ! 982: ! 983: if (php_memnstr(ret, entity, entity_length, ret+retlen)) { ! 984: replaced = php_str_to_str(ret, retlen, entity, entity_length, replacement, replacement_len, &retlen); ! 985: efree(ret); ! 986: ret = replaced; ! 987: } ! 988: } ! 989: } ! 990: } ! 991: ! 992: for (j = 0; basic_entities[j].charcode != 0; j++) { ! 993: ! 994: if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) ! 995: continue; ! 996: ! 997: replacement[0] = (unsigned char)basic_entities[j].charcode; ! 998: replacement[1] = '\0'; ! 999: ! 1000: if (php_memnstr(ret, basic_entities[j].entity, basic_entities[j].entitylen, ret+retlen)) { ! 1001: replaced = php_str_to_str(ret, retlen, basic_entities[j].entity, basic_entities[j].entitylen, replacement, 1, &retlen); ! 1002: efree(ret); ! 1003: ret = replaced; ! 1004: } ! 1005: } ! 1006: ! 1007: /* replace numeric entities & "&" */ ! 1008: lim = ret + retlen; ! 1009: for (p = ret, q = ret; p < lim;) { ! 1010: int code; ! 1011: ! 1012: if (p[0] == '&') { ! 1013: if (p + 2 < lim) { ! 1014: if (p[1] == '#') { ! 1015: int invalid_code = 0; ! 1016: ! 1017: if (p[2] == 'x' || p[2] == 'X') { ! 1018: code = strtol(p + 3, &next, 16); ! 1019: } else { ! 1020: code = strtol(p + 2, &next, 10); ! 1021: } ! 1022: ! 1023: if ((code == '\'' && !(quote_style & ENT_HTML_QUOTE_SINGLE)) || ! 1024: (code == '"' && !(quote_style & ENT_HTML_QUOTE_DOUBLE))) { ! 1025: invalid_code = 1; ! 1026: } ! 1027: ! 1028: if (next != NULL && *next == ';' && !invalid_code) { ! 1029: switch (charset) { ! 1030: case cs_utf_8: ! 1031: q += php_utf32_utf8(q, code); ! 1032: break; ! 1033: ! 1034: case cs_8859_1: ! 1035: case cs_8859_5: ! 1036: case cs_8859_15: ! 1037: if ((code >= 0x80 && code < 0xa0) || code > 0xff) { ! 1038: invalid_code = 1; ! 1039: } else { ! 1040: *(q++) = code; ! 1041: } ! 1042: break; ! 1043: ! 1044: case cs_cp1252: ! 1045: if (code > 0xff) { ! 1046: invalid_code = 1; ! 1047: } else { ! 1048: *(q++) = code; ! 1049: } ! 1050: break; ! 1051: ! 1052: case cs_cp1251: ! 1053: case cs_cp866: ! 1054: case cs_big5: ! 1055: case cs_big5hkscs: ! 1056: case cs_sjis: ! 1057: case cs_eucjp: ! 1058: if (code >= 0x80) { ! 1059: invalid_code = 1; ! 1060: } else { ! 1061: *(q++) = code; ! 1062: } ! 1063: break; ! 1064: ! 1065: case cs_gb2312: ! 1066: if (code >= 0x81) { ! 1067: invalid_code = 1; ! 1068: } else { ! 1069: *(q++) = code; ! 1070: } ! 1071: break; ! 1072: ! 1073: default: ! 1074: /* for backwards compatilibity */ ! 1075: invalid_code = 1; ! 1076: break; ! 1077: } ! 1078: if (invalid_code) { ! 1079: for (; p <= next; p++) { ! 1080: *(q++) = *p; ! 1081: } ! 1082: } ! 1083: p = next + 1; ! 1084: } else { ! 1085: *(q++) = *(p++); ! 1086: *(q++) = *(p++); ! 1087: } ! 1088: } else if (p + 4 < lim && ! 1089: p[1] == 'a' && p[2] == 'm' &&p[3] == 'p' && ! 1090: p[4] == ';') { ! 1091: *(q++) = '&'; ! 1092: p += 5; ! 1093: } else { ! 1094: *(q++) = *(p++); ! 1095: *(q++) = *(p++); ! 1096: } ! 1097: } else { ! 1098: *(q++) = *(p++); ! 1099: } ! 1100: } else { ! 1101: *(q++) = *(p++); ! 1102: } ! 1103: } ! 1104: *q = '\0'; ! 1105: retlen = (size_t)(q - ret); ! 1106: empty_source: ! 1107: *newlen = retlen; ! 1108: return ret; ! 1109: } ! 1110: /* }}} */ ! 1111: ! 1112: PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) ! 1113: { ! 1114: return php_escape_html_entities_ex(old, oldlen, newlen, all, quote_style, hint_charset, 1 TSRMLS_CC); ! 1115: } ! 1116: ! 1117: ! 1118: /* {{{ php_escape_html_entities ! 1119: */ ! 1120: PHPAPI char *php_escape_html_entities_ex(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset, zend_bool double_encode TSRMLS_DC) ! 1121: { ! 1122: int i, j, maxlen, len; ! 1123: char *replaced; ! 1124: enum entity_charset charset = determine_charset(hint_charset TSRMLS_CC); ! 1125: int matches_map; ! 1126: ! 1127: maxlen = 2 * oldlen; ! 1128: if (maxlen < 128) ! 1129: maxlen = 128; ! 1130: replaced = emalloc (maxlen); ! 1131: len = 0; ! 1132: i = 0; ! 1133: while (i < oldlen) { ! 1134: unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */ ! 1135: int mbseqlen = sizeof(mbsequence); ! 1136: int status = SUCCESS; ! 1137: unsigned int this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status); ! 1138: ! 1139: if(status == FAILURE) { ! 1140: /* invalid MB sequence */ ! 1141: if (quote_style & ENT_HTML_IGNORE_ERRORS) { ! 1142: continue; ! 1143: } ! 1144: efree(replaced); ! 1145: if(!PG(display_errors)) { ! 1146: php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument"); ! 1147: } ! 1148: *newlen = 0; ! 1149: return STR_EMPTY_ALLOC(); ! 1150: } ! 1151: matches_map = 0; ! 1152: ! 1153: if (len + 16 > maxlen) ! 1154: replaced = erealloc (replaced, maxlen += 128); ! 1155: ! 1156: if (all) { ! 1157: /* look for a match in the maps for this charset */ ! 1158: unsigned char *rep = NULL; ! 1159: ! 1160: ! 1161: for (j = 0; entity_map[j].charset != cs_terminator; j++) { ! 1162: if (entity_map[j].charset == charset ! 1163: && this_char >= entity_map[j].basechar ! 1164: && this_char <= entity_map[j].endchar) { ! 1165: rep = (unsigned char*)entity_map[j].table[this_char - entity_map[j].basechar]; ! 1166: if (rep == NULL) { ! 1167: /* there is no entity for this position; fall through and ! 1168: * just output the character itself */ ! 1169: break; ! 1170: } ! 1171: ! 1172: matches_map = 1; ! 1173: break; ! 1174: } ! 1175: } ! 1176: ! 1177: if (matches_map) { ! 1178: int l = strlen(rep); ! 1179: /* increase the buffer size */ ! 1180: if (len + 2 + l >= maxlen) { ! 1181: replaced = erealloc(replaced, maxlen += 128); ! 1182: } ! 1183: ! 1184: replaced[len++] = '&'; ! 1185: strlcpy(replaced + len, rep, maxlen); ! 1186: len += l; ! 1187: replaced[len++] = ';'; ! 1188: } ! 1189: } ! 1190: if (!matches_map) { ! 1191: int is_basic = 0; ! 1192: ! 1193: if (this_char == '&') { ! 1194: if (double_encode) { ! 1195: encode_amp: ! 1196: memcpy(replaced + len, "&", sizeof("&") - 1); ! 1197: len += sizeof("&") - 1; ! 1198: } else { ! 1199: char *e = memchr(old + i, ';', oldlen - i); ! 1200: char *s = old + i; ! 1201: ! 1202: if (!e || (e - s) > 10) { /* minor optimization to avoid "entities" over 10 chars in length */ ! 1203: goto encode_amp; ! 1204: } else { ! 1205: if (*s == '#') { /* numeric entities */ ! 1206: s++; ! 1207: /* Hex (Z) */ ! 1208: if (*s == 'x' || *s == 'X') { ! 1209: s++; ! 1210: while (s < e) { ! 1211: if (!isxdigit((int)*(unsigned char *)s++)) { ! 1212: goto encode_amp; ! 1213: } ! 1214: } ! 1215: /* Dec (Z)*/ ! 1216: } else { ! 1217: while (s < e) { ! 1218: if (!isdigit((int)*(unsigned char *)s++)) { ! 1219: goto encode_amp; ! 1220: } ! 1221: } ! 1222: } ! 1223: } else { /* text entities */ ! 1224: while (s < e) { ! 1225: if (!isalnum((int)*(unsigned char *)s++)) { ! 1226: goto encode_amp; ! 1227: } ! 1228: } ! 1229: } ! 1230: replaced[len++] = '&'; ! 1231: } ! 1232: } ! 1233: is_basic = 1; ! 1234: } else { ! 1235: for (j = 0; basic_entities[j].charcode != 0; j++) { ! 1236: if ((basic_entities[j].charcode != this_char) || ! 1237: (basic_entities[j].flags && ! 1238: (quote_style & basic_entities[j].flags) == 0)) { ! 1239: continue; ! 1240: } ! 1241: ! 1242: memcpy(replaced + len, basic_entities[j].entity, basic_entities[j].entitylen); ! 1243: len += basic_entities[j].entitylen; ! 1244: ! 1245: is_basic = 1; ! 1246: break; ! 1247: } ! 1248: } ! 1249: ! 1250: if (!is_basic) { ! 1251: /* a wide char without a named entity; pass through the original sequence */ ! 1252: if (mbseqlen > 1) { ! 1253: memcpy(replaced + len, mbsequence, mbseqlen); ! 1254: len += mbseqlen; ! 1255: } else { ! 1256: replaced[len++] = (unsigned char)this_char; ! 1257: } ! 1258: } ! 1259: } ! 1260: } ! 1261: replaced[len] = '\0'; ! 1262: *newlen = len; ! 1263: ! 1264: return replaced; ! 1265: ! 1266: ! 1267: } ! 1268: /* }}} */ ! 1269: ! 1270: /* {{{ php_html_entities ! 1271: */ ! 1272: static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) ! 1273: { ! 1274: char *str, *hint_charset = NULL; ! 1275: int str_len, hint_charset_len = 0; ! 1276: int len; ! 1277: long quote_style = ENT_COMPAT; ! 1278: char *replaced; ! 1279: zend_bool double_encode = 1; ! 1280: ! 1281: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls!b", &str, &str_len, "e_style, &hint_charset, &hint_charset_len, &double_encode) == FAILURE) { ! 1282: return; ! 1283: } ! 1284: ! 1285: replaced = php_escape_html_entities_ex(str, str_len, &len, all, quote_style, hint_charset, double_encode TSRMLS_CC); ! 1286: RETVAL_STRINGL(replaced, len, 0); ! 1287: } ! 1288: /* }}} */ ! 1289: ! 1290: #define HTML_SPECIALCHARS 0 ! 1291: #define HTML_ENTITIES 1 ! 1292: ! 1293: /* {{{ register_html_constants ! 1294: */ ! 1295: void register_html_constants(INIT_FUNC_ARGS) ! 1296: { ! 1297: REGISTER_LONG_CONSTANT("HTML_SPECIALCHARS", HTML_SPECIALCHARS, CONST_PERSISTENT|CONST_CS); ! 1298: REGISTER_LONG_CONSTANT("HTML_ENTITIES", HTML_ENTITIES, CONST_PERSISTENT|CONST_CS); ! 1299: REGISTER_LONG_CONSTANT("ENT_COMPAT", ENT_COMPAT, CONST_PERSISTENT|CONST_CS); ! 1300: REGISTER_LONG_CONSTANT("ENT_QUOTES", ENT_QUOTES, CONST_PERSISTENT|CONST_CS); ! 1301: REGISTER_LONG_CONSTANT("ENT_NOQUOTES", ENT_NOQUOTES, CONST_PERSISTENT|CONST_CS); ! 1302: REGISTER_LONG_CONSTANT("ENT_IGNORE", ENT_IGNORE, CONST_PERSISTENT|CONST_CS); ! 1303: } ! 1304: /* }}} */ ! 1305: ! 1306: /* {{{ proto string htmlspecialchars(string string [, int quote_style[, string charset[, bool double_encode]]]) ! 1307: Convert special characters to HTML entities */ ! 1308: PHP_FUNCTION(htmlspecialchars) ! 1309: { ! 1310: php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); ! 1311: } ! 1312: /* }}} */ ! 1313: ! 1314: /* {{{ proto string htmlspecialchars_decode(string string [, int quote_style]) ! 1315: Convert special HTML entities back to characters */ ! 1316: PHP_FUNCTION(htmlspecialchars_decode) ! 1317: { ! 1318: char *str, *new_str, *e, *p; ! 1319: int len, j, i, new_len; ! 1320: long quote_style = ENT_COMPAT; ! 1321: struct basic_entities_dec basic_entities_dec[8]; ! 1322: ! 1323: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|l", &str, &len, "e_style) == FAILURE) { ! 1324: return; ! 1325: } ! 1326: ! 1327: new_str = estrndup(str, len); ! 1328: new_len = len; ! 1329: e = new_str + new_len; ! 1330: ! 1331: if (!(p = memchr(new_str, '&', new_len))) { ! 1332: RETURN_STRINGL(new_str, new_len, 0); ! 1333: } ! 1334: ! 1335: for (j = 0, i = 0; basic_entities[i].charcode != 0; i++) { ! 1336: if (basic_entities[i].flags && !(quote_style & basic_entities[i].flags)) { ! 1337: continue; ! 1338: } ! 1339: basic_entities_dec[j].charcode = basic_entities[i].charcode; ! 1340: memcpy(basic_entities_dec[j].entity, basic_entities[i].entity, basic_entities[i].entitylen + 1); ! 1341: basic_entities_dec[j].entitylen = basic_entities[i].entitylen; ! 1342: j++; ! 1343: } ! 1344: basic_entities_dec[j].charcode = '&'; ! 1345: basic_entities_dec[j].entitylen = sizeof("&") - 1; ! 1346: memcpy(basic_entities_dec[j].entity, "&", sizeof("&")); ! 1347: i = j + 1; ! 1348: ! 1349: do { ! 1350: int l = e - p; ! 1351: ! 1352: for (j = 0; j < i; j++) { ! 1353: if (basic_entities_dec[j].entitylen > l) { ! 1354: continue; ! 1355: } ! 1356: if (!memcmp(p, basic_entities_dec[j].entity, basic_entities_dec[j].entitylen)) { ! 1357: int e_len = basic_entities_dec[j].entitylen - 1; ! 1358: ! 1359: *p++ = basic_entities_dec[j].charcode; ! 1360: memmove(p, p + e_len, (e - p - e_len)); ! 1361: e -= e_len; ! 1362: goto done; ! 1363: } ! 1364: } ! 1365: p++; ! 1366: ! 1367: done: ! 1368: if (p >= e) { ! 1369: break; ! 1370: } ! 1371: } while ((p = memchr(p, '&', (e - p)))); ! 1372: ! 1373: new_len = e - new_str; ! 1374: ! 1375: new_str[new_len] = '\0'; ! 1376: RETURN_STRINGL(new_str, new_len, 0); ! 1377: } ! 1378: /* }}} */ ! 1379: ! 1380: /* {{{ proto string html_entity_decode(string string [, int quote_style][, string charset]) ! 1381: Convert all HTML entities to their applicable characters */ ! 1382: PHP_FUNCTION(html_entity_decode) ! 1383: { ! 1384: char *str, *hint_charset = NULL; ! 1385: int str_len, hint_charset_len = 0, len; ! 1386: long quote_style = ENT_COMPAT; ! 1387: char *replaced; ! 1388: ! 1389: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|ls", &str, &str_len, ! 1390: "e_style, &hint_charset, &hint_charset_len) == FAILURE) { ! 1391: return; ! 1392: } ! 1393: ! 1394: replaced = php_unescape_html_entities(str, str_len, &len, 1, quote_style, hint_charset TSRMLS_CC); ! 1395: if (replaced) { ! 1396: RETURN_STRINGL(replaced, len, 0); ! 1397: } ! 1398: RETURN_FALSE; ! 1399: } ! 1400: /* }}} */ ! 1401: ! 1402: ! 1403: /* {{{ proto string htmlentities(string string [, int quote_style[, string charset[, bool double_encode]]]) ! 1404: Convert all applicable characters to HTML entities */ ! 1405: PHP_FUNCTION(htmlentities) ! 1406: { ! 1407: php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1); ! 1408: } ! 1409: /* }}} */ ! 1410: ! 1411: /* {{{ proto array get_html_translation_table([int table [, int quote_style [, string charset_hint]]]) ! 1412: Returns the internal translation table used by htmlspecialchars and htmlentities */ ! 1413: PHP_FUNCTION(get_html_translation_table) ! 1414: { ! 1415: long which = HTML_SPECIALCHARS, quote_style = ENT_COMPAT; ! 1416: unsigned int i; ! 1417: int j; ! 1418: unsigned char ind[5]; /* max # of 8-bit code units (4; for UTF-8) + 1 for \0 */ ! 1419: void *dummy; ! 1420: char *charset_hint = NULL; ! 1421: int charset_hint_len; ! 1422: enum entity_charset charset; ! 1423: ! 1424: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "|lls", ! 1425: &which, "e_style, &charset_hint, &charset_hint_len) == FAILURE) { ! 1426: return; ! 1427: } ! 1428: ! 1429: charset = determine_charset(charset_hint TSRMLS_CC); ! 1430: ! 1431: array_init(return_value); ! 1432: ! 1433: switch (which) { ! 1434: case HTML_ENTITIES: ! 1435: for (j = 0; entity_map[j].charset != cs_terminator; j++) { ! 1436: if (entity_map[j].charset != charset) ! 1437: continue; ! 1438: for (i = 0; i <= entity_map[j].endchar - entity_map[j].basechar; i++) { ! 1439: char buffer[16]; ! 1440: unsigned k; ! 1441: size_t written; ! 1442: ! 1443: if (entity_map[j].table[i] == NULL) ! 1444: continue; ! 1445: ! 1446: k = i + entity_map[j].basechar; ! 1447: ! 1448: switch (charset) { ! 1449: case cs_utf_8: ! 1450: written = php_utf32_utf8(ind, k); ! 1451: ind[written] = '\0'; ! 1452: break; ! 1453: case cs_big5: ! 1454: case cs_gb2312: ! 1455: case cs_big5hkscs: ! 1456: case cs_sjis: ! 1457: /* we have no mappings for these, but if we had... */ ! 1458: /* break through */ ! 1459: default: /* one byte */ ! 1460: written = 1; ! 1461: ind[0] = (unsigned char)k; ! 1462: ind[1] = '\0'; ! 1463: break; ! 1464: } ! 1465: ! 1466: snprintf(buffer, sizeof(buffer), "&%s;", entity_map[j].table[i]); ! 1467: if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, written+1, &dummy) == FAILURE) { ! 1468: /* in case of the single quote, which is repeated, the first one wins, ! 1469: * so don't replace the existint mapping */ ! 1470: add_assoc_string(return_value, (const char*)ind, buffer, 1); ! 1471: } ! 1472: } ! 1473: } ! 1474: /* break thru */ ! 1475: ! 1476: case HTML_SPECIALCHARS: ! 1477: add_assoc_stringl(return_value, "&", "&", sizeof("&") - 1, 1); ! 1478: for (j = 0; basic_entities[j].charcode != 0; j++) { ! 1479: if (basic_entities[j].flags && (quote_style & basic_entities[j].flags) == 0) ! 1480: continue; ! 1481: ! 1482: ind[0] = (unsigned char)basic_entities[j].charcode; ! 1483: ind[1] = '\0'; ! 1484: if (zend_hash_find(Z_ARRVAL_P(return_value), (const char*)ind, 2, &dummy) == FAILURE) { ! 1485: add_assoc_stringl(return_value, ind, basic_entities[j].entity, ! 1486: basic_entities[j].entitylen, 1); ! 1487: } ! 1488: } ! 1489: ! 1490: break; ! 1491: } ! 1492: } ! 1493: /* }}} */ ! 1494: ! 1495: /* ! 1496: * Local variables: ! 1497: * tab-width: 4 ! 1498: * c-basic-offset: 4 ! 1499: * End: ! 1500: * vim600: sw=4 ts=4 fdm=marker ! 1501: * vim<600: sw=4 ts=4 ! 1502: */