Annotation of embedaddon/php/ext/fileinfo/libmagic/ascmagic.c, revision 1.1
1.1 ! misho 1: /*
! 2: * Copyright (c) Ian F. Darwin 1986-1995.
! 3: * Software written by Ian F. Darwin and others;
! 4: * maintained 1995-present by Christos Zoulas and others.
! 5: *
! 6: * Redistribution and use in source and binary forms, with or without
! 7: * modification, are permitted provided that the following conditions
! 8: * are met:
! 9: * 1. Redistributions of source code must retain the above copyright
! 10: * notice immediately at the beginning of the file, without modification,
! 11: * this list of conditions, and the following disclaimer.
! 12: * 2. Redistributions in binary form must reproduce the above copyright
! 13: * notice, this list of conditions and the following disclaimer in the
! 14: * documentation and/or other materials provided with the distribution.
! 15: *
! 16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
! 17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
! 20: * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 26: * SUCH DAMAGE.
! 27: */
! 28: /*
! 29: * ASCII magic -- file types that we know based on keywords
! 30: * that can appear anywhere in the file.
! 31: *
! 32: * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
! 33: * to handle character codes other than ASCII on a unified basis.
! 34: */
! 35:
! 36: #include "file.h"
! 37:
! 38: #ifndef lint
! 39: FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
! 40: #endif /* lint */
! 41:
! 42: #include "magic.h"
! 43: #include <string.h>
! 44: #include <memory.h>
! 45: #include <ctype.h>
! 46: #include <stdlib.h>
! 47: #ifdef HAVE_UNISTD_H
! 48: #include <unistd.h>
! 49: #endif
! 50: #include "names.h"
! 51:
! 52: #define MAXLINELEN 300 /* longest sane line length */
! 53: #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
! 54: || (x) == 0x85 || (x) == '\f')
! 55:
! 56: private int ascmatch(const unsigned char *, const unichar *, size_t);
! 57: private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
! 58: private size_t trim_nuls(const unsigned char *, size_t);
! 59:
/*
 * Strip trailing NUL bytes from the logical length of buf (the caller's
 * buffer arrives NUL-padded), but never shrink a non-empty buffer below
 * one byte so there is always something left to inspect.
 *
 * Returns the adjusted length; buf itself is not modified.
 */
private size_t
trim_nuls(const unsigned char *buf, size_t nbytes)
{
	size_t keep;

	for (keep = nbytes; keep > 1; keep--) {
		if (buf[keep - 1] != '\0')
			break;
	}

	return keep;
}
! 72:
! 73: protected int
! 74: file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
! 75: {
! 76: unichar *ubuf = NULL;
! 77: size_t ulen;
! 78: int rv = 1;
! 79:
! 80: const char *code = NULL;
! 81: const char *code_mime = NULL;
! 82: const char *type = NULL;
! 83:
! 84: if (ms->flags & MAGIC_APPLE)
! 85: return 0;
! 86:
! 87: nbytes = trim_nuls(buf, nbytes);
! 88:
! 89: /* If file doesn't look like any sort of text, give up. */
! 90: if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
! 91: &type) == 0) {
! 92: rv = 0;
! 93: goto done;
! 94: }
! 95:
! 96: rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
! 97: type);
! 98:
! 99: done:
! 100: if (ubuf)
! 101: free(ubuf);
! 102:
! 103: return rv;
! 104: }
! 105:
! 106: protected int
! 107: file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
! 108: size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
! 109: const char *type)
! 110: {
! 111: unsigned char *utf8_buf = NULL, *utf8_end;
! 112: size_t mlen, i;
! 113: const struct names *p;
! 114: int rv = -1;
! 115: int mime = ms->flags & MAGIC_MIME;
! 116:
! 117: const char *subtype = NULL;
! 118: const char *subtype_mime = NULL;
! 119:
! 120: int has_escapes = 0;
! 121: int has_backspace = 0;
! 122: int seen_cr = 0;
! 123:
! 124: int n_crlf = 0;
! 125: int n_lf = 0;
! 126: int n_cr = 0;
! 127: int n_nel = 0;
! 128:
! 129: size_t last_line_end = (size_t)-1;
! 130: int has_long_lines = 0;
! 131:
! 132: if (ms->flags & MAGIC_APPLE)
! 133: return 0;
! 134:
! 135: nbytes = trim_nuls(buf, nbytes);
! 136:
! 137: /* If we have fewer than 2 bytes, give up. */
! 138: if (nbytes <= 1) {
! 139: rv = 0;
! 140: goto done;
! 141: }
! 142:
! 143: /* Convert ubuf to UTF-8 and try text soft magic */
! 144: /* malloc size is a conservative overestimate; could be
! 145: improved, or at least realloced after conversion. */
! 146: mlen = ulen * 6;
! 147: utf8_buf = emalloc(mlen);
! 148:
! 149: if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
! 150: goto done;
! 151: if ((rv = file_softmagic(ms, utf8_buf, (size_t)(utf8_end - utf8_buf),
! 152: TEXTTEST)) != 0)
! 153: goto done;
! 154: else
! 155: rv = -1;
! 156:
! 157: /* look for tokens from names.h - this is expensive! */
! 158: if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
! 159: goto subtype_identified;
! 160:
! 161: i = 0;
! 162: while (i < ulen) {
! 163: size_t end;
! 164:
! 165: /* skip past any leading space */
! 166: while (i < ulen && ISSPC(ubuf[i]))
! 167: i++;
! 168: if (i >= ulen)
! 169: break;
! 170:
! 171: /* find the next whitespace */
! 172: for (end = i + 1; end < nbytes; end++)
! 173: if (ISSPC(ubuf[end]))
! 174: break;
! 175:
! 176: /* compare the word thus isolated against the token list */
! 177: for (p = names; p < names + NNAMES; p++) {
! 178: if (ascmatch((const unsigned char *)p->name, ubuf + i,
! 179: end - i)) {
! 180: subtype = types[p->type].human;
! 181: subtype_mime = types[p->type].mime;
! 182: goto subtype_identified;
! 183: }
! 184: }
! 185:
! 186: i = end;
! 187: }
! 188:
! 189: subtype_identified:
! 190:
! 191: /* Now try to discover other details about the file. */
! 192: for (i = 0; i < ulen; i++) {
! 193: if (ubuf[i] == '\n') {
! 194: if (seen_cr)
! 195: n_crlf++;
! 196: else
! 197: n_lf++;
! 198: last_line_end = i;
! 199: } else if (seen_cr)
! 200: n_cr++;
! 201:
! 202: seen_cr = (ubuf[i] == '\r');
! 203: if (seen_cr)
! 204: last_line_end = i;
! 205:
! 206: if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
! 207: n_nel++;
! 208: last_line_end = i;
! 209: }
! 210:
! 211: /* If this line is _longer_ than MAXLINELEN, remember it. */
! 212: if (i > last_line_end + MAXLINELEN)
! 213: has_long_lines = 1;
! 214:
! 215: if (ubuf[i] == '\033')
! 216: has_escapes = 1;
! 217: if (ubuf[i] == '\b')
! 218: has_backspace = 1;
! 219: }
! 220:
! 221: /* Beware, if the data has been truncated, the final CR could have
! 222: been followed by a LF. If we have HOWMANY bytes, it indicates
! 223: that the data might have been truncated, probably even before
! 224: this function was called. */
! 225: if (seen_cr && nbytes < HOWMANY)
! 226: n_cr++;
! 227:
! 228: if (strcmp(type, "binary") == 0) {
! 229: rv = 0;
! 230: goto done;
! 231: }
! 232: if (mime) {
! 233: if ((mime & MAGIC_MIME_TYPE) != 0) {
! 234: if (subtype_mime) {
! 235: if (file_printf(ms, "%s", subtype_mime) == -1)
! 236: goto done;
! 237: } else {
! 238: if (file_printf(ms, "text/plain") == -1)
! 239: goto done;
! 240: }
! 241: }
! 242: } else {
! 243: if (file_printf(ms, "%s", code) == -1)
! 244: goto done;
! 245:
! 246: if (subtype) {
! 247: if (file_printf(ms, " %s", subtype) == -1)
! 248: goto done;
! 249: }
! 250:
! 251: if (file_printf(ms, " %s", type) == -1)
! 252: goto done;
! 253:
! 254: if (has_long_lines)
! 255: if (file_printf(ms, ", with very long lines") == -1)
! 256: goto done;
! 257:
! 258: /*
! 259: * Only report line terminators if we find one other than LF,
! 260: * or if we find none at all.
! 261: */
! 262: if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
! 263: (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
! 264: if (file_printf(ms, ", with") == -1)
! 265: goto done;
! 266:
! 267: if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
! 268: if (file_printf(ms, " no") == -1)
! 269: goto done;
! 270: } else {
! 271: if (n_crlf) {
! 272: if (file_printf(ms, " CRLF") == -1)
! 273: goto done;
! 274: if (n_cr || n_lf || n_nel)
! 275: if (file_printf(ms, ",") == -1)
! 276: goto done;
! 277: }
! 278: if (n_cr) {
! 279: if (file_printf(ms, " CR") == -1)
! 280: goto done;
! 281: if (n_lf || n_nel)
! 282: if (file_printf(ms, ",") == -1)
! 283: goto done;
! 284: }
! 285: if (n_lf) {
! 286: if (file_printf(ms, " LF") == -1)
! 287: goto done;
! 288: if (n_nel)
! 289: if (file_printf(ms, ",") == -1)
! 290: goto done;
! 291: }
! 292: if (n_nel)
! 293: if (file_printf(ms, " NEL") == -1)
! 294: goto done;
! 295: }
! 296:
! 297: if (file_printf(ms, " line terminators") == -1)
! 298: goto done;
! 299: }
! 300:
! 301: if (has_escapes)
! 302: if (file_printf(ms, ", with escape sequences") == -1)
! 303: goto done;
! 304: if (has_backspace)
! 305: if (file_printf(ms, ", with overstriking") == -1)
! 306: goto done;
! 307: }
! 308: rv = 1;
! 309: done:
! 310: if (utf8_buf)
! 311: efree(utf8_buf);
! 312:
! 313: return rv;
! 314: }
! 315:
/*
 * Compare the NUL-terminated byte string s against the ulen characters
 * starting at us.
 *
 * Returns 1 only when s is exactly ulen bytes long and every byte equals
 * the corresponding character of us; returns 0 otherwise.
 */
private int
ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
{
	size_t i;

	for (i = 0; i < ulen; i++) {
		/*
		 * Hitting s's terminator with characters still left in us
		 * means the lengths differ.  Checking it explicitly keeps a
		 * 0 value inside us from "matching" the terminator and
		 * letting the loop read past the end of s.
		 */
		if (s[i] == '\0' || s[i] != us[i])
			return 0;
	}

	/* All ulen characters matched; s must end here too. */
	return s[i] == '\0';
}
! 331:
/*
 * Write the ulen characters of ubuf into buf (capacity len bytes) as
 * UTF-8, using the 1- to 6-byte forms for values up to 0x7fffffff.
 *
 * Returns a pointer just past the last byte written, or NULL if a
 * character is larger than 0x7fffffff or the next sequence would not
 * fit in the remaining space.  Sequences already written before the
 * failure are left in buf.
 */
private unsigned char *
encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
{
	/* Lead-byte marker for a sequence of (index + 1) bytes. */
	static const unsigned char lead_mark[] = {
		0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	unsigned char *limit = buf + len;
	size_t idx;

	for (idx = 0; idx < ulen; idx++) {
		unichar c = ubuf[idx];
		int nout;
		int shift;

		/* How many bytes does this character need? */
		if (c <= 0x7f)
			nout = 1;
		else if (c <= 0x7ff)
			nout = 2;
		else if (c <= 0xffff)
			nout = 3;
		else if (c <= 0x1fffff)
			nout = 4;
		else if (c <= 0x3ffffff)
			nout = 5;
		else if (c <= 0x7fffffff)
			nout = 6;
		else
			return NULL;	/* Invalid character */

		if (limit - buf < nout)
			return NULL;

		/* Lead byte carries the top bits plus the length marker... */
		shift = 6 * (nout - 1);
		*buf++ = (unsigned char)((c >> shift) + lead_mark[nout - 1]);

		/* ...followed by 6 payload bits per continuation byte. */
		while (shift > 0) {
			shift -= 6;
			*buf++ = (unsigned char)(((c >> shift) & 0x3f) + 0x80);
		}
	}

	return buf;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>