Annotation of embedaddon/php/ext/fileinfo/libmagic/ascmagic.c, revision 1.1.1.1
1.1 misho 1: /*
2: * Copyright (c) Ian F. Darwin 1986-1995.
3: * Software written by Ian F. Darwin and others;
4: * maintained 1995-present by Christos Zoulas and others.
5: *
6: * Redistribution and use in source and binary forms, with or without
7: * modification, are permitted provided that the following conditions
8: * are met:
9: * 1. Redistributions of source code must retain the above copyright
10: * notice immediately at the beginning of the file, without modification,
11: * this list of conditions, and the following disclaimer.
12: * 2. Redistributions in binary form must reproduce the above copyright
13: * notice, this list of conditions and the following disclaimer in the
14: * documentation and/or other materials provided with the distribution.
15: *
16: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20: * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26: * SUCH DAMAGE.
27: */
28: /*
29: * ASCII magic -- file types that we know based on keywords
30: * that can appear anywhere in the file.
31: *
32: * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
33: * to handle character codes other than ASCII on a unified basis.
34: */
35:
36: #include "file.h"
37:
38: #ifndef lint
39: FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
40: #endif /* lint */
41:
42: #include "magic.h"
43: #include <string.h>
44: #include <memory.h>
45: #include <ctype.h>
46: #include <stdlib.h>
47: #ifdef HAVE_UNISTD_H
48: #include <unistd.h>
49: #endif
50: #include "names.h"
51:
52: #define MAXLINELEN 300 /* longest sane line length */
53: #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
54: || (x) == 0x85 || (x) == '\f')
55:
56: private int ascmatch(const unsigned char *, const unichar *, size_t);
57: private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
58: private size_t trim_nuls(const unsigned char *, size_t);
59:
60: /*
61: * Undo the NUL-termination kindly provided by process()
62: * but leave at least one byte to look at
63: */
64: private size_t
65: trim_nuls(const unsigned char *buf, size_t nbytes)
66: {
67: while (nbytes > 1 && buf[nbytes - 1] == '\0')
68: nbytes--;
69:
70: return nbytes;
71: }
72:
73: protected int
74: file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
75: {
76: unichar *ubuf = NULL;
77: size_t ulen;
78: int rv = 1;
79:
80: const char *code = NULL;
81: const char *code_mime = NULL;
82: const char *type = NULL;
83:
84: if (ms->flags & MAGIC_APPLE)
85: return 0;
86:
87: nbytes = trim_nuls(buf, nbytes);
88:
89: /* If file doesn't look like any sort of text, give up. */
90: if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
91: &type) == 0) {
92: rv = 0;
93: goto done;
94: }
95:
96: rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
97: type);
98:
99: done:
100: if (ubuf)
101: free(ubuf);
102:
103: return rv;
104: }
105:
106: protected int
107: file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
108: size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
109: const char *type)
110: {
111: unsigned char *utf8_buf = NULL, *utf8_end;
112: size_t mlen, i;
113: const struct names *p;
114: int rv = -1;
115: int mime = ms->flags & MAGIC_MIME;
116:
117: const char *subtype = NULL;
118: const char *subtype_mime = NULL;
119:
120: int has_escapes = 0;
121: int has_backspace = 0;
122: int seen_cr = 0;
123:
124: int n_crlf = 0;
125: int n_lf = 0;
126: int n_cr = 0;
127: int n_nel = 0;
128:
129: size_t last_line_end = (size_t)-1;
130: int has_long_lines = 0;
131:
132: if (ms->flags & MAGIC_APPLE)
133: return 0;
134:
135: nbytes = trim_nuls(buf, nbytes);
136:
137: /* If we have fewer than 2 bytes, give up. */
138: if (nbytes <= 1) {
139: rv = 0;
140: goto done;
141: }
142:
143: /* Convert ubuf to UTF-8 and try text soft magic */
144: /* malloc size is a conservative overestimate; could be
145: improved, or at least realloced after conversion. */
146: mlen = ulen * 6;
147: utf8_buf = emalloc(mlen);
148:
149: if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
150: goto done;
151: if ((rv = file_softmagic(ms, utf8_buf, (size_t)(utf8_end - utf8_buf),
152: TEXTTEST)) != 0)
153: goto done;
154: else
155: rv = -1;
156:
157: /* look for tokens from names.h - this is expensive! */
158: if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
159: goto subtype_identified;
160:
161: i = 0;
162: while (i < ulen) {
163: size_t end;
164:
165: /* skip past any leading space */
166: while (i < ulen && ISSPC(ubuf[i]))
167: i++;
168: if (i >= ulen)
169: break;
170:
171: /* find the next whitespace */
172: for (end = i + 1; end < nbytes; end++)
173: if (ISSPC(ubuf[end]))
174: break;
175:
176: /* compare the word thus isolated against the token list */
177: for (p = names; p < names + NNAMES; p++) {
178: if (ascmatch((const unsigned char *)p->name, ubuf + i,
179: end - i)) {
180: subtype = types[p->type].human;
181: subtype_mime = types[p->type].mime;
182: goto subtype_identified;
183: }
184: }
185:
186: i = end;
187: }
188:
189: subtype_identified:
190:
191: /* Now try to discover other details about the file. */
192: for (i = 0; i < ulen; i++) {
193: if (ubuf[i] == '\n') {
194: if (seen_cr)
195: n_crlf++;
196: else
197: n_lf++;
198: last_line_end = i;
199: } else if (seen_cr)
200: n_cr++;
201:
202: seen_cr = (ubuf[i] == '\r');
203: if (seen_cr)
204: last_line_end = i;
205:
206: if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
207: n_nel++;
208: last_line_end = i;
209: }
210:
211: /* If this line is _longer_ than MAXLINELEN, remember it. */
212: if (i > last_line_end + MAXLINELEN)
213: has_long_lines = 1;
214:
215: if (ubuf[i] == '\033')
216: has_escapes = 1;
217: if (ubuf[i] == '\b')
218: has_backspace = 1;
219: }
220:
221: /* Beware, if the data has been truncated, the final CR could have
222: been followed by a LF. If we have HOWMANY bytes, it indicates
223: that the data might have been truncated, probably even before
224: this function was called. */
225: if (seen_cr && nbytes < HOWMANY)
226: n_cr++;
227:
228: if (strcmp(type, "binary") == 0) {
229: rv = 0;
230: goto done;
231: }
232: if (mime) {
233: if ((mime & MAGIC_MIME_TYPE) != 0) {
234: if (subtype_mime) {
235: if (file_printf(ms, "%s", subtype_mime) == -1)
236: goto done;
237: } else {
238: if (file_printf(ms, "text/plain") == -1)
239: goto done;
240: }
241: }
242: } else {
243: if (file_printf(ms, "%s", code) == -1)
244: goto done;
245:
246: if (subtype) {
247: if (file_printf(ms, " %s", subtype) == -1)
248: goto done;
249: }
250:
251: if (file_printf(ms, " %s", type) == -1)
252: goto done;
253:
254: if (has_long_lines)
255: if (file_printf(ms, ", with very long lines") == -1)
256: goto done;
257:
258: /*
259: * Only report line terminators if we find one other than LF,
260: * or if we find none at all.
261: */
262: if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
263: (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
264: if (file_printf(ms, ", with") == -1)
265: goto done;
266:
267: if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
268: if (file_printf(ms, " no") == -1)
269: goto done;
270: } else {
271: if (n_crlf) {
272: if (file_printf(ms, " CRLF") == -1)
273: goto done;
274: if (n_cr || n_lf || n_nel)
275: if (file_printf(ms, ",") == -1)
276: goto done;
277: }
278: if (n_cr) {
279: if (file_printf(ms, " CR") == -1)
280: goto done;
281: if (n_lf || n_nel)
282: if (file_printf(ms, ",") == -1)
283: goto done;
284: }
285: if (n_lf) {
286: if (file_printf(ms, " LF") == -1)
287: goto done;
288: if (n_nel)
289: if (file_printf(ms, ",") == -1)
290: goto done;
291: }
292: if (n_nel)
293: if (file_printf(ms, " NEL") == -1)
294: goto done;
295: }
296:
297: if (file_printf(ms, " line terminators") == -1)
298: goto done;
299: }
300:
301: if (has_escapes)
302: if (file_printf(ms, ", with escape sequences") == -1)
303: goto done;
304: if (has_backspace)
305: if (file_printf(ms, ", with overstriking") == -1)
306: goto done;
307: }
308: rv = 1;
309: done:
310: if (utf8_buf)
311: efree(utf8_buf);
312:
313: return rv;
314: }
315:
316: private int
317: ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
318: {
319: size_t i;
320:
321: for (i = 0; i < ulen; i++) {
322: if (s[i] != us[i])
323: return 0;
324: }
325:
326: if (s[i])
327: return 0;
328: else
329: return 1;
330: }
331:
332: /*
333: * Encode Unicode string as UTF-8, returning pointer to character
334: * after end of string, or NULL if an invalid character is found.
335: */
336: private unsigned char *
337: encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
338: {
339: size_t i;
340: unsigned char *end = buf + len;
341:
342: for (i = 0; i < ulen; i++) {
343: if (ubuf[i] <= 0x7f) {
344: if (end - buf < 1)
345: return NULL;
346: *buf++ = (unsigned char)ubuf[i];
347: } else if (ubuf[i] <= 0x7ff) {
348: if (end - buf < 2)
349: return NULL;
350: *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
351: *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
352: } else if (ubuf[i] <= 0xffff) {
353: if (end - buf < 3)
354: return NULL;
355: *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
356: *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
357: *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
358: } else if (ubuf[i] <= 0x1fffff) {
359: if (end - buf < 4)
360: return NULL;
361: *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
362: *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
363: *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
364: *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
365: } else if (ubuf[i] <= 0x3ffffff) {
366: if (end - buf < 5)
367: return NULL;
368: *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
369: *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
370: *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
371: *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
372: *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
373: } else if (ubuf[i] <= 0x7fffffff) {
374: if (end - buf < 6)
375: return NULL;
376: *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
377: *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
378: *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
379: *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
380: *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
381: *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
382: } else /* Invalid character */
383: return NULL;
384: }
385:
386: return buf;
387: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>