Annotation of embedaddon/tmux/utf8.c, revision 1.1.1.1
1.1 misho 1: /* $OpenBSD$ */
2:
3: /*
4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
21: #include <errno.h>
22: #include <stdlib.h>
23: #include <string.h>
24: #include <wchar.h>
25:
26: #include "tmux.h"
27:
28: static int utf8_width(wchar_t);
29:
30: /* Set a single character. */
31: void
32: utf8_set(struct utf8_data *ud, u_char ch)
33: {
34: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
35:
36: memcpy(ud, &empty, sizeof *ud);
37: *ud->data = ch;
38: }
39:
40: /* Copy UTF-8 character. */
41: void
42: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
43: {
44: u_int i;
45:
46: memcpy(to, from, sizeof *to);
47:
48: for (i = to->size; i < sizeof to->data; i++)
49: to->data[i] = '\0';
50: }
51:
52: /*
53: * Open UTF-8 sequence.
54: *
55: * 11000010-11011111 C2-DF start of 2-byte sequence
56: * 11100000-11101111 E0-EF start of 3-byte sequence
57: * 11110000-11110100 F0-F4 start of 4-byte sequence
58: */
59: enum utf8_state
60: utf8_open(struct utf8_data *ud, u_char ch)
61: {
62: memset(ud, 0, sizeof *ud);
63: if (ch >= 0xc2 && ch <= 0xdf)
64: ud->size = 2;
65: else if (ch >= 0xe0 && ch <= 0xef)
66: ud->size = 3;
67: else if (ch >= 0xf0 && ch <= 0xf4)
68: ud->size = 4;
69: else
70: return (UTF8_ERROR);
71: utf8_append(ud, ch);
72: return (UTF8_MORE);
73: }
74:
75: /* Append character to UTF-8, closing if finished. */
76: enum utf8_state
77: utf8_append(struct utf8_data *ud, u_char ch)
78: {
79: wchar_t wc;
80: int width;
81:
82: if (ud->have >= ud->size)
83: fatalx("UTF-8 character overflow");
84: if (ud->size > sizeof ud->data)
85: fatalx("UTF-8 character size too large");
86:
87: if (ud->have != 0 && (ch & 0xc0) != 0x80)
88: ud->width = 0xff;
89:
90: ud->data[ud->have++] = ch;
91: if (ud->have != ud->size)
92: return (UTF8_MORE);
93:
94: if (ud->width == 0xff)
95: return (UTF8_ERROR);
96:
97: if (utf8_combine(ud, &wc) != UTF8_DONE)
98: return (UTF8_ERROR);
99: if ((width = utf8_width(wc)) < 0)
100: return (UTF8_ERROR);
101: ud->width = width;
102:
103: return (UTF8_DONE);
104: }
105:
/*
 * Return the display width of wc, or -1 if no usable width is available.
 *
 * The width comes from utf8proc_wcwidth() when built with HAVE_UTF8PROC and
 * from wcwidth() otherwise. Only values in the range 0-0xff are accepted as
 * they are; anything else is logged and handled below.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells;

#ifdef HAVE_UTF8PROC
	cells = utf8proc_wcwidth(wc);
#else
	cells = wcwidth(wc);
#endif
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);

#ifndef __OpenBSD__
	/*
	 * Many platforms (particularly and inevitably OS X) have no width
	 * for relatively common characters (wcwidth() returns -1); assume
	 * width 1 in this case. This will be wrong for genuinely
	 * nonprintable characters, but they should be rare. We may pass
	 * through stuff that ideally we would block, but this is no worse
	 * than sending the same to the terminal without tmux.
	 */
	if (cells < 0)
		return (1);
#endif
	return (-1);
}
137:
138: /* Combine UTF-8 into Unicode. */
139: enum utf8_state
140: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
141: {
142: #ifdef HAVE_UTF8PROC
143: switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
144: #else
145: switch (mbtowc(wc, ud->data, ud->size)) {
146: #endif
147: case -1:
148: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
149: errno);
150: mbtowc(NULL, NULL, MB_CUR_MAX);
151: return (UTF8_ERROR);
152: case 0:
153: return (UTF8_ERROR);
154: default:
155: return (UTF8_DONE);
156: }
157: }
158:
159: /* Split Unicode into UTF-8. */
160: enum utf8_state
161: utf8_split(wchar_t wc, struct utf8_data *ud)
162: {
163: char s[MB_LEN_MAX];
164: int slen;
165:
166: #ifdef HAVE_UTF8PROC
167: slen = utf8proc_wctomb(s, wc);
168: #else
169: slen = wctomb(s, wc);
170: #endif
171: if (slen <= 0 || slen > (int)sizeof ud->data)
172: return (UTF8_ERROR);
173:
174: memcpy(ud->data, s, slen);
175: ud->size = slen;
176:
177: ud->width = utf8_width(wc);
178: return (UTF8_DONE);
179: }
180:
181: /*
182: * Encode len characters from src into dst, which is guaranteed to have four
183: * bytes available for each character from src (for \abc or UTF-8) plus space
184: * for \0.
185: */
186: int
187: utf8_strvis(char *dst, const char *src, size_t len, int flag)
188: {
189: struct utf8_data ud;
190: const char *start, *end;
191: enum utf8_state more;
192: size_t i;
193:
194: start = dst;
195: end = src + len;
196:
197: while (src < end) {
198: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
199: while (++src < end && more == UTF8_MORE)
200: more = utf8_append(&ud, *src);
201: if (more == UTF8_DONE) {
202: /* UTF-8 character finished. */
203: for (i = 0; i < ud.size; i++)
204: *dst++ = ud.data[i];
205: continue;
206: }
207: /* Not a complete, valid UTF-8 character. */
208: src -= ud.have;
209: }
210: if (src < end - 1)
211: dst = vis(dst, src[0], flag, src[1]);
212: else if (src < end)
213: dst = vis(dst, src[0], flag, '\0');
214: src++;
215: }
216:
217: *dst = '\0';
218: return (dst - start);
219: }
220:
/*
 * Same as utf8_strvis() but allocate the output buffer. The buffer is sized
 * for the worst case (four bytes per input byte), filled, then shrunk to
 * fit. The result is stored in *dst and its length (without the terminating
 * \0) is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*tmp;
	int	 used;

	tmp = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(tmp, src, srclen, flag);

	*dst = xrealloc(tmp, used + 1);
	return (used);
}
234:
235: /*
236: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
237: * the returned string. Anything not valid printable ASCII or UTF-8 is
238: * stripped.
239: */
240: char *
241: utf8_sanitize(const char *src)
242: {
243: char *dst;
244: size_t n;
245: enum utf8_state more;
246: struct utf8_data ud;
247: u_int i;
248:
249: dst = NULL;
250:
251: n = 0;
252: while (*src != '\0') {
253: dst = xreallocarray(dst, n + 1, sizeof *dst);
254: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
255: while (*++src != '\0' && more == UTF8_MORE)
256: more = utf8_append(&ud, *src);
257: if (more == UTF8_DONE) {
258: dst = xreallocarray(dst, n + ud.width,
259: sizeof *dst);
260: for (i = 0; i < ud.width; i++)
261: dst[n++] = '_';
262: continue;
263: }
264: src -= ud.have;
265: }
266: if (*src > 0x1f && *src < 0x7f)
267: dst[n++] = *src;
268: else
269: dst[n++] = '_';
270: src++;
271: }
272:
273: dst = xreallocarray(dst, n + 1, sizeof *dst);
274: dst[n] = '\0';
275: return (dst);
276: }
277:
278: /* Get UTF-8 buffer length. */
279: size_t
280: utf8_strlen(const struct utf8_data *s)
281: {
282: size_t i;
283:
284: for (i = 0; s[i].size != 0; i++)
285: /* nothing */;
286: return (i);
287: }
288:
289: /* Get UTF-8 string width. */
290: u_int
291: utf8_strwidth(const struct utf8_data *s, ssize_t n)
292: {
293: ssize_t i;
294: u_int width;
295:
296: width = 0;
297: for (i = 0; s[i].size != 0; i++) {
298: if (n != -1 && n == i)
299: break;
300: width += s[i].width;
301: }
302: return (width);
303: }
304:
305: /*
306: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
307: * Caller frees.
308: */
309: struct utf8_data *
310: utf8_fromcstr(const char *src)
311: {
312: struct utf8_data *dst;
313: size_t n;
314: enum utf8_state more;
315:
316: dst = NULL;
317:
318: n = 0;
319: while (*src != '\0') {
320: dst = xreallocarray(dst, n + 1, sizeof *dst);
321: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
322: while (*++src != '\0' && more == UTF8_MORE)
323: more = utf8_append(&dst[n], *src);
324: if (more == UTF8_DONE) {
325: n++;
326: continue;
327: }
328: src -= dst[n].have;
329: }
330: utf8_set(&dst[n], *src);
331: n++;
332: src++;
333: }
334:
335: dst = xreallocarray(dst, n + 1, sizeof *dst);
336: dst[n].size = 0;
337: return (dst);
338: }
339:
340: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
341: char *
342: utf8_tocstr(struct utf8_data *src)
343: {
344: char *dst;
345: size_t n;
346:
347: dst = NULL;
348:
349: n = 0;
350: for(; src->size != 0; src++) {
351: dst = xreallocarray(dst, n + src->size, 1);
352: memcpy(dst + n, src->data, src->size);
353: n += src->size;
354: }
355:
356: dst = xreallocarray(dst, n + 1, 1);
357: dst[n] = '\0';
358: return (dst);
359: }
360:
361: /* Get width of UTF-8 string. */
362: u_int
363: utf8_cstrwidth(const char *s)
364: {
365: struct utf8_data tmp;
366: u_int width;
367: enum utf8_state more;
368:
369: width = 0;
370: while (*s != '\0') {
371: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
372: while (*++s != '\0' && more == UTF8_MORE)
373: more = utf8_append(&tmp, *s);
374: if (more == UTF8_DONE) {
375: width += tmp.width;
376: continue;
377: }
378: s -= tmp.have;
379: }
380: if (*s > 0x1f && *s != 0x7f)
381: width++;
382: s++;
383: }
384: return (width);
385: }
386:
387: /* Trim UTF-8 string to width. Caller frees. */
388: char *
389: utf8_trimcstr(const char *s, u_int width)
390: {
391: struct utf8_data *tmp, *next;
392: char *out;
393: u_int at;
394:
395: tmp = utf8_fromcstr(s);
396:
397: at = 0;
398: for (next = tmp; next->size != 0; next++) {
399: if (at + next->width > width) {
400: next->size = 0;
401: break;
402: }
403: at += next->width;
404: }
405:
406: out = utf8_tocstr(tmp);
407: free(tmp);
408: return (out);
409: }
410:
411: /* Trim UTF-8 string to width. Caller frees. */
412: char *
413: utf8_rtrimcstr(const char *s, u_int width)
414: {
415: struct utf8_data *tmp, *next, *end;
416: char *out;
417: u_int at;
418:
419: tmp = utf8_fromcstr(s);
420:
421: for (end = tmp; end->size != 0; end++)
422: /* nothing */;
423: if (end == tmp) {
424: free(tmp);
425: return (xstrdup(""));
426: }
427: next = end - 1;
428:
429: at = 0;
430: for (;;)
431: {
432: if (at + next->width > width) {
433: next++;
434: break;
435: }
436: at += next->width;
437:
438: if (next == tmp)
439: break;
440: next--;
441: }
442:
443: out = utf8_tocstr(next);
444: free(tmp);
445: return (out);
446: }
447:
448: /* Pad UTF-8 string to width. Caller frees. */
449: char *
450: utf8_padcstr(const char *s, u_int width)
451: {
452: size_t slen;
453: char *out;
454: u_int n, i;
455:
456: n = utf8_cstrwidth(s);
457: if (n >= width)
458: return (xstrdup(s));
459:
460: slen = strlen(s);
461: out = xmalloc(slen + 1 + (width - n));
462: memcpy(out, s, slen);
463: for (i = n; i < width; i++)
464: out[slen++] = ' ';
465: out[slen] = '\0';
466: return (out);
467: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>