Annotation of embedaddon/tmux/utf8.c, revision 1.1
1.1 ! misho 1: /* $OpenBSD$ */
! 2:
! 3: /*
! 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
! 5: *
! 6: * Permission to use, copy, modify, and distribute this software for any
! 7: * purpose with or without fee is hereby granted, provided that the above
! 8: * copyright notice and this permission notice appear in all copies.
! 9: *
! 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
! 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
! 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
! 15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
! 16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 17: */
! 18:
! 19: #include <sys/types.h>
! 20:
! 21: #include <errno.h>
! 22: #include <stdlib.h>
! 23: #include <string.h>
! 24: #include <wchar.h>
! 25:
! 26: #include "tmux.h"
! 27:
! 28: static int utf8_width(wchar_t);
! 29:
! 30: /* Set a single character. */
! 31: void
! 32: utf8_set(struct utf8_data *ud, u_char ch)
! 33: {
! 34: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
! 35:
! 36: memcpy(ud, &empty, sizeof *ud);
! 37: *ud->data = ch;
! 38: }
! 39:
! 40: /* Copy UTF-8 character. */
! 41: void
! 42: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
! 43: {
! 44: u_int i;
! 45:
! 46: memcpy(to, from, sizeof *to);
! 47:
! 48: for (i = to->size; i < sizeof to->data; i++)
! 49: to->data[i] = '\0';
! 50: }
! 51:
! 52: /*
! 53: * Open UTF-8 sequence.
! 54: *
! 55: * 11000010-11011111 C2-DF start of 2-byte sequence
! 56: * 11100000-11101111 E0-EF start of 3-byte sequence
! 57: * 11110000-11110100 F0-F4 start of 4-byte sequence
! 58: */
! 59: enum utf8_state
! 60: utf8_open(struct utf8_data *ud, u_char ch)
! 61: {
! 62: memset(ud, 0, sizeof *ud);
! 63: if (ch >= 0xc2 && ch <= 0xdf)
! 64: ud->size = 2;
! 65: else if (ch >= 0xe0 && ch <= 0xef)
! 66: ud->size = 3;
! 67: else if (ch >= 0xf0 && ch <= 0xf4)
! 68: ud->size = 4;
! 69: else
! 70: return (UTF8_ERROR);
! 71: utf8_append(ud, ch);
! 72: return (UTF8_MORE);
! 73: }
! 74:
! 75: /* Append character to UTF-8, closing if finished. */
! 76: enum utf8_state
! 77: utf8_append(struct utf8_data *ud, u_char ch)
! 78: {
! 79: wchar_t wc;
! 80: int width;
! 81:
! 82: if (ud->have >= ud->size)
! 83: fatalx("UTF-8 character overflow");
! 84: if (ud->size > sizeof ud->data)
! 85: fatalx("UTF-8 character size too large");
! 86:
! 87: if (ud->have != 0 && (ch & 0xc0) != 0x80)
! 88: ud->width = 0xff;
! 89:
! 90: ud->data[ud->have++] = ch;
! 91: if (ud->have != ud->size)
! 92: return (UTF8_MORE);
! 93:
! 94: if (ud->width == 0xff)
! 95: return (UTF8_ERROR);
! 96:
! 97: if (utf8_combine(ud, &wc) != UTF8_DONE)
! 98: return (UTF8_ERROR);
! 99: if ((width = utf8_width(wc)) < 0)
! 100: return (UTF8_ERROR);
! 101: ud->width = width;
! 102:
! 103: return (UTF8_DONE);
! 104: }
! 105:
/*
 * Return the display width of wc as reported by utf8proc_wcwidth() (when
 * HAVE_UTF8PROC is defined) or wcwidth(). Widths outside 0..0xff are logged
 * and reported as -1, except as described below for negative widths.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells;

#ifdef HAVE_UTF8PROC
	cells = utf8proc_wcwidth(wc);
#else
	cells = wcwidth(wc);
#endif
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);

#ifndef __OpenBSD__
	/*
	 * Outside OpenBSD, wcwidth() gives -1 for a number of fairly common
	 * characters (OS X being a frequent offender). Treat those as width 1.
	 * That is wrong for characters that really are nonprintable, but they
	 * should be rare, and letting them through is no worse than what the
	 * terminal would receive without tmux in between.
	 */
	if (cells < 0)
		return (1);
#endif
	return (-1);
}
! 137:
! 138: /* Combine UTF-8 into Unicode. */
! 139: enum utf8_state
! 140: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
! 141: {
! 142: #ifdef HAVE_UTF8PROC
! 143: switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
! 144: #else
! 145: switch (mbtowc(wc, ud->data, ud->size)) {
! 146: #endif
! 147: case -1:
! 148: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
! 149: errno);
! 150: mbtowc(NULL, NULL, MB_CUR_MAX);
! 151: return (UTF8_ERROR);
! 152: case 0:
! 153: return (UTF8_ERROR);
! 154: default:
! 155: return (UTF8_DONE);
! 156: }
! 157: }
! 158:
! 159: /* Split Unicode into UTF-8. */
! 160: enum utf8_state
! 161: utf8_split(wchar_t wc, struct utf8_data *ud)
! 162: {
! 163: char s[MB_LEN_MAX];
! 164: int slen;
! 165:
! 166: #ifdef HAVE_UTF8PROC
! 167: slen = utf8proc_wctomb(s, wc);
! 168: #else
! 169: slen = wctomb(s, wc);
! 170: #endif
! 171: if (slen <= 0 || slen > (int)sizeof ud->data)
! 172: return (UTF8_ERROR);
! 173:
! 174: memcpy(ud->data, s, slen);
! 175: ud->size = slen;
! 176:
! 177: ud->width = utf8_width(wc);
! 178: return (UTF8_DONE);
! 179: }
! 180:
! 181: /*
! 182: * Encode len characters from src into dst, which is guaranteed to have four
! 183: * bytes available for each character from src (for \abc or UTF-8) plus space
! 184: * for \0.
! 185: */
! 186: int
! 187: utf8_strvis(char *dst, const char *src, size_t len, int flag)
! 188: {
! 189: struct utf8_data ud;
! 190: const char *start, *end;
! 191: enum utf8_state more;
! 192: size_t i;
! 193:
! 194: start = dst;
! 195: end = src + len;
! 196:
! 197: while (src < end) {
! 198: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
! 199: while (++src < end && more == UTF8_MORE)
! 200: more = utf8_append(&ud, *src);
! 201: if (more == UTF8_DONE) {
! 202: /* UTF-8 character finished. */
! 203: for (i = 0; i < ud.size; i++)
! 204: *dst++ = ud.data[i];
! 205: continue;
! 206: }
! 207: /* Not a complete, valid UTF-8 character. */
! 208: src -= ud.have;
! 209: }
! 210: if (src < end - 1)
! 211: dst = vis(dst, src[0], flag, src[1]);
! 212: else if (src < end)
! 213: dst = vis(dst, src[0], flag, '\0');
! 214: src++;
! 215: }
! 216:
! 217: *dst = '\0';
! 218: return (dst - start);
! 219: }
! 220:
/*
 * Like utf8_strvis() but the output buffer is allocated here: four bytes per
 * input byte plus four for the terminator, then shrunk to fit. The buffer is
 * returned through *dst and the encoded length is the return value.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 n;

	out = xreallocarray(NULL, 4, srclen + 1);
	n = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, n + 1);
	return (n);
}
! 234:
! 235: /*
! 236: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
! 237: * the returned string. Anything not valid printable ASCII or UTF-8 is
! 238: * stripped.
! 239: */
! 240: char *
! 241: utf8_sanitize(const char *src)
! 242: {
! 243: char *dst;
! 244: size_t n;
! 245: enum utf8_state more;
! 246: struct utf8_data ud;
! 247: u_int i;
! 248:
! 249: dst = NULL;
! 250:
! 251: n = 0;
! 252: while (*src != '\0') {
! 253: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 254: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
! 255: while (*++src != '\0' && more == UTF8_MORE)
! 256: more = utf8_append(&ud, *src);
! 257: if (more == UTF8_DONE) {
! 258: dst = xreallocarray(dst, n + ud.width,
! 259: sizeof *dst);
! 260: for (i = 0; i < ud.width; i++)
! 261: dst[n++] = '_';
! 262: continue;
! 263: }
! 264: src -= ud.have;
! 265: }
! 266: if (*src > 0x1f && *src < 0x7f)
! 267: dst[n++] = *src;
! 268: else
! 269: dst[n++] = '_';
! 270: src++;
! 271: }
! 272:
! 273: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 274: dst[n] = '\0';
! 275: return (dst);
! 276: }
! 277:
! 278: /* Get UTF-8 buffer length. */
! 279: size_t
! 280: utf8_strlen(const struct utf8_data *s)
! 281: {
! 282: size_t i;
! 283:
! 284: for (i = 0; s[i].size != 0; i++)
! 285: /* nothing */;
! 286: return (i);
! 287: }
! 288:
! 289: /* Get UTF-8 string width. */
! 290: u_int
! 291: utf8_strwidth(const struct utf8_data *s, ssize_t n)
! 292: {
! 293: ssize_t i;
! 294: u_int width;
! 295:
! 296: width = 0;
! 297: for (i = 0; s[i].size != 0; i++) {
! 298: if (n != -1 && n == i)
! 299: break;
! 300: width += s[i].width;
! 301: }
! 302: return (width);
! 303: }
! 304:
! 305: /*
! 306: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
! 307: * Caller frees.
! 308: */
! 309: struct utf8_data *
! 310: utf8_fromcstr(const char *src)
! 311: {
! 312: struct utf8_data *dst;
! 313: size_t n;
! 314: enum utf8_state more;
! 315:
! 316: dst = NULL;
! 317:
! 318: n = 0;
! 319: while (*src != '\0') {
! 320: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 321: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
! 322: while (*++src != '\0' && more == UTF8_MORE)
! 323: more = utf8_append(&dst[n], *src);
! 324: if (more == UTF8_DONE) {
! 325: n++;
! 326: continue;
! 327: }
! 328: src -= dst[n].have;
! 329: }
! 330: utf8_set(&dst[n], *src);
! 331: n++;
! 332: src++;
! 333: }
! 334:
! 335: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 336: dst[n].size = 0;
! 337: return (dst);
! 338: }
! 339:
! 340: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
! 341: char *
! 342: utf8_tocstr(struct utf8_data *src)
! 343: {
! 344: char *dst;
! 345: size_t n;
! 346:
! 347: dst = NULL;
! 348:
! 349: n = 0;
! 350: for(; src->size != 0; src++) {
! 351: dst = xreallocarray(dst, n + src->size, 1);
! 352: memcpy(dst + n, src->data, src->size);
! 353: n += src->size;
! 354: }
! 355:
! 356: dst = xreallocarray(dst, n + 1, 1);
! 357: dst[n] = '\0';
! 358: return (dst);
! 359: }
! 360:
! 361: /* Get width of UTF-8 string. */
! 362: u_int
! 363: utf8_cstrwidth(const char *s)
! 364: {
! 365: struct utf8_data tmp;
! 366: u_int width;
! 367: enum utf8_state more;
! 368:
! 369: width = 0;
! 370: while (*s != '\0') {
! 371: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
! 372: while (*++s != '\0' && more == UTF8_MORE)
! 373: more = utf8_append(&tmp, *s);
! 374: if (more == UTF8_DONE) {
! 375: width += tmp.width;
! 376: continue;
! 377: }
! 378: s -= tmp.have;
! 379: }
! 380: if (*s > 0x1f && *s != 0x7f)
! 381: width++;
! 382: s++;
! 383: }
! 384: return (width);
! 385: }
! 386:
! 387: /* Trim UTF-8 string to width. Caller frees. */
! 388: char *
! 389: utf8_trimcstr(const char *s, u_int width)
! 390: {
! 391: struct utf8_data *tmp, *next;
! 392: char *out;
! 393: u_int at;
! 394:
! 395: tmp = utf8_fromcstr(s);
! 396:
! 397: at = 0;
! 398: for (next = tmp; next->size != 0; next++) {
! 399: if (at + next->width > width) {
! 400: next->size = 0;
! 401: break;
! 402: }
! 403: at += next->width;
! 404: }
! 405:
! 406: out = utf8_tocstr(tmp);
! 407: free(tmp);
! 408: return (out);
! 409: }
! 410:
! 411: /* Trim UTF-8 string to width. Caller frees. */
! 412: char *
! 413: utf8_rtrimcstr(const char *s, u_int width)
! 414: {
! 415: struct utf8_data *tmp, *next, *end;
! 416: char *out;
! 417: u_int at;
! 418:
! 419: tmp = utf8_fromcstr(s);
! 420:
! 421: for (end = tmp; end->size != 0; end++)
! 422: /* nothing */;
! 423: if (end == tmp) {
! 424: free(tmp);
! 425: return (xstrdup(""));
! 426: }
! 427: next = end - 1;
! 428:
! 429: at = 0;
! 430: for (;;)
! 431: {
! 432: if (at + next->width > width) {
! 433: next++;
! 434: break;
! 435: }
! 436: at += next->width;
! 437:
! 438: if (next == tmp)
! 439: break;
! 440: next--;
! 441: }
! 442:
! 443: out = utf8_tocstr(next);
! 444: free(tmp);
! 445: return (out);
! 446: }
! 447:
! 448: /* Pad UTF-8 string to width. Caller frees. */
! 449: char *
! 450: utf8_padcstr(const char *s, u_int width)
! 451: {
! 452: size_t slen;
! 453: char *out;
! 454: u_int n, i;
! 455:
! 456: n = utf8_cstrwidth(s);
! 457: if (n >= width)
! 458: return (xstrdup(s));
! 459:
! 460: slen = strlen(s);
! 461: out = xmalloc(slen + 1 + (width - n));
! 462: memcpy(out, s, slen);
! 463: for (i = n; i < width; i++)
! 464: out[slen++] = ' ';
! 465: out[slen] = '\0';
! 466: return (out);
! 467: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>