embedaddon/tmux/utf8.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / tmux / utf8.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Jun 14 12:22:44 2017 UTC (7 years ago) by misho
Branches: tmux, MAIN
CVS tags: v2_4p0, v2_4, HEAD

tmux 2.4

1: /* $OpenBSD$ */ 2: 3: /* 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5: * 6: * Permission to use, copy, modify, and distribute this software for any 7: * purpose with or without fee is hereby granted, provided that the above 8: * copyright notice and this permission notice appear in all copies. 9: * 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17: */ 18: 19: #include <sys/types.h> 20: 21: #include <errno.h> 22: #include <stdlib.h> 23: #include <string.h> 24: #include <wchar.h> 25: 26: #include "tmux.h" 27: 28: static int utf8_width(wchar_t); 29: 30: /* Set a single character. */ 31: void 32: utf8_set(struct utf8_data *ud, u_char ch) 33: { 34: static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 35: 36: memcpy(ud, &empty, sizeof *ud); 37: *ud->data = ch; 38: } 39: 40: /* Copy UTF-8 character. */ 41: void 42: utf8_copy(struct utf8_data *to, const struct utf8_data *from) 43: { 44: u_int i; 45: 46: memcpy(to, from, sizeof *to); 47: 48: for (i = to->size; i < sizeof to->data; i++) 49: to->data[i] = '\0'; 50: } 51: 52: /* 53: * Open UTF-8 sequence. 54: * 55: * 11000010-11011111 C2-DF start of 2-byte sequence 56: * 11100000-11101111 E0-EF start of 3-byte sequence 57: * 11110000-11110100 F0-F4 start of 4-byte sequence 58: */ 59: enum utf8_state 60: utf8_open(struct utf8_data *ud, u_char ch) 61: { 62: memset(ud, 0, sizeof *ud); 63: if (ch >= 0xc2 && ch <= 0xdf) 64: ud->size = 2; 65: else if (ch >= 0xe0 && ch <= 0xef) 66: ud->size = 3; 67: else if (ch >= 0xf0 && ch <= 0xf4) 68: ud->size = 4; 69: else 70: return (UTF8_ERROR); 71: utf8_append(ud, ch); 72: return (UTF8_MORE); 73: } 74: 75: /* Append character to UTF-8, closing if finished. */ 76: enum utf8_state 77: utf8_append(struct utf8_data *ud, u_char ch) 78: { 79: wchar_t wc; 80: int width; 81: 82: if (ud->have >= ud->size) 83: fatalx("UTF-8 character overflow"); 84: if (ud->size > sizeof ud->data) 85: fatalx("UTF-8 character size too large"); 86: 87: if (ud->have != 0 && (ch & 0xc0) != 0x80) 88: ud->width = 0xff; 89: 90: ud->data[ud->have++] = ch; 91: if (ud->have != ud->size) 92: return (UTF8_MORE); 93: 94: if (ud->width == 0xff) 95: return (UTF8_ERROR); 96: 97: if (utf8_combine(ud, &wc) != UTF8_DONE) 98: return (UTF8_ERROR); 99: if ((width = utf8_width(wc)) < 0) 100: return (UTF8_ERROR); 101: ud->width = width; 102: 103: return (UTF8_DONE); 104: } 105: 106: /* Get width of Unicode character. */ 107: static int 108: utf8_width(wchar_t wc) 109: { 110: int width; 111: 112: #ifdef HAVE_UTF8PROC 113: width = utf8proc_wcwidth(wc); 114: #else 115: width = wcwidth(wc); 116: #endif 117: if (width < 0 || width > 0xff) { 118: log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width); 119: 120: #ifndef __OpenBSD__ 121: /* 122: * Many platforms (particularly and inevitably OS X) have no 123: * width for relatively common characters (wcwidth() returns 124: * -1); assume width 1 in this case. This will be wrong for 125: * genuinely nonprintable characters, but they should be 126: * rare. We may pass through stuff that ideally we would block, 127: * but this is no worse than sending the same to the terminal 128: * without tmux. 129: */ 130: if (width < 0) 131: return (1); 132: #endif 133: return (-1); 134: } 135: return (width); 136: } 137: 138: /* Combine UTF-8 into Unicode. */ 139: enum utf8_state 140: utf8_combine(const struct utf8_data *ud, wchar_t *wc) 141: { 142: #ifdef HAVE_UTF8PROC 143: switch (utf8proc_mbtowc(wc, ud->data, ud->size)) { 144: #else 145: switch (mbtowc(wc, ud->data, ud->size)) { 146: #endif 147: case -1: 148: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 149: errno); 150: mbtowc(NULL, NULL, MB_CUR_MAX); 151: return (UTF8_ERROR); 152: case 0: 153: return (UTF8_ERROR); 154: default: 155: return (UTF8_DONE); 156: } 157: } 158: 159: /* Split Unicode into UTF-8. */ 160: enum utf8_state 161: utf8_split(wchar_t wc, struct utf8_data *ud) 162: { 163: char s[MB_LEN_MAX]; 164: int slen; 165: 166: #ifdef HAVE_UTF8PROC 167: slen = utf8proc_wctomb(s, wc); 168: #else 169: slen = wctomb(s, wc); 170: #endif 171: if (slen <= 0 || slen > (int)sizeof ud->data) 172: return (UTF8_ERROR); 173: 174: memcpy(ud->data, s, slen); 175: ud->size = slen; 176: 177: ud->width = utf8_width(wc); 178: return (UTF8_DONE); 179: } 180: 181: /* 182: * Encode len characters from src into dst, which is guaranteed to have four 183: * bytes available for each character from src (for \abc or UTF-8) plus space 184: * for \0. 185: */ 186: int 187: utf8_strvis(char *dst, const char *src, size_t len, int flag) 188: { 189: struct utf8_data ud; 190: const char *start, *end; 191: enum utf8_state more; 192: size_t i; 193: 194: start = dst; 195: end = src + len; 196: 197: while (src < end) { 198: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 199: while (++src < end && more == UTF8_MORE) 200: more = utf8_append(&ud, *src); 201: if (more == UTF8_DONE) { 202: /* UTF-8 character finished. */ 203: for (i = 0; i < ud.size; i++) 204: *dst++ = ud.data[i]; 205: continue; 206: } 207: /* Not a complete, valid UTF-8 character. */ 208: src -= ud.have; 209: } 210: if (src < end - 1) 211: dst = vis(dst, src[0], flag, src[1]); 212: else if (src < end) 213: dst = vis(dst, src[0], flag, '\0'); 214: src++; 215: } 216: 217: *dst = '\0'; 218: return (dst - start); 219: } 220: 221: /* Same as utf8_strvis but allocate the buffer. */ 222: int 223: utf8_stravis(char **dst, const char *src, int flag) 224: { 225: char *buf; 226: int len; 227: 228: buf = xreallocarray(NULL, 4, strlen(src) + 1); 229: len = utf8_strvis(buf, src, strlen(src), flag); 230: 231: *dst = xrealloc(buf, len + 1); 232: return (len); 233: } 234: 235: /* 236: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 237: * the returned string. Anything not valid printable ASCII or UTF-8 is 238: * stripped. 239: */ 240: char * 241: utf8_sanitize(const char *src) 242: { 243: char *dst; 244: size_t n; 245: enum utf8_state more; 246: struct utf8_data ud; 247: u_int i; 248: 249: dst = NULL; 250: 251: n = 0; 252: while (*src != '\0') { 253: dst = xreallocarray(dst, n + 1, sizeof *dst); 254: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 255: while (*++src != '\0' && more == UTF8_MORE) 256: more = utf8_append(&ud, *src); 257: if (more == UTF8_DONE) { 258: dst = xreallocarray(dst, n + ud.width, 259: sizeof *dst); 260: for (i = 0; i < ud.width; i++) 261: dst[n++] = '_'; 262: continue; 263: } 264: src -= ud.have; 265: } 266: if (*src > 0x1f && *src < 0x7f) 267: dst[n++] = *src; 268: else 269: dst[n++] = '_'; 270: src++; 271: } 272: 273: dst = xreallocarray(dst, n + 1, sizeof *dst); 274: dst[n] = '\0'; 275: return (dst); 276: } 277: 278: /* Get UTF-8 buffer length. */ 279: size_t 280: utf8_strlen(const struct utf8_data *s) 281: { 282: size_t i; 283: 284: for (i = 0; s[i].size != 0; i++) 285: /* nothing */; 286: return (i); 287: } 288: 289: /* Get UTF-8 string width. */ 290: u_int 291: utf8_strwidth(const struct utf8_data *s, ssize_t n) 292: { 293: ssize_t i; 294: u_int width; 295: 296: width = 0; 297: for (i = 0; s[i].size != 0; i++) { 298: if (n != -1 && n == i) 299: break; 300: width += s[i].width; 301: } 302: return (width); 303: } 304: 305: /* 306: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 307: * Caller frees. 308: */ 309: struct utf8_data * 310: utf8_fromcstr(const char *src) 311: { 312: struct utf8_data *dst; 313: size_t n; 314: enum utf8_state more; 315: 316: dst = NULL; 317: 318: n = 0; 319: while (*src != '\0') { 320: dst = xreallocarray(dst, n + 1, sizeof *dst); 321: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 322: while (*++src != '\0' && more == UTF8_MORE) 323: more = utf8_append(&dst[n], *src); 324: if (more == UTF8_DONE) { 325: n++; 326: continue; 327: } 328: src -= dst[n].have; 329: } 330: utf8_set(&dst[n], *src); 331: n++; 332: src++; 333: } 334: 335: dst = xreallocarray(dst, n + 1, sizeof *dst); 336: dst[n].size = 0; 337: return (dst); 338: } 339: 340: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 341: char * 342: utf8_tocstr(struct utf8_data *src) 343: { 344: char *dst; 345: size_t n; 346: 347: dst = NULL; 348: 349: n = 0; 350: for(; src->size != 0; src++) { 351: dst = xreallocarray(dst, n + src->size, 1); 352: memcpy(dst + n, src->data, src->size); 353: n += src->size; 354: } 355: 356: dst = xreallocarray(dst, n + 1, 1); 357: dst[n] = '\0'; 358: return (dst); 359: } 360: 361: /* Get width of UTF-8 string. */ 362: u_int 363: utf8_cstrwidth(const char *s) 364: { 365: struct utf8_data tmp; 366: u_int width; 367: enum utf8_state more; 368: 369: width = 0; 370: while (*s != '\0') { 371: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 372: while (*++s != '\0' && more == UTF8_MORE) 373: more = utf8_append(&tmp, *s); 374: if (more == UTF8_DONE) { 375: width += tmp.width; 376: continue; 377: } 378: s -= tmp.have; 379: } 380: if (*s > 0x1f && *s != 0x7f) 381: width++; 382: s++; 383: } 384: return (width); 385: } 386: 387: /* Trim UTF-8 string to width. Caller frees. */ 388: char * 389: utf8_trimcstr(const char *s, u_int width) 390: { 391: struct utf8_data *tmp, *next; 392: char *out; 393: u_int at; 394: 395: tmp = utf8_fromcstr(s); 396: 397: at = 0; 398: for (next = tmp; next->size != 0; next++) { 399: if (at + next->width > width) { 400: next->size = 0; 401: break; 402: } 403: at += next->width; 404: } 405: 406: out = utf8_tocstr(tmp); 407: free(tmp); 408: return (out); 409: } 410: 411: /* Trim UTF-8 string to width. Caller frees. */ 412: char * 413: utf8_rtrimcstr(const char *s, u_int width) 414: { 415: struct utf8_data *tmp, *next, *end; 416: char *out; 417: u_int at; 418: 419: tmp = utf8_fromcstr(s); 420: 421: for (end = tmp; end->size != 0; end++) 422: /* nothing */; 423: if (end == tmp) { 424: free(tmp); 425: return (xstrdup("")); 426: } 427: next = end - 1; 428: 429: at = 0; 430: for (;;) 431: { 432: if (at + next->width > width) { 433: next++; 434: break; 435: } 436: at += next->width; 437: 438: if (next == tmp) 439: break; 440: next--; 441: } 442: 443: out = utf8_tocstr(next); 444: free(tmp); 445: return (out); 446: } 447: 448: /* Pad UTF-8 string to width. Caller frees. */ 449: char * 450: utf8_padcstr(const char *s, u_int width) 451: { 452: size_t slen; 453: char *out; 454: u_int n, i; 455: 456: n = utf8_cstrwidth(s); 457: if (n >= width) 458: return (xstrdup(s)); 459: 460: slen = strlen(s); 461: out = xmalloc(slen + 1 + (width - n)); 462: memcpy(out, s, slen); 463: for (i = n; i < width; i++) 464: out[slen++] = ' '; 465: out[slen] = '\0'; 466: return (out); 467: }