File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / tmux / utf8.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Jun 14 12:22:44 2017 UTC (7 years ago) by misho
Branches: tmux, MAIN
CVS tags: v2_4p0, v2_4, HEAD
tmux 2.4

    1: /* $OpenBSD$ */
    2: 
    3: /*
    4:  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
    5:  *
    6:  * Permission to use, copy, modify, and distribute this software for any
    7:  * purpose with or without fee is hereby granted, provided that the above
    8:  * copyright notice and this permission notice appear in all copies.
    9:  *
   10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   14:  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
   15:  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
   16:  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   17:  */
   18: 
   19: #include <sys/types.h>
   20: 
   21: #include <errno.h>
   22: #include <stdlib.h>
   23: #include <string.h>
   24: #include <wchar.h>
   25: 
   26: #include "tmux.h"
   27: 
   28: static int	utf8_width(wchar_t);
   29: 
   30: /* Set a single character. */
   31: void
   32: utf8_set(struct utf8_data *ud, u_char ch)
   33: {
   34: 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
   35: 
   36: 	memcpy(ud, &empty, sizeof *ud);
   37: 	*ud->data = ch;
   38: }
   39: 
   40: /* Copy UTF-8 character. */
   41: void
   42: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
   43: {
   44: 	u_int	i;
   45: 
   46: 	memcpy(to, from, sizeof *to);
   47: 
   48: 	for (i = to->size; i < sizeof to->data; i++)
   49: 		to->data[i] = '\0';
   50: }
   51: 
   52: /*
   53:  * Open UTF-8 sequence.
   54:  *
   55:  * 11000010-11011111 C2-DF start of 2-byte sequence
   56:  * 11100000-11101111 E0-EF start of 3-byte sequence
   57:  * 11110000-11110100 F0-F4 start of 4-byte sequence
   58:  */
   59: enum utf8_state
   60: utf8_open(struct utf8_data *ud, u_char ch)
   61: {
   62: 	memset(ud, 0, sizeof *ud);
   63: 	if (ch >= 0xc2 && ch <= 0xdf)
   64: 		ud->size = 2;
   65: 	else if (ch >= 0xe0 && ch <= 0xef)
   66: 		ud->size = 3;
   67: 	else if (ch >= 0xf0 && ch <= 0xf4)
   68: 		ud->size = 4;
   69: 	else
   70: 		return (UTF8_ERROR);
   71: 	utf8_append(ud, ch);
   72: 	return (UTF8_MORE);
   73: }
   74: 
   75: /* Append character to UTF-8, closing if finished. */
   76: enum utf8_state
   77: utf8_append(struct utf8_data *ud, u_char ch)
   78: {
   79: 	wchar_t	wc;
   80: 	int	width;
   81: 
   82: 	if (ud->have >= ud->size)
   83: 		fatalx("UTF-8 character overflow");
   84: 	if (ud->size > sizeof ud->data)
   85: 		fatalx("UTF-8 character size too large");
   86: 
   87: 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
   88: 		ud->width = 0xff;
   89: 
   90: 	ud->data[ud->have++] = ch;
   91: 	if (ud->have != ud->size)
   92: 		return (UTF8_MORE);
   93: 
   94: 	if (ud->width == 0xff)
   95: 		return (UTF8_ERROR);
   96: 
   97: 	if (utf8_combine(ud, &wc) != UTF8_DONE)
   98: 		return (UTF8_ERROR);
   99: 	if ((width = utf8_width(wc)) < 0)
  100: 		return (UTF8_ERROR);
  101: 	ud->width = width;
  102: 
  103: 	return (UTF8_DONE);
  104: }
  105: 
  /*
   * Return the display width of wc as reported by wcwidth() (or the utf8proc
   * equivalent when HAVE_UTF8PROC is defined). Widths outside 0-255 are logged;
   * -1 is then returned, except that on platforms other than OpenBSD a negative
   * width is replaced by 1.
   */
  static int
  utf8_width(wchar_t wc)
  {
  	int	cells;

  #ifdef HAVE_UTF8PROC
  	cells = utf8proc_wcwidth(wc);
  #else
  	cells = wcwidth(wc);
  #endif
  	if (cells >= 0 && cells <= 0xff)
  		return (cells);

  	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);

  #ifndef __OpenBSD__
  	/*
  	 * Plenty of platforms (OS X being the usual offender) report -1 for
  	 * fairly ordinary characters, so treat an unknown width as 1. That is
  	 * wrong for truly nonprintable characters, but those should be rare,
  	 * and letting them through is no worse than what the terminal would
  	 * receive without tmux in the way.
  	 */
  	if (cells < 0)
  		return (1);
  #endif
  	return (-1);
  }
  137: 
  138: /* Combine UTF-8 into Unicode. */
  139: enum utf8_state
  140: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
  141: {
  142: #ifdef HAVE_UTF8PROC
  143: 	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
  144: #else
  145: 	switch (mbtowc(wc, ud->data, ud->size)) {
  146: #endif
  147: 	case -1:
  148: 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
  149: 		    errno);
  150: 		mbtowc(NULL, NULL, MB_CUR_MAX);
  151: 		return (UTF8_ERROR);
  152: 	case 0:
  153: 		return (UTF8_ERROR);
  154: 	default:
  155: 		return (UTF8_DONE);
  156: 	}
  157: }
  158: 
  159: /* Split Unicode into UTF-8. */
  160: enum utf8_state
  161: utf8_split(wchar_t wc, struct utf8_data *ud)
  162: {
  163: 	char	s[MB_LEN_MAX];
  164: 	int	slen;
  165: 
  166: #ifdef HAVE_UTF8PROC
  167: 	slen = utf8proc_wctomb(s, wc);
  168: #else
  169: 	slen = wctomb(s, wc);
  170: #endif
  171: 	if (slen <= 0 || slen > (int)sizeof ud->data)
  172: 		return (UTF8_ERROR);
  173: 
  174: 	memcpy(ud->data, s, slen);
  175: 	ud->size = slen;
  176: 
  177: 	ud->width = utf8_width(wc);
  178: 	return (UTF8_DONE);
  179: }
  180: 
  181: /*
  182:  * Encode len characters from src into dst, which is guaranteed to have four
  183:  * bytes available for each character from src (for \abc or UTF-8) plus space
  184:  * for \0.
  185:  */
  186: int
  187: utf8_strvis(char *dst, const char *src, size_t len, int flag)
  188: {
  189: 	struct utf8_data	 ud;
  190: 	const char		*start, *end;
  191: 	enum utf8_state		 more;
  192: 	size_t			 i;
  193: 
  194: 	start = dst;
  195: 	end = src + len;
  196: 
  197: 	while (src < end) {
  198: 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
  199: 			while (++src < end && more == UTF8_MORE)
  200: 				more = utf8_append(&ud, *src);
  201: 			if (more == UTF8_DONE) {
  202: 				/* UTF-8 character finished. */
  203: 				for (i = 0; i < ud.size; i++)
  204: 					*dst++ = ud.data[i];
  205: 				continue;
  206: 			}
  207: 			/* Not a complete, valid UTF-8 character. */
  208: 			src -= ud.have;
  209: 		}
  210: 		if (src < end - 1)
  211: 			dst = vis(dst, src[0], flag, src[1]);
  212: 		else if (src < end)
  213: 			dst = vis(dst, src[0], flag, '\0');
  214: 		src++;
  215: 	}
  216: 
  217: 	*dst = '\0';
  218: 	return (dst - start);
  219: }
  220: 
  /*
   * Like utf8_strvis() but the output buffer is allocated here: a worst-case
   * buffer of four bytes per source byte is filled and then resized to fit.
   * The result is stored in *dst and its length (excluding \0) is returned.
   */
  int
  utf8_stravis(char **dst, const char *src, int flag)
  {
  	size_t	 srclen = strlen(src);
  	char	*tmp;
  	int	 used;

  	tmp = xreallocarray(NULL, 4, srclen + 1);
  	used = utf8_strvis(tmp, src, srclen, flag);

  	*dst = xrealloc(tmp, used + 1);
  	return (used);
  }
  234: 
  235: /*
  236:  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
  237:  * the returned string. Anything not valid printable ASCII or UTF-8 is
  238:  * stripped.
  239:  */
  240: char *
  241: utf8_sanitize(const char *src)
  242: {
  243: 	char			*dst;
  244: 	size_t			 n;
  245: 	enum utf8_state		 more;
  246: 	struct utf8_data	 ud;
  247: 	u_int			 i;
  248: 
  249: 	dst = NULL;
  250: 
  251: 	n = 0;
  252: 	while (*src != '\0') {
  253: 		dst = xreallocarray(dst, n + 1, sizeof *dst);
  254: 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
  255: 			while (*++src != '\0' && more == UTF8_MORE)
  256: 				more = utf8_append(&ud, *src);
  257: 			if (more == UTF8_DONE) {
  258: 				dst = xreallocarray(dst, n + ud.width,
  259: 				    sizeof *dst);
  260: 				for (i = 0; i < ud.width; i++)
  261: 					dst[n++] = '_';
  262: 				continue;
  263: 			}
  264: 			src -= ud.have;
  265: 		}
  266: 		if (*src > 0x1f && *src < 0x7f)
  267: 			dst[n++] = *src;
  268: 		else
  269: 			dst[n++] = '_';
  270: 		src++;
  271: 	}
  272: 
  273: 	dst = xreallocarray(dst, n + 1, sizeof *dst);
  274: 	dst[n] = '\0';
  275: 	return (dst);
  276: }
  277: 
  278: /* Get UTF-8 buffer length. */
  279: size_t
  280: utf8_strlen(const struct utf8_data *s)
  281: {
  282: 	size_t	i;
  283: 
  284: 	for (i = 0; s[i].size != 0; i++)
  285: 		/* nothing */;
  286: 	return (i);
  287: }
  288: 
  289: /* Get UTF-8 string width. */
  290: u_int
  291: utf8_strwidth(const struct utf8_data *s, ssize_t n)
  292: {
  293: 	ssize_t	i;
  294: 	u_int	width;
  295: 
  296: 	width = 0;
  297: 	for (i = 0; s[i].size != 0; i++) {
  298: 		if (n != -1 && n == i)
  299: 			break;
  300: 		width += s[i].width;
  301: 	}
  302: 	return (width);
  303: }
  304: 
  305: /*
  306:  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
  307:  * Caller frees.
  308:  */
  309: struct utf8_data *
  310: utf8_fromcstr(const char *src)
  311: {
  312: 	struct utf8_data	*dst;
  313: 	size_t			 n;
  314: 	enum utf8_state		 more;
  315: 
  316: 	dst = NULL;
  317: 
  318: 	n = 0;
  319: 	while (*src != '\0') {
  320: 		dst = xreallocarray(dst, n + 1, sizeof *dst);
  321: 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
  322: 			while (*++src != '\0' && more == UTF8_MORE)
  323: 				more = utf8_append(&dst[n], *src);
  324: 			if (more == UTF8_DONE) {
  325: 				n++;
  326: 				continue;
  327: 			}
  328: 			src -= dst[n].have;
  329: 		}
  330: 		utf8_set(&dst[n], *src);
  331: 		n++;
  332: 		src++;
  333: 	}
  334: 
  335: 	dst = xreallocarray(dst, n + 1, sizeof *dst);
  336: 	dst[n].size = 0;
  337: 	return (dst);
  338: }
  339: 
  340: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
  341: char *
  342: utf8_tocstr(struct utf8_data *src)
  343: {
  344: 	char	*dst;
  345: 	size_t	 n;
  346: 
  347: 	dst = NULL;
  348: 
  349: 	n = 0;
  350: 	for(; src->size != 0; src++) {
  351: 		dst = xreallocarray(dst, n + src->size, 1);
  352: 		memcpy(dst + n, src->data, src->size);
  353: 		n += src->size;
  354: 	}
  355: 
  356: 	dst = xreallocarray(dst, n + 1, 1);
  357: 	dst[n] = '\0';
  358: 	return (dst);
  359: }
  360: 
  361: /* Get width of UTF-8 string. */
  362: u_int
  363: utf8_cstrwidth(const char *s)
  364: {
  365: 	struct utf8_data	tmp;
  366: 	u_int			width;
  367: 	enum utf8_state		more;
  368: 
  369: 	width = 0;
  370: 	while (*s != '\0') {
  371: 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
  372: 			while (*++s != '\0' && more == UTF8_MORE)
  373: 				more = utf8_append(&tmp, *s);
  374: 			if (more == UTF8_DONE) {
  375: 				width += tmp.width;
  376: 				continue;
  377: 			}
  378: 			s -= tmp.have;
  379: 		}
  380: 		if (*s > 0x1f && *s != 0x7f)
  381: 			width++;
  382: 		s++;
  383: 	}
  384: 	return (width);
  385: }
  386: 
  387: /* Trim UTF-8 string to width. Caller frees. */
  388: char *
  389: utf8_trimcstr(const char *s, u_int width)
  390: {
  391: 	struct utf8_data	*tmp, *next;
  392: 	char			*out;
  393: 	u_int			 at;
  394: 
  395: 	tmp = utf8_fromcstr(s);
  396: 
  397: 	at = 0;
  398: 	for (next = tmp; next->size != 0; next++) {
  399: 		if (at + next->width > width) {
  400: 			next->size = 0;
  401: 			break;
  402: 		}
  403: 		at += next->width;
  404: 	}
  405: 
  406: 	out = utf8_tocstr(tmp);
  407: 	free(tmp);
  408: 	return (out);
  409: }
  410: 
  411: /* Trim UTF-8 string to width. Caller frees. */
  412: char *
  413: utf8_rtrimcstr(const char *s, u_int width)
  414: {
  415: 	struct utf8_data	*tmp, *next, *end;
  416: 	char			*out;
  417: 	u_int			 at;
  418: 
  419: 	tmp = utf8_fromcstr(s);
  420: 
  421: 	for (end = tmp; end->size != 0; end++)
  422: 		/* nothing */;
  423: 	if (end == tmp) {
  424: 		free(tmp);
  425: 		return (xstrdup(""));
  426: 	}
  427: 	next = end - 1;
  428: 
  429: 	at = 0;
  430: 	for (;;)
  431: 	{
  432: 		if (at + next->width > width) {
  433: 			next++;
  434: 			break;
  435: 		}
  436: 		at += next->width;
  437: 
  438: 		if (next == tmp)
  439: 			break;
  440: 		next--;
  441: 	}
  442: 
  443: 	out = utf8_tocstr(next);
  444: 	free(tmp);
  445: 	return (out);
  446: }
  447: 
  448: /* Pad UTF-8 string to width. Caller frees. */
  449: char *
  450: utf8_padcstr(const char *s, u_int width)
  451: {
  452: 	size_t	 slen;
  453: 	char	*out;
  454: 	u_int	  n, i;
  455: 
  456: 	n = utf8_cstrwidth(s);
  457: 	if (n >= width)
  458: 		return (xstrdup(s));
  459: 
  460: 	slen = strlen(s);
  461: 	out = xmalloc(slen + 1 + (width - n));
  462: 	memcpy(out, s, slen);
  463: 	for (i = n; i < width; i++)
  464: 		out[slen++] = ' ';
  465: 	out[slen] = '\0';
  466: 	return (out);
  467: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>