File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / readline / mbutil.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 01:01:01 2021 UTC (3 years, 3 months ago) by misho
Branches: readline, MAIN
CVS tags: v8_2p0, v8_1p0, HEAD
readline 8.1

    1: /* mbutil.c -- readline multibyte character utility functions */
    2: 
    3: /* Copyright (C) 2001-2020 Free Software Foundation, Inc.
    4: 
    5:    This file is part of the GNU Readline Library (Readline), a library
    6:    for reading lines of text with interactive input and history editing.      
    7: 
    8:    Readline is free software: you can redistribute it and/or modify
    9:    it under the terms of the GNU General Public License as published by
   10:    the Free Software Foundation, either version 3 of the License, or
   11:    (at your option) any later version.
   12: 
   13:    Readline is distributed in the hope that it will be useful,
   14:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   15:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   16:    GNU General Public License for more details.
   17: 
   18:    You should have received a copy of the GNU General Public License
   19:    along with Readline.  If not, see <http://www.gnu.org/licenses/>.
   20: */
   21: 
   22: #define READLINE_LIBRARY
   23: 
   24: #if defined (HAVE_CONFIG_H)
   25: #  include <config.h>
   26: #endif
   27: 
   28: #include <sys/types.h>
   29: #include <fcntl.h>
   30: #include "posixjmp.h"
   31: 
   32: #if defined (HAVE_UNISTD_H)
   33: #  include <unistd.h>	   /* for _POSIX_VERSION */
   34: #endif /* HAVE_UNISTD_H */
   35: 
   36: #if defined (HAVE_STDLIB_H)
   37: #  include <stdlib.h>
   38: #else
   39: #  include "ansi_stdlib.h"
   40: #endif /* HAVE_STDLIB_H */
   41: 
   42: #include <stdio.h>
   43: #include <ctype.h>
   44: 
   45: /* System-specific feature definitions and include files. */
   46: #include "rldefs.h"
   47: #include "rlmbutil.h"
   48: 
   49: #if defined (TIOCSTAT_IN_SYS_IOCTL)
   50: #  include <sys/ioctl.h>
   51: #endif /* TIOCSTAT_IN_SYS_IOCTL */
   52: 
   53: /* Some standard library routines. */
   54: #include "readline.h"
   55: 
   56: #include "rlprivate.h"
   57: #include "xmalloc.h"
   58: 
   /* Defined in this file so that a single definition can be shared by the
      readline and history libraries.  The initial value depends on the build:
      1 when multibyte support is not compiled in, 0 when HANDLE_MULTIBYTE is
      defined. */
   #if !defined (HANDLE_MULTIBYTE)
   int rl_byte_oriented = 1;
   #else
   int rl_byte_oriented = 0;
   #endif

   /* Shared in the same way.  Starts out as 0; the code in this file only
      tests it (to choose the UTF-8 fast paths) and never assigns it. */
   int _rl_utf8locale = 0;
   69: 
   70: /* **************************************************************** */
   71: /*								    */
   72: /*		Multibyte Character Utility Functions		    */
   73: /*								    */
   74: /* **************************************************************** */
   75: 
   76: #if defined(HANDLE_MULTIBYTE)
   77: 
   78: /* **************************************************************** */
   79: /*								    */
   80: /*		UTF-8 specific Character Utility Functions	    */
   81: /*								    */
   82: /* **************************************************************** */
   83: 
   /* Return the length in bytes of the possibly-multibyte character beginning
      at S, looking at no more than N bytes.  Encoding is UTF-8.

      Return values:
        0	S is a null pointer (there are no shift states) or *S is NUL
        1..4	number of bytes making up a well-formed sequence
        -2	the N bytes available are a valid but incomplete prefix
        -1	N is 0, or the bytes cannot start a well-formed sequence

      Overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F), the
      surrogate range (ED A0..BF) and anything above U+10FFFF (F4 90.. and
      lead bytes F5..FF) are rejected.  No byte at index >= N is ever read. */
   static int
   _rl_utf8_mblen (const char *s, size_t n)
   {
     unsigned char c, c1, c2, c3;

     if (s == 0)
       return (0);	/* no shift states */
     if (n == 0)
       return (-1);

     c = (unsigned char)*s;
     if (c < 0x80)
       return (c != 0);

     /* 0x80..0xc1 are stray continuation bytes or overlong two-byte leads;
        0xf5..0xff would encode values beyond U+10FFFF. */
     if (c < 0xc2 || c > 0xf4)
       return -1;

     /* Every remaining lead byte needs at least one continuation byte.  Check
        N before touching s[1] so a one-byte buffer is never overrun. */
     if (n == 1)
       return -2;
     c1 = (unsigned char)s[1];
     if ((c1 ^ 0x80) >= 0x40)
       return -1;			/* not 10xxxxxx */

     if (c < 0xe0)
       return 2;

     if (c < 0xf0)
       {
         if (c == 0xe0 && c1 < 0xa0)
           return -1;		/* overlong three-byte form */
         if (c == 0xed && c1 >= 0xa0)
           return -1;		/* UTF-16 surrogate */
         if (n == 2)
           return -2;
         c2 = (unsigned char)s[2];
         return ((c2 ^ 0x80) < 0x40) ? 3 : -1;
       }

     /* Lead byte 0xf0..0xf4: four-byte sequence.  0xf4 is included so that
        U+100000..U+10FFFF are accepted. */
     if (c == 0xf0 && c1 < 0x90)
       return -1;			/* overlong four-byte form */
     if (c == 0xf4 && c1 >= 0x90)
       return -1;			/* above U+10FFFF */
     if (n == 2)
       return -2;
     c2 = (unsigned char)s[2];
     if ((c2 ^ 0x80) >= 0x40)
       return -1;
     if (n == 3)
       return -2;
     c3 = (unsigned char)s[3];
     return ((c3 ^ 0x80) < 0x40) ? 4 : -1;
   }
  149: 
  150: static int
  151: _rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
  152: {
  153:   size_t tmp, len;
  154:   mbstate_t ps;
  155:   int point;
  156:   wchar_t wc;
  157: 
  158:   tmp = 0;
  159: 
  160:   memset(&ps, 0, sizeof (mbstate_t));
  161:   if (seed < 0)
  162:     seed = 0;
  163:   if (count <= 0)
  164:     return seed;
  165: 
  166:   point = seed + _rl_adjust_point (string, seed, &ps);
  167:   /* if _rl_adjust_point returns -1, the character or string is invalid.
  168:      treat as a byte. */
  169:   if (point == seed - 1)	/* invalid */
  170:     return seed + 1;
  171:     
  172:   /* if this is true, means that seed was not pointing to a byte indicating
  173:      the beginning of a multibyte character.  Correct the point and consume
  174:      one char. */
  175:   if (seed < point)
  176:     count--;
  177: 
  178:   while (count > 0)  
  179:     {
  180:       len = strlen (string + point);
  181:       if (len == 0)
  182: 	break;
  183:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
  184: 	{
  185: 	  tmp = 1;
  186: 	  wc = (wchar_t) string[point];
  187: 	  memset(&ps, 0, sizeof(mbstate_t));
  188: 	}
  189:       else
  190: 	tmp = mbrtowc (&wc, string+point, len, &ps);
  191:       if (MB_INVALIDCH ((size_t)tmp))
  192: 	{
  193: 	  /* invalid bytes. assume a byte represents a character */
  194: 	  point++;
  195: 	  count--;
  196: 	  /* reset states. */
  197: 	  memset(&ps, 0, sizeof(mbstate_t));
  198: 	}
  199:       else if (MB_NULLWCH (tmp))
  200: 	break;			/* found wide '\0' */
  201:       else
  202: 	{
  203: 	  /* valid bytes */
  204: 	  point += tmp;
  205: 	  if (find_non_zero)
  206: 	    {
  207: 	      if (WCWIDTH (wc) == 0)
  208: 		continue;
  209: 	      else
  210: 		count--;
  211: 	    }
  212: 	  else
  213: 	    count--;
  214: 	}
  215:     }
  216: 
  217:   if (find_non_zero)
  218:     {
  219:       tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
  220:       while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
  221: 	{
  222: 	  point += tmp;
  223: 	  tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
  224: 	}
  225:     }
  226: 
  227:   return point;
  228: }
  229: 
  /* Decode, with a fresh conversion state, the character beginning at byte
     offset IND of STRING; LEN - IND bytes are made available to mbrtowc.
     Return 1 if the sequence cannot be decoded, if it decodes to a null wide
     character, or if the character's WCWIDTH is positive; return 0
     otherwise.  Invalid sequences are deliberately treated as
     non-zero-width. */
  static inline int
  _rl_test_nonzero (char *string, int ind, int len)
  {
    mbstate_t state;
    wchar_t wch;
    size_t nbytes;

    memset (&state, 0, sizeof (mbstate_t));
    nbytes = mbrtowc (&wch, string + ind, len - ind, &state);

    if (MB_INVALIDCH (nbytes))
      return 1;
    if (MB_NULLWCH (nbytes))
      return 1;
    return (WCWIDTH (wch) > 0);
  }
  242: 
  /* experimental -- needs to handle zero-width characters better */
  /* UTF-8 specific backward scan.  Starting at the byte before SEED, return
     the byte offset at which the previous character in STRING begins:

       - a byte satisfying UTF8_SINGLEBYTE is returned immediately;
       - bytes satisfying UTF8_MBCHAR are skipped backward until a byte that
         does not satisfy it (or offset 0, or a NUL byte) is reached;
       - if the byte reached satisfies UTF8_MBFIRSTCHAR its offset is
         returned, unless FIND_NON_ZERO is set and _rl_test_nonzero reports
         the character as zero-width, in which case the scan continues with
         the preceding character;
       - otherwise the sequence is invalid and the offset where this step of
         the scan started is returned.

     Returns 0 if the scan runs off the beginning of STRING. */
  static int
  _rl_find_prev_utf8char (char *string, int seed, int find_non_zero)
  {
    unsigned char b;
    int save, prev;
    size_t len;

    /* The length is only needed by _rl_test_nonzero, so only pay for it when
       zero-width handling was requested.  Initialized so it is never
       indeterminate. */
    len = 0;
    if (find_non_zero)
      len = RL_STRLEN (string);

    prev = seed - 1;
    while (prev >= 0)
      {
        b = (unsigned char)string[prev];
        if (UTF8_SINGLEBYTE (b))
          return (prev);

        save = prev;

        /* Move back until we're not in the middle of a multibyte char */
        if (UTF8_MBCHAR (b))
          {
            while (prev > 0 && (b = (unsigned char)string[--prev]) && UTF8_MBCHAR (b))
              ;
          }

        if (UTF8_MBFIRSTCHAR (b))
          {
            if (find_non_zero)
              {
                if (_rl_test_nonzero (string, prev, len))
                  return (prev);
                else		/* valid but WCWIDTH (wc) == 0 */
                  prev = prev - 1;
              }
            else
              return (prev);
          }
        else
          return (save);		/* invalid utf-8 multibyte sequence */
      }

    return ((prev < 0) ? 0 : prev);
  }
  289: 
  290: /*static*/ int
  291: _rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
  292: {
  293:   mbstate_t ps;
  294:   int prev, non_zero_prev, point, length;
  295:   size_t tmp;
  296:   wchar_t wc;
  297: 
  298:   if (_rl_utf8locale)
  299:     return (_rl_find_prev_utf8char (string, seed, find_non_zero));
  300: 
  301:   memset(&ps, 0, sizeof(mbstate_t));
  302:   length = strlen(string);
  303:   
  304:   if (seed < 0)
  305:     return 0;
  306:   else if (length < seed)
  307:     return length;
  308: 
  309:   prev = non_zero_prev = point = 0;
  310:   while (point < seed)
  311:     {
  312:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
  313: 	{
  314: 	  tmp = 1;
  315: 	  wc = (wchar_t) string[point];
  316: 	  memset(&ps, 0, sizeof(mbstate_t));
  317: 	}
  318:       else
  319: 	tmp = mbrtowc (&wc, string + point, length - point, &ps);
  320:       if (MB_INVALIDCH ((size_t)tmp))
  321: 	{
  322: 	  /* in this case, bytes are invalid or too short to compose
  323: 	     multibyte char, so assume that the first byte represents
  324: 	     a single character anyway. */
  325: 	  tmp = 1;
  326: 	  /* clear the state of the byte sequence, because
  327: 	     in this case effect of mbstate is undefined  */
  328: 	  memset(&ps, 0, sizeof (mbstate_t));
  329: 
  330: 	  /* Since we're assuming that this byte represents a single
  331: 	     non-zero-width character, don't forget about it. */
  332: 	  prev = point;
  333: 	}
  334:       else if (MB_NULLWCH (tmp))
  335: 	break;			/* Found '\0' char.  Can this happen? */
  336:       else
  337: 	{
  338: 	  if (find_non_zero)
  339: 	    {
  340: 	      if (WCWIDTH (wc) != 0)
  341: 		prev = point;
  342: 	    }
  343: 	  else
  344: 	    prev = point;  
  345: 	}
  346: 
  347:       point += tmp;
  348:     }
  349: 
  350:   return prev;
  351: }
  352: 
  353: /* return the number of bytes parsed from the multibyte sequence starting
  354:    at src, if a non-L'\0' wide character was recognized. It returns 0, 
  355:    if a L'\0' wide character was recognized. It  returns (size_t)(-1), 
  356:    if an invalid multibyte sequence was encountered. It returns (size_t)(-2) 
  357:    if it couldn't parse a complete  multibyte character.  */
  358: int
  359: _rl_get_char_len (char *src, mbstate_t *ps)
  360: {
  361:   size_t tmp, l;
  362:   int mb_cur_max;
  363: 
  364:   /* Look at no more than MB_CUR_MAX characters */
  365:   l = (size_t)strlen (src);
  366:   if (_rl_utf8locale && l > 0 && UTF8_SINGLEBYTE(*src))
  367:     tmp = (*src != 0) ? 1 : 0;
  368:   else
  369:     {
  370:       mb_cur_max = MB_CUR_MAX;
  371:       tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
  372:     }
  373:   if (tmp == (size_t)(-2))
  374:     {
  375:       /* too short to compose multibyte char */
  376:       if (ps)
  377: 	memset (ps, 0, sizeof(mbstate_t));
  378:       return -2;
  379:     }
  380:   else if (tmp == (size_t)(-1))
  381:     {
  382:       /* invalid to compose multibyte char */
  383:       /* initialize the conversion state */
  384:       if (ps)
  385: 	memset (ps, 0, sizeof(mbstate_t));
  386:       return -1;
  387:     }
  388:   else if (tmp == (size_t)0)
  389:     return 0;
  390:   else
  391:     return (int)tmp;
  392: }
  393: 
  /* Compare the character at BUF1[POS1] with the one at BUF2[POS2], using
     _rl_get_char_len (with states PS1 and PS2) to measure each.  Return 1
     when both have the same positive byte length and identical bytes, and 0
     otherwise.  The second character is not measured if the first one
     yields a length <= 0. */
  int
  _rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
  {
    int len1, len2, k;

    len1 = _rl_get_char_len (buf1 + pos1, ps1);
    if (len1 <= 0)
      return 0;

    len2 = _rl_get_char_len (buf2 + pos2, ps2);
    if (len2 <= 0 || len1 != len2)
      return 0;

    /* Same length: the characters match only if every byte does. */
    for (k = 0; k < len1; k++)
      {
        if (buf1[pos1 + k] != buf2[pos2 + k])
          return 0;
      }

    return 1;
  }
  413: 
  414: /* adjust pointed byte and find mbstate of the point of string.
  415:    adjusted point will be point <= adjusted_point, and returns
  416:    differences of the byte(adjusted_point - point).
  417:    if point is invalid (point < 0 || more than string length),
  418:    it returns -1 */
  419: int
  420: _rl_adjust_point (char *string, int point, mbstate_t *ps)
  421: {
  422:   size_t tmp;
  423:   int length, pos;
  424: 
  425:   tmp = 0;
  426:   pos = 0;
  427:   length = strlen(string);
  428:   if (point < 0)
  429:     return -1;
  430:   if (length < point)
  431:     return -1;
  432:   
  433:   while (pos < point)
  434:     {
  435:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
  436: 	tmp = 1;
  437:       else
  438: 	tmp = mbrlen (string + pos, length - pos, ps);
  439:       if (MB_INVALIDCH ((size_t)tmp))
  440: 	{
  441: 	  /* in this case, bytes are invalid or too short to compose
  442: 	     multibyte char, so assume that the first byte represents
  443: 	     a single character anyway. */
  444: 	  pos++;
  445: 	  /* clear the state of the byte sequence, because
  446: 	     in this case effect of mbstate is undefined  */
  447: 	  if (ps)
  448: 	    memset (ps, 0, sizeof (mbstate_t));
  449: 	}
  450:       else if (MB_NULLWCH (tmp))
  451: 	pos++;
  452:       else
  453: 	pos += tmp;
  454:     }
  455: 
  456:   return (pos - point);
  457: }
  458: 
  /* Return 1 if the LENGTH bytes of MBCHAR are identical to the bytes of
     STRING starting at offset SEED, and 0 otherwise.  END bounds the
     comparison: if fewer than LENGTH bytes lie between SEED and END the
     answer is 0 without looking at any byte. */
  int
  _rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
  {
    int off;

    if (length > end - seed)
      return 0;

    off = 0;
    while (off < length)
      {
        if (string[seed + off] != mbchar[off])
          return 0;
        off++;
      }

    return 1;
  }
  472: 
  473: wchar_t
  474: _rl_char_value (char *buf, int ind)
  475: {
  476:   size_t tmp;
  477:   wchar_t wc;
  478:   mbstate_t ps;
  479:   int l;
  480: 
  481:   if (MB_LEN_MAX == 1 || rl_byte_oriented)
  482:     return ((wchar_t) buf[ind]);
  483:   if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
  484:     return ((wchar_t) buf[ind]);
  485:   l = strlen (buf);
  486:   if (ind >= l - 1)
  487:     return ((wchar_t) buf[ind]);
  488:   if (l < ind)			/* Sanity check */
  489:     l = strlen (buf+ind);
  490:   memset (&ps, 0, sizeof (mbstate_t));
  491:   tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
  492:   if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))  
  493:     return ((wchar_t) buf[ind]);
  494:   return wc;
  495: }
  496: #endif /* HANDLE_MULTIBYTE */
  497: 
  /* Return the byte offset reached by moving COUNT characters forward in
     STRING from byte offset SEED.  In a multibyte build the work is done by
     _rl_find_next_mbchar_internal, which receives FLAGS as its
     find-non-zero argument (pass MB_FIND_NONZERO to look for
     non-zero-width characters).  Without HANDLE_MULTIBYTE every byte is a
     character, so the answer is simply SEED + COUNT. */
  #undef _rl_find_next_mbchar
  int
  _rl_find_next_mbchar (char *string, int seed, int count, int flags)
  {
  #if !defined (HANDLE_MULTIBYTE)
    return (seed + count);
  #else
    return _rl_find_next_mbchar_internal (string, seed, count, flags);
  #endif
  }
  511: 
  /* Return the byte offset at which the character before byte offset SEED
     in STRING begins.  In a multibyte build the work is done by
     _rl_find_prev_mbchar_internal, which receives FLAGS as its
     find-non-zero argument (pass MB_FIND_NONZERO to look for
     non-zero-width characters).  Without HANDLE_MULTIBYTE the result is
     SEED - 1, except that a SEED of 0 is returned unchanged. */
  #undef _rl_find_prev_mbchar
  int
  _rl_find_prev_mbchar (char *string, int seed, int flags)
  {
  #if !defined (HANDLE_MULTIBYTE)
    if (seed == 0)
      return seed;
    return seed - 1;
  #else
    return _rl_find_prev_mbchar_internal (string, seed, flags);
  #endif
  }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>