Annotation of embedaddon/readline/mbutil.c, revision 1.1.1.2

1.1       misho       1: /* mbutil.c -- readline multibyte character utility functions */
                      2: 
1.1.1.2 ! misho       3: /* Copyright (C) 2001-2020 Free Software Foundation, Inc.
1.1       misho       4: 
                      5:    This file is part of the GNU Readline Library (Readline), a library
                      6:    for reading lines of text with interactive input and history editing.      
                      7: 
                      8:    Readline is free software: you can redistribute it and/or modify
                      9:    it under the terms of the GNU General Public License as published by
                     10:    the Free Software Foundation, either version 3 of the License, or
                     11:    (at your option) any later version.
                     12: 
                     13:    Readline is distributed in the hope that it will be useful,
                     14:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     16:    GNU General Public License for more details.
                     17: 
                     18:    You should have received a copy of the GNU General Public License
                     19:    along with Readline.  If not, see <http://www.gnu.org/licenses/>.
                     20: */
                     21: 
                     22: #define READLINE_LIBRARY
                     23: 
                     24: #if defined (HAVE_CONFIG_H)
                     25: #  include <config.h>
                     26: #endif
                     27: 
                     28: #include <sys/types.h>
                     29: #include <fcntl.h>
                     30: #include "posixjmp.h"
                     31: 
                     32: #if defined (HAVE_UNISTD_H)
                     33: #  include <unistd.h>     /* for _POSIX_VERSION */
                     34: #endif /* HAVE_UNISTD_H */
                     35: 
                     36: #if defined (HAVE_STDLIB_H)
                     37: #  include <stdlib.h>
                     38: #else
                     39: #  include "ansi_stdlib.h"
                     40: #endif /* HAVE_STDLIB_H */
                     41: 
                     42: #include <stdio.h>
                     43: #include <ctype.h>
                     44: 
                     45: /* System-specific feature definitions and include files. */
                     46: #include "rldefs.h"
                     47: #include "rlmbutil.h"
                     48: 
                     49: #if defined (TIOCSTAT_IN_SYS_IOCTL)
                     50: #  include <sys/ioctl.h>
                     51: #endif /* TIOCSTAT_IN_SYS_IOCTL */
                     52: 
                     53: /* Some standard library routines. */
                     54: #include "readline.h"
                     55: 
                     56: #include "rlprivate.h"
                     57: #include "xmalloc.h"
                     58: 
                     59: /* Declared here so it can be shared between the readline and history
                     60:    libraries. */
                     61: #if defined (HANDLE_MULTIBYTE)
                     62: int rl_byte_oriented = 0;
                     63: #else
                     64: int rl_byte_oriented = 1;
                     65: #endif
                     66: 
                     67: /* Ditto */
                     68: int _rl_utf8locale = 0;
                     69: 
                     70: /* **************************************************************** */
                     71: /*                                                                 */
                     72: /*             Multibyte Character Utility Functions               */
                     73: /*                                                                 */
                     74: /* **************************************************************** */
                     75: 
                     76: #if defined(HANDLE_MULTIBYTE)
                     77: 
1.1.1.2 ! misho      78: /* **************************************************************** */
        !            79: /*                                                                 */
        !            80: /*             UTF-8 specific Character Utility Functions          */
        !            81: /*                                                                 */
        !            82: /* **************************************************************** */
        !            83: 
        !            84: /* Return the length in bytes of the possibly-multibyte character beginning
        !            85:    at S. Encoding is UTF-8. */
1.1       misho      86: static int
1.1.1.2 ! misho      87: _rl_utf8_mblen (const char *s, size_t n)
        !            88: {
        !            89:   unsigned char c, c1, c2, c3;
        !            90: 
        !            91:   if (s == 0)
        !            92:     return (0);        /* no shift states */
        !            93:   if (n <= 0)
        !            94:     return (-1);
        !            95: 
        !            96:   c = (unsigned char)*s;
        !            97:   if (c < 0x80)
        !            98:     return (c != 0);
        !            99:   if (c >= 0xc2)
        !           100:     {
        !           101:       c1 = (unsigned char)s[1];
        !           102:       if (c < 0xe0)
        !           103:        {
        !           104:          if (n == 1)
        !           105:            return -2;
        !           106:          if (n >= 2 && (c1 ^ 0x80) < 0x40)
        !           107:            return 2;
        !           108:        }
        !           109:       else if (c < 0xf0)
        !           110:        {
        !           111:          if (n == 1)
        !           112:            return -2;
        !           113:          if ((c1 ^ 0x80) < 0x40
        !           114:                && (c >= 0xe1 || c1 >= 0xa0)
        !           115:                && (c != 0xed || c1 < 0xa0))
        !           116:            {
        !           117:              if (n == 2)
        !           118:                return -2;
        !           119:              c2 = (unsigned char)s[2];
        !           120:              if ((c2 ^ 0x80) < 0x40)
        !           121:                return 3;
        !           122:            }
        !           123:        }
        !           124:       else if (c < 0xf4)
        !           125:        {
        !           126:          if (n == 1)
        !           127:            return -2;
        !           128:          if (((c1 ^ 0x80) < 0x40)
        !           129:                && (c >= 0xf1 || c1 >= 0x90)
        !           130:                && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
        !           131:            {
        !           132:              if (n == 2)
        !           133:                return -2;
        !           134:              c2 = (unsigned char)s[2];
        !           135:              if ((c2 ^ 0x80) < 0x40)
        !           136:                {
        !           137:                  if (n == 3)
        !           138:                    return -2;
        !           139:                  c3 = (unsigned char)s[3];
        !           140:                  if ((c3 ^ 0x80) < 0x40)
        !           141:                    return 4;
        !           142:                }
        !           143:            }
        !           144:        }
        !           145:     }
        !           146:   /* invalid or incomplete multibyte character */
        !           147:   return -1;
        !           148: }
        !           149: 
        !           150: static int
        !           151: _rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
1.1       misho     152: {
                    153:   size_t tmp, len;
                    154:   mbstate_t ps;
                    155:   int point;
                    156:   wchar_t wc;
                    157: 
                    158:   tmp = 0;
                    159: 
                    160:   memset(&ps, 0, sizeof (mbstate_t));
                    161:   if (seed < 0)
                    162:     seed = 0;
                    163:   if (count <= 0)
                    164:     return seed;
                    165: 
                    166:   point = seed + _rl_adjust_point (string, seed, &ps);
1.1.1.2 ! misho     167:   /* if _rl_adjust_point returns -1, the character or string is invalid.
        !           168:      treat as a byte. */
        !           169:   if (point == seed - 1)       /* invalid */
        !           170:     return seed + 1;
        !           171:     
1.1       misho     172:   /* if this is true, means that seed was not pointing to a byte indicating
                    173:      the beginning of a multibyte character.  Correct the point and consume
                    174:      one char. */
                    175:   if (seed < point)
                    176:     count--;
                    177: 
                    178:   while (count > 0)  
                    179:     {
                    180:       len = strlen (string + point);
                    181:       if (len == 0)
                    182:        break;
1.1.1.2 ! misho     183:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
        !           184:        {
        !           185:          tmp = 1;
        !           186:          wc = (wchar_t) string[point];
        !           187:          memset(&ps, 0, sizeof(mbstate_t));
        !           188:        }
        !           189:       else
        !           190:        tmp = mbrtowc (&wc, string+point, len, &ps);
1.1       misho     191:       if (MB_INVALIDCH ((size_t)tmp))
                    192:        {
                    193:          /* invalid bytes. assume a byte represents a character */
                    194:          point++;
                    195:          count--;
                    196:          /* reset states. */
                    197:          memset(&ps, 0, sizeof(mbstate_t));
                    198:        }
                    199:       else if (MB_NULLWCH (tmp))
                    200:        break;                  /* found wide '\0' */
                    201:       else
                    202:        {
                    203:          /* valid bytes */
                    204:          point += tmp;
                    205:          if (find_non_zero)
                    206:            {
                    207:              if (WCWIDTH (wc) == 0)
                    208:                continue;
                    209:              else
                    210:                count--;
                    211:            }
                    212:          else
                    213:            count--;
                    214:        }
                    215:     }
                    216: 
                    217:   if (find_non_zero)
                    218:     {
                    219:       tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
                    220:       while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
                    221:        {
                    222:          point += tmp;
                    223:          tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
                    224:        }
                    225:     }
                    226: 
                    227:   return point;
                    228: }
                    229: 
/* Decode the multibyte character at byte offset IND of STRING, using a
   fresh conversion state and looking at no more than LEN - IND bytes.
   Return 1 if the bytes do not decode, if they decode to a wide NUL, or if
   the decoded character has a positive width; return 0 otherwise. */
static inline int
_rl_test_nonzero (char *string, int ind, int len)
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;

  memset (&state, 0, sizeof (mbstate_t));
  nbytes = mbrtowc (&wch, string + ind, len - ind, &state);

  /* treat invalid multibyte sequences as non-zero-width */
  if (MB_INVALIDCH (nbytes))
    return 1;
  if (MB_NULLWCH (nbytes))
    return 1;
  return (WCWIDTH (wch) > 0);
}
        !           242: 
        !           243: /* experimental -- needs to handle zero-width characters better */
1.1       misho     244: static int
1.1.1.2 ! misho     245: _rl_find_prev_utf8char (char *string, int seed, int find_non_zero)
        !           246: {
        !           247:   char *s;
        !           248:   unsigned char b;
        !           249:   int save, prev;
        !           250:   size_t len;
        !           251: 
        !           252:   if (find_non_zero)
        !           253:     len = RL_STRLEN (string);
        !           254: 
        !           255:   prev = seed - 1;
        !           256:   while (prev >= 0)
        !           257:    {
        !           258:       b = (unsigned char)string[prev];
        !           259:       if (UTF8_SINGLEBYTE (b))
        !           260:        return (prev);
        !           261: 
        !           262:       save = prev;
        !           263: 
        !           264:       /* Move back until we're not in the middle of a multibyte char */
        !           265:       if (UTF8_MBCHAR (b))
        !           266:        {
        !           267:          while (prev > 0 && (b = (unsigned char)string[--prev]) && UTF8_MBCHAR (b))
        !           268:            ;
        !           269:        }
        !           270: 
        !           271:       if (UTF8_MBFIRSTCHAR (b))
        !           272:        {
        !           273:          if (find_non_zero)
        !           274:            {
        !           275:              if (_rl_test_nonzero (string, prev, len))
        !           276:                return (prev);
        !           277:              else              /* valid but WCWIDTH (wc) == 0 */
        !           278:                prev = prev - 1;
        !           279:            }
        !           280:          else
        !           281:            return (prev);
        !           282:        }
        !           283:       else
        !           284:        return (save);                  /* invalid utf-8 multibyte sequence */
        !           285:     }
        !           286: 
        !           287:   return ((prev < 0) ? 0 : prev);
        !           288: }  
        !           289: 
        !           290: /*static*/ int
        !           291: _rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
1.1       misho     292: {
                    293:   mbstate_t ps;
                    294:   int prev, non_zero_prev, point, length;
                    295:   size_t tmp;
                    296:   wchar_t wc;
                    297: 
1.1.1.2 ! misho     298:   if (_rl_utf8locale)
        !           299:     return (_rl_find_prev_utf8char (string, seed, find_non_zero));
        !           300: 
1.1       misho     301:   memset(&ps, 0, sizeof(mbstate_t));
                    302:   length = strlen(string);
                    303:   
                    304:   if (seed < 0)
                    305:     return 0;
                    306:   else if (length < seed)
                    307:     return length;
                    308: 
                    309:   prev = non_zero_prev = point = 0;
                    310:   while (point < seed)
                    311:     {
1.1.1.2 ! misho     312:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
        !           313:        {
        !           314:          tmp = 1;
        !           315:          wc = (wchar_t) string[point];
        !           316:          memset(&ps, 0, sizeof(mbstate_t));
        !           317:        }
        !           318:       else
        !           319:        tmp = mbrtowc (&wc, string + point, length - point, &ps);
1.1       misho     320:       if (MB_INVALIDCH ((size_t)tmp))
                    321:        {
1.1.1.2 ! misho     322:          /* in this case, bytes are invalid or too short to compose
1.1       misho     323:             multibyte char, so assume that the first byte represents
                    324:             a single character anyway. */
                    325:          tmp = 1;
                    326:          /* clear the state of the byte sequence, because
                    327:             in this case effect of mbstate is undefined  */
                    328:          memset(&ps, 0, sizeof (mbstate_t));
                    329: 
                    330:          /* Since we're assuming that this byte represents a single
                    331:             non-zero-width character, don't forget about it. */
                    332:          prev = point;
                    333:        }
                    334:       else if (MB_NULLWCH (tmp))
                    335:        break;                  /* Found '\0' char.  Can this happen? */
                    336:       else
                    337:        {
                    338:          if (find_non_zero)
                    339:            {
                    340:              if (WCWIDTH (wc) != 0)
                    341:                prev = point;
                    342:            }
                    343:          else
                    344:            prev = point;  
                    345:        }
                    346: 
                    347:       point += tmp;
                    348:     }
                    349: 
                    350:   return prev;
                    351: }
                    352: 
                    353: /* return the number of bytes parsed from the multibyte sequence starting
                    354:    at src, if a non-L'\0' wide character was recognized. It returns 0, 
                    355:    if a L'\0' wide character was recognized. It  returns (size_t)(-1), 
                    356:    if an invalid multibyte sequence was encountered. It returns (size_t)(-2) 
                    357:    if it couldn't parse a complete  multibyte character.  */
                    358: int
1.1.1.2 ! misho     359: _rl_get_char_len (char *src, mbstate_t *ps)
1.1       misho     360: {
1.1.1.2 ! misho     361:   size_t tmp, l;
        !           362:   int mb_cur_max;
1.1       misho     363: 
1.1.1.2 ! misho     364:   /* Look at no more than MB_CUR_MAX characters */
        !           365:   l = (size_t)strlen (src);
        !           366:   if (_rl_utf8locale && l > 0 && UTF8_SINGLEBYTE(*src))
        !           367:     tmp = (*src != 0) ? 1 : 0;
        !           368:   else
        !           369:     {
        !           370:       mb_cur_max = MB_CUR_MAX;
        !           371:       tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
        !           372:     }
1.1       misho     373:   if (tmp == (size_t)(-2))
                    374:     {
1.1.1.2 ! misho     375:       /* too short to compose multibyte char */
1.1       misho     376:       if (ps)
                    377:        memset (ps, 0, sizeof(mbstate_t));
                    378:       return -2;
                    379:     }
                    380:   else if (tmp == (size_t)(-1))
                    381:     {
                    382:       /* invalid to compose multibyte char */
                    383:       /* initialize the conversion state */
                    384:       if (ps)
                    385:        memset (ps, 0, sizeof(mbstate_t));
                    386:       return -1;
                    387:     }
                    388:   else if (tmp == (size_t)0)
                    389:     return 0;
                    390:   else
                    391:     return (int)tmp;
                    392: }
                    393: 
                    394: /* compare the specified two characters. If the characters matched,
                    395:    return 1. Otherwise return 0. */
                    396: int
1.1.1.2 ! misho     397: _rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
1.1       misho     398: {
                    399:   int i, w1, w2;
                    400: 
                    401:   if ((w1 = _rl_get_char_len (&buf1[pos1], ps1)) <= 0 || 
                    402:        (w2 = _rl_get_char_len (&buf2[pos2], ps2)) <= 0 ||
                    403:        (w1 != w2) ||
                    404:        (buf1[pos1] != buf2[pos2]))
                    405:     return 0;
                    406: 
                    407:   for (i = 1; i < w1; i++)
                    408:     if (buf1[pos1+i] != buf2[pos2+i])
                    409:       return 0;
                    410: 
                    411:   return 1;
                    412: }
                    413: 
                    414: /* adjust pointed byte and find mbstate of the point of string.
                    415:    adjusted point will be point <= adjusted_point, and returns
                    416:    differences of the byte(adjusted_point - point).
1.1.1.2 ! misho     417:    if point is invalid (point < 0 || more than string length),
1.1       misho     418:    it returns -1 */
                    419: int
1.1.1.2 ! misho     420: _rl_adjust_point (char *string, int point, mbstate_t *ps)
        !           421: {
        !           422:   size_t tmp;
        !           423:   int length, pos;
1.1       misho     424: 
1.1.1.2 ! misho     425:   tmp = 0;
        !           426:   pos = 0;
1.1       misho     427:   length = strlen(string);
                    428:   if (point < 0)
                    429:     return -1;
                    430:   if (length < point)
                    431:     return -1;
                    432:   
                    433:   while (pos < point)
                    434:     {
1.1.1.2 ! misho     435:       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
        !           436:        tmp = 1;
        !           437:       else
        !           438:        tmp = mbrlen (string + pos, length - pos, ps);
1.1       misho     439:       if (MB_INVALIDCH ((size_t)tmp))
                    440:        {
1.1.1.2 ! misho     441:          /* in this case, bytes are invalid or too short to compose
1.1       misho     442:             multibyte char, so assume that the first byte represents
                    443:             a single character anyway. */
                    444:          pos++;
                    445:          /* clear the state of the byte sequence, because
                    446:             in this case effect of mbstate is undefined  */
                    447:          if (ps)
                    448:            memset (ps, 0, sizeof (mbstate_t));
                    449:        }
                    450:       else if (MB_NULLWCH (tmp))
                    451:        pos++;
                    452:       else
                    453:        pos += tmp;
                    454:     }
                    455: 
                    456:   return (pos - point);
                    457: }
                    458: 
                    459: int
1.1.1.2 ! misho     460: _rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
1.1       misho     461: {
                    462:   int i;
                    463: 
                    464:   if ((end - seed) < length)
                    465:     return 0;
                    466: 
                    467:   for (i = 0; i < length; i++)
                    468:     if (string[seed + i] != mbchar[i])
                    469:       return 0;
                    470:   return 1;
                    471: }
                    472: 
                    473: wchar_t
1.1.1.2 ! misho     474: _rl_char_value (char *buf, int ind)
1.1       misho     475: {
                    476:   size_t tmp;
                    477:   wchar_t wc;
                    478:   mbstate_t ps;
                    479:   int l;
                    480: 
                    481:   if (MB_LEN_MAX == 1 || rl_byte_oriented)
                    482:     return ((wchar_t) buf[ind]);
1.1.1.2 ! misho     483:   if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
        !           484:     return ((wchar_t) buf[ind]);
1.1       misho     485:   l = strlen (buf);
                    486:   if (ind >= l - 1)
                    487:     return ((wchar_t) buf[ind]);
1.1.1.2 ! misho     488:   if (l < ind)                 /* Sanity check */
        !           489:     l = strlen (buf+ind);
1.1       misho     490:   memset (&ps, 0, sizeof (mbstate_t));
                    491:   tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
                    492:   if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))  
                    493:     return ((wchar_t) buf[ind]);
                    494:   return wc;
                    495: }
                    496: #endif /* HANDLE_MULTIBYTE */
                    497: 
                    498: /* Find next `count' characters started byte point of the specified seed.
                    499:    If flags is MB_FIND_NONZERO, we look for non-zero-width multibyte
                    500:    characters. */
                    501: #undef _rl_find_next_mbchar
                    502: int
1.1.1.2 ! misho     503: _rl_find_next_mbchar (char *string, int seed, int count, int flags)
1.1       misho     504: {
                    505: #if defined (HANDLE_MULTIBYTE)
                    506:   return _rl_find_next_mbchar_internal (string, seed, count, flags);
                    507: #else
                    508:   return (seed + count);
                    509: #endif
                    510: }
                    511: 
                    512: /* Find previous character started byte point of the specified seed.
                    513:    Returned point will be point <= seed.  If flags is MB_FIND_NONZERO,
                    514:    we look for non-zero-width multibyte characters. */
                    515: #undef _rl_find_prev_mbchar
                    516: int
1.1.1.2 ! misho     517: _rl_find_prev_mbchar (char *string, int seed, int flags)
1.1       misho     518: {
                    519: #if defined (HANDLE_MULTIBYTE)
                    520:   return _rl_find_prev_mbchar_internal (string, seed, flags);
                    521: #else
                    522:   return ((seed == 0) ? seed : seed - 1);
                    523: #endif
                    524: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>