Annotation of embedaddon/readline/mbutil.c, revision 1.1.1.2
1.1 misho 1: /* mbutil.c -- readline multibyte character utility functions */
2:
1.1.1.2 ! misho 3: /* Copyright (C) 2001-2020 Free Software Foundation, Inc.
1.1 misho 4:
5: This file is part of the GNU Readline Library (Readline), a library
6: for reading lines of text with interactive input and history editing.
7:
8: Readline is free software: you can redistribute it and/or modify
9: it under the terms of the GNU General Public License as published by
10: the Free Software Foundation, either version 3 of the License, or
11: (at your option) any later version.
12:
13: Readline is distributed in the hope that it will be useful,
14: but WITHOUT ANY WARRANTY; without even the implied warranty of
15: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16: GNU General Public License for more details.
17:
18: You should have received a copy of the GNU General Public License
19: along with Readline. If not, see <http://www.gnu.org/licenses/>.
20: */
21:
22: #define READLINE_LIBRARY
23:
24: #if defined (HAVE_CONFIG_H)
25: # include <config.h>
26: #endif
27:
28: #include <sys/types.h>
29: #include <fcntl.h>
30: #include "posixjmp.h"
31:
32: #if defined (HAVE_UNISTD_H)
33: # include <unistd.h> /* for _POSIX_VERSION */
34: #endif /* HAVE_UNISTD_H */
35:
36: #if defined (HAVE_STDLIB_H)
37: # include <stdlib.h>
38: #else
39: # include "ansi_stdlib.h"
40: #endif /* HAVE_STDLIB_H */
41:
42: #include <stdio.h>
43: #include <ctype.h>
44:
45: /* System-specific feature definitions and include files. */
46: #include "rldefs.h"
47: #include "rlmbutil.h"
48:
49: #if defined (TIOCSTAT_IN_SYS_IOCTL)
50: # include <sys/ioctl.h>
51: #endif /* TIOCSTAT_IN_SYS_IOCTL */
52:
53: /* Some standard library routines. */
54: #include "readline.h"
55:
56: #include "rlprivate.h"
57: #include "xmalloc.h"
58:
/* Byte-orientation flag, defined in this file so that the readline and
   history libraries can share it.  It starts out 0 when multibyte support
   is compiled in (HANDLE_MULTIBYTE) and 1 otherwise. */
#if !defined (HANDLE_MULTIBYTE)
int rl_byte_oriented = 1;
#else
int rl_byte_oriented = 0;
#endif

/* Shared between the libraries for the same reason.  While this is nonzero
   the routines in this file take their UTF-8 specific shortcuts. */
int _rl_utf8locale = 0;
69:
70: /* **************************************************************** */
71: /* */
72: /* Multibyte Character Utility Functions */
73: /* */
74: /* **************************************************************** */
75:
76: #if defined(HANDLE_MULTIBYTE)
77:
1.1.1.2 ! misho 78: /* **************************************************************** */
! 79: /* */
! 80: /* UTF-8 specific Character Utility Functions */
! 81: /* */
! 82: /* **************************************************************** */
! 83:
/* Return the length in bytes of the possibly-multibyte UTF-8 character
   beginning at S, examining no more than N bytes.

   Return values:
     0     S is a null pointer (UTF-8 has no shift states), or S points at
           a NUL byte;
     1..4  the length of the well-formed character at S;
     -1    N is 0, or the bytes at S are not well-formed UTF-8 (stray
           continuation byte, overlong form, UTF-16 surrogate, or a value
           above U+10FFFF);
     -2    the N bytes available are a valid prefix, but the character is
           not complete within them. */
static int
_rl_utf8_mblen (const char *s, size_t n)
{
  unsigned char c, c1, c2, c3;

  if (s == 0)
    return (0);         /* no shift states */
  if (n == 0)
    return (-1);

  c = (unsigned char)*s;
  if (c < 0x80)
    return (c != 0);

  /* 0x80-0xbf are continuation bytes, 0xc0 and 0xc1 can only start overlong
     encodings, and 0xf5-0xff would encode values above U+10FFFF.  Note that
     0xf4 itself is a legal lead byte (U+100000..U+10FFFF). */
  if (c < 0xc2 || c > 0xf4)
    return -1;

  /* Every remaining lead byte needs at least one continuation byte.  Test N
     before touching s[1] so we never look past the caller's limit. */
  if (n == 1)
    return -2;
  c1 = (unsigned char)s[1];
  if ((c1 ^ 0x80) >= 0x40)
    return -1;          /* not a continuation byte */

  if (c < 0xe0)
    return 2;

  if (c < 0xf0)
    {
      /* Reject overlong forms (e0 80..9f) and surrogates (ed a0..bf). */
      if ((c == 0xe0 && c1 < 0xa0) || (c == 0xed && c1 >= 0xa0))
        return -1;
      if (n == 2)
        return -2;
      c2 = (unsigned char)s[2];
      return (((c2 ^ 0x80) < 0x40) ? 3 : -1);
    }

  /* Lead byte is 0xf0..0xf4: reject overlong forms (f0 80..8f) and values
     above U+10FFFF (f4 90..bf). */
  if ((c == 0xf0 && c1 < 0x90) || (c == 0xf4 && c1 >= 0x90))
    return -1;
  if (n == 2)
    return -2;
  c2 = (unsigned char)s[2];
  if ((c2 ^ 0x80) >= 0x40)
    return -1;
  if (n == 3)
    return -2;
  c3 = (unsigned char)s[3];
  return (((c3 ^ 0x80) < 0x40) ? 4 : -1);
}
! 149:
! 150: static int
! 151: _rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
1.1 misho 152: {
153: size_t tmp, len;
154: mbstate_t ps;
155: int point;
156: wchar_t wc;
157:
158: tmp = 0;
159:
160: memset(&ps, 0, sizeof (mbstate_t));
161: if (seed < 0)
162: seed = 0;
163: if (count <= 0)
164: return seed;
165:
166: point = seed + _rl_adjust_point (string, seed, &ps);
1.1.1.2 ! misho 167: /* if _rl_adjust_point returns -1, the character or string is invalid.
! 168: treat as a byte. */
! 169: if (point == seed - 1) /* invalid */
! 170: return seed + 1;
! 171:
1.1 misho 172: /* if this is true, means that seed was not pointing to a byte indicating
173: the beginning of a multibyte character. Correct the point and consume
174: one char. */
175: if (seed < point)
176: count--;
177:
178: while (count > 0)
179: {
180: len = strlen (string + point);
181: if (len == 0)
182: break;
1.1.1.2 ! misho 183: if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
! 184: {
! 185: tmp = 1;
! 186: wc = (wchar_t) string[point];
! 187: memset(&ps, 0, sizeof(mbstate_t));
! 188: }
! 189: else
! 190: tmp = mbrtowc (&wc, string+point, len, &ps);
1.1 misho 191: if (MB_INVALIDCH ((size_t)tmp))
192: {
193: /* invalid bytes. assume a byte represents a character */
194: point++;
195: count--;
196: /* reset states. */
197: memset(&ps, 0, sizeof(mbstate_t));
198: }
199: else if (MB_NULLWCH (tmp))
200: break; /* found wide '\0' */
201: else
202: {
203: /* valid bytes */
204: point += tmp;
205: if (find_non_zero)
206: {
207: if (WCWIDTH (wc) == 0)
208: continue;
209: else
210: count--;
211: }
212: else
213: count--;
214: }
215: }
216:
217: if (find_non_zero)
218: {
219: tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
220: while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
221: {
222: point += tmp;
223: tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
224: }
225: }
226:
227: return point;
228: }
229:
/* Decode the multibyte character starting at STRING[IND], giving mbrtowc
   LEN - IND bytes to look at, and return nonzero unless it is a valid
   character whose WCWIDTH is not positive.  Invalid or incomplete sequences
   and the null wide character all yield 1. */
static inline int
_rl_test_nonzero (char *string, int ind, int len)
{
  mbstate_t state;
  wchar_t wch;
  size_t nbytes;

  memset (&state, 0, sizeof (mbstate_t));
  nbytes = mbrtowc (&wch, string + ind, len - ind, &state);

  /* treat invalid multibyte sequences as non-zero-width */
  if (MB_INVALIDCH (nbytes))
    return 1;
  if (MB_NULLWCH (nbytes))
    return 1;
  return (WCWIDTH (wch) > 0);
}
! 242:
/* experimental -- needs to handle zero-width characters better */
/* UTF-8 only.  Scan backward from the byte just before SEED and return the
   offset in STRING at which the previous character starts:
     - a byte satisfying UTF8_SINGLEBYTE is returned at once;
     - otherwise bytes satisfying UTF8_MBCHAR (presumably continuation
       bytes -- the macro is defined elsewhere) are stepped over, and if the
       byte reached satisfies UTF8_MBFIRSTCHAR its offset is returned;
     - if it does not, the sequence is treated as invalid and the offset of
       the byte just before SEED is returned.
   With FIND_NON_ZERO set, a character for which _rl_test_nonzero returns 0
   is skipped and the scan continues in front of it.  Returns 0 when the
   scan runs off the front of STRING (including SEED <= 0).  SEED is not
   checked against the length of STRING. */
static int
_rl_find_prev_utf8char (char *string, int seed, int find_non_zero)
{
  unsigned char b;
  int save, prev;
  size_t len;

  /* LEN is only consulted on the FIND_NON_ZERO path; give it a defined
     value anyway so it is never an indeterminate object. */
  len = 0;
  if (find_non_zero)
    len = RL_STRLEN (string);

  prev = seed - 1;
  while (prev >= 0)
    {
      b = (unsigned char)string[prev];
      if (UTF8_SINGLEBYTE (b))
        return (prev);

      /* Remember where this pass started in case the sequence turns out
         to be invalid. */
      save = prev;

      /* Move back until we're not in the middle of a multibyte char */
      if (UTF8_MBCHAR (b))
        {
          while (prev > 0 && (b = (unsigned char)string[--prev]) && UTF8_MBCHAR (b))
            ;
        }

      if (UTF8_MBFIRSTCHAR (b))
        {
          if (find_non_zero)
            {
              if (_rl_test_nonzero (string, prev, len))
                return (prev);
              else              /* valid but WCWIDTH (wc) == 0 */
                prev = prev - 1;
            }
          else
            return (prev);
        }
      else
        return (save);          /* invalid utf-8 multibyte sequence */
    }

  return ((prev < 0) ? 0 : prev);
}
! 290: /*static*/ int
! 291: _rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
1.1 misho 292: {
293: mbstate_t ps;
294: int prev, non_zero_prev, point, length;
295: size_t tmp;
296: wchar_t wc;
297:
1.1.1.2 ! misho 298: if (_rl_utf8locale)
! 299: return (_rl_find_prev_utf8char (string, seed, find_non_zero));
! 300:
1.1 misho 301: memset(&ps, 0, sizeof(mbstate_t));
302: length = strlen(string);
303:
304: if (seed < 0)
305: return 0;
306: else if (length < seed)
307: return length;
308:
309: prev = non_zero_prev = point = 0;
310: while (point < seed)
311: {
1.1.1.2 ! misho 312: if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
! 313: {
! 314: tmp = 1;
! 315: wc = (wchar_t) string[point];
! 316: memset(&ps, 0, sizeof(mbstate_t));
! 317: }
! 318: else
! 319: tmp = mbrtowc (&wc, string + point, length - point, &ps);
1.1 misho 320: if (MB_INVALIDCH ((size_t)tmp))
321: {
1.1.1.2 ! misho 322: /* in this case, bytes are invalid or too short to compose
1.1 misho 323: multibyte char, so assume that the first byte represents
324: a single character anyway. */
325: tmp = 1;
326: /* clear the state of the byte sequence, because
327: in this case effect of mbstate is undefined */
328: memset(&ps, 0, sizeof (mbstate_t));
329:
330: /* Since we're assuming that this byte represents a single
331: non-zero-width character, don't forget about it. */
332: prev = point;
333: }
334: else if (MB_NULLWCH (tmp))
335: break; /* Found '\0' char. Can this happen? */
336: else
337: {
338: if (find_non_zero)
339: {
340: if (WCWIDTH (wc) != 0)
341: prev = point;
342: }
343: else
344: prev = point;
345: }
346:
347: point += tmp;
348: }
349:
350: return prev;
351: }
352:
353: /* return the number of bytes parsed from the multibyte sequence starting
354: at src, if a non-L'\0' wide character was recognized. It returns 0,
355: if a L'\0' wide character was recognized. It returns (size_t)(-1),
356: if an invalid multibyte sequence was encountered. It returns (size_t)(-2)
357: if it couldn't parse a complete multibyte character. */
358: int
1.1.1.2 ! misho 359: _rl_get_char_len (char *src, mbstate_t *ps)
1.1 misho 360: {
1.1.1.2 ! misho 361: size_t tmp, l;
! 362: int mb_cur_max;
1.1 misho 363:
1.1.1.2 ! misho 364: /* Look at no more than MB_CUR_MAX characters */
! 365: l = (size_t)strlen (src);
! 366: if (_rl_utf8locale && l > 0 && UTF8_SINGLEBYTE(*src))
! 367: tmp = (*src != 0) ? 1 : 0;
! 368: else
! 369: {
! 370: mb_cur_max = MB_CUR_MAX;
! 371: tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
! 372: }
1.1 misho 373: if (tmp == (size_t)(-2))
374: {
1.1.1.2 ! misho 375: /* too short to compose multibyte char */
1.1 misho 376: if (ps)
377: memset (ps, 0, sizeof(mbstate_t));
378: return -2;
379: }
380: else if (tmp == (size_t)(-1))
381: {
382: /* invalid to compose multibyte char */
383: /* initialize the conversion state */
384: if (ps)
385: memset (ps, 0, sizeof(mbstate_t));
386: return -1;
387: }
388: else if (tmp == (size_t)0)
389: return 0;
390: else
391: return (int)tmp;
392: }
393:
/* Compare the character at BUF1[POS1] with the one at BUF2[POS2], using
   _rl_get_char_len (with PS1 and PS2 respectively) to measure each.  Return
   1 when both lengths are positive and equal and all of the bytes match;
   return 0 otherwise.  The second character is not measured if the first
   one has no positive length. */
int
_rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
{
  int len1, len2, k;

  len1 = _rl_get_char_len (&buf1[pos1], ps1);
  if (len1 <= 0)
    return 0;

  len2 = _rl_get_char_len (&buf2[pos2], ps2);
  if (len2 <= 0 || len1 != len2)
    return 0;

  for (k = 0; k < len1; k++)
    {
      if (buf1[pos1+k] != buf2[pos2+k])
        return 0;
    }

  return 1;
}
413:
414: /* adjust pointed byte and find mbstate of the point of string.
415: adjusted point will be point <= adjusted_point, and returns
416: differences of the byte(adjusted_point - point).
1.1.1.2 ! misho 417: if point is invalid (point < 0 || more than string length),
1.1 misho 418: it returns -1 */
419: int
1.1.1.2 ! misho 420: _rl_adjust_point (char *string, int point, mbstate_t *ps)
! 421: {
! 422: size_t tmp;
! 423: int length, pos;
1.1 misho 424:
1.1.1.2 ! misho 425: tmp = 0;
! 426: pos = 0;
1.1 misho 427: length = strlen(string);
428: if (point < 0)
429: return -1;
430: if (length < point)
431: return -1;
432:
433: while (pos < point)
434: {
1.1.1.2 ! misho 435: if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
! 436: tmp = 1;
! 437: else
! 438: tmp = mbrlen (string + pos, length - pos, ps);
1.1 misho 439: if (MB_INVALIDCH ((size_t)tmp))
440: {
1.1.1.2 ! misho 441: /* in this case, bytes are invalid or too short to compose
1.1 misho 442: multibyte char, so assume that the first byte represents
443: a single character anyway. */
444: pos++;
445: /* clear the state of the byte sequence, because
446: in this case effect of mbstate is undefined */
447: if (ps)
448: memset (ps, 0, sizeof (mbstate_t));
449: }
450: else if (MB_NULLWCH (tmp))
451: pos++;
452: else
453: pos += tmp;
454: }
455:
456: return (pos - point);
457: }
458:
/* Return 1 if the LENGTH bytes of MBCHAR are found in STRING beginning at
   offset SEED, 0 if any byte differs.  END bounds the comparison: when
   fewer than LENGTH bytes lie between SEED and END the answer is 0 without
   looking at any bytes. */
int
_rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
{
  int k;

  if (end - seed < length)
    return 0;

  k = 0;
  while (k < length)
    {
      if (string[seed + k] != mbchar[k])
        return 0;
      k++;
    }

  return 1;
}
472:
473: wchar_t
1.1.1.2 ! misho 474: _rl_char_value (char *buf, int ind)
1.1 misho 475: {
476: size_t tmp;
477: wchar_t wc;
478: mbstate_t ps;
479: int l;
480:
481: if (MB_LEN_MAX == 1 || rl_byte_oriented)
482: return ((wchar_t) buf[ind]);
1.1.1.2 ! misho 483: if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
! 484: return ((wchar_t) buf[ind]);
1.1 misho 485: l = strlen (buf);
486: if (ind >= l - 1)
487: return ((wchar_t) buf[ind]);
1.1.1.2 ! misho 488: if (l < ind) /* Sanity check */
! 489: l = strlen (buf+ind);
1.1 misho 490: memset (&ps, 0, sizeof (mbstate_t));
491: tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
492: if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
493: return ((wchar_t) buf[ind]);
494: return wc;
495: }
496: #endif /* HANDLE_MULTIBYTE */
497:
/* Return the byte offset in STRING that lies COUNT characters after offset
   SEED.  With multibyte support compiled in, the work is done by
   _rl_find_next_mbchar_internal, which receives FLAGS as its find_non_zero
   argument (pass MB_FIND_NONZERO to look for non-zero-width multibyte
   characters).  Without it every byte is one character, so the answer is
   simply SEED + COUNT. */
#undef _rl_find_next_mbchar
int
_rl_find_next_mbchar (char *string, int seed, int count, int flags)
{
#if !defined (HANDLE_MULTIBYTE)
  return (seed + count);
#else
  return _rl_find_next_mbchar_internal (string, seed, count, flags);
#endif
}
511:
/* Return the byte offset in STRING at which the character before offset
   SEED starts; the result is never greater than SEED.  With multibyte
   support compiled in, the work is done by _rl_find_prev_mbchar_internal,
   which receives FLAGS as its find_non_zero argument (pass MB_FIND_NONZERO
   to look for non-zero-width multibyte characters).  Without it the answer
   is SEED - 1, except that a SEED of 0 is returned unchanged. */
#undef _rl_find_prev_mbchar
int
_rl_find_prev_mbchar (char *string, int seed, int flags)
{
#if !defined (HANDLE_MULTIBYTE)
  if (seed == 0)
    return seed;
  return (seed - 1);
#else
  return _rl_find_prev_mbchar_internal (string, seed, flags);
#endif
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>