Annotation of embedaddon/smartmontools/regex/regcomp.c, revision 1.1
1.1 ! misho 1: /* Extended regular expression matching and search library.
! 2: Copyright (C) 2002, 2003 Free Software Foundation, Inc.
! 3: This file is part of the GNU C Library.
! 4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
! 5:
! 6: The GNU C Library is free software; you can redistribute it and/or
! 7: modify it under the terms of the GNU Lesser General Public
! 8: License as published by the Free Software Foundation; either
! 9: version 2.1 of the License, or (at your option) any later version.
! 10:
! 11: The GNU C Library is distributed in the hope that it will be useful,
! 12: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 14: Lesser General Public License for more details.
! 15:
! 16: You should have received a copy of the GNU Lesser General Public
! 17: License along with the GNU C Library; if not, write to the Free
! 18: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
! 19: 02111-1307 USA. */
! 20:
! 21: static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
! 22: int length, reg_syntax_t syntax);
! 23: static void re_compile_fastmap_iter (regex_t *bufp,
! 24: const re_dfastate_t *init_state,
! 25: char *fastmap);
! 26: static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
! 27: static reg_errcode_t init_word_char (re_dfa_t *dfa);
! 28: #ifdef RE_ENABLE_I18N
! 29: static void free_charset (re_charset_t *cset);
! 30: #endif /* RE_ENABLE_I18N */
! 31: static void free_workarea_compile (regex_t *preg);
! 32: static reg_errcode_t create_initial_state (re_dfa_t *dfa);
! 33: static reg_errcode_t analyze (re_dfa_t *dfa);
! 34: static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
! 35: static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
! 36: static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
! 37: static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
! 38: static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
! 39: int top_clone_node, int root_node,
! 40: unsigned int constraint);
! 41: static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
! 42: unsigned int constraint);
! 43: static int search_duplicated_node (re_dfa_t *dfa, int org_node,
! 44: unsigned int constraint);
! 45: static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
! 46: static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
! 47: int node, int root);
! 48: static void calc_inveclosure (re_dfa_t *dfa);
! 49: static int fetch_number (re_string_t *input, re_token_t *token,
! 50: reg_syntax_t syntax);
! 51: static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
! 52: static int peek_token (re_token_t *token, re_string_t *input,
! 53: reg_syntax_t syntax);
! 54: static int peek_token_bracket (re_token_t *token, re_string_t *input,
! 55: reg_syntax_t syntax);
! 56: static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
! 57: reg_syntax_t syntax, reg_errcode_t *err);
! 58: static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
! 59: re_token_t *token, reg_syntax_t syntax,
! 60: int nest, reg_errcode_t *err);
! 61: static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
! 62: re_token_t *token, reg_syntax_t syntax,
! 63: int nest, reg_errcode_t *err);
! 64: static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
! 65: re_token_t *token, reg_syntax_t syntax,
! 66: int nest, reg_errcode_t *err);
! 67: static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
! 68: re_token_t *token, reg_syntax_t syntax,
! 69: int nest, reg_errcode_t *err);
! 70: static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
! 71: re_dfa_t *dfa, re_token_t *token,
! 72: reg_syntax_t syntax, reg_errcode_t *err);
! 73: static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
! 74: re_token_t *token, reg_syntax_t syntax,
! 75: reg_errcode_t *err);
! 76: static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
! 77: re_string_t *regexp,
! 78: re_token_t *token, int token_len,
! 79: re_dfa_t *dfa,
! 80: reg_syntax_t syntax);
! 81: static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
! 82: re_string_t *regexp,
! 83: re_token_t *token);
! 84: #ifndef _LIBC
! 85: # ifdef RE_ENABLE_I18N
! 86: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
! 87: re_charset_t *mbcset, int *range_alloc,
! 88: bracket_elem_t *start_elem,
! 89: bracket_elem_t *end_elem);
! 90: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
! 91: re_charset_t *mbcset,
! 92: int *coll_sym_alloc,
! 93: const unsigned char *name);
! 94: # else /* not RE_ENABLE_I18N */
! 95: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
! 96: bracket_elem_t *start_elem,
! 97: bracket_elem_t *end_elem);
! 98: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
! 99: const unsigned char *name);
! 100: # endif /* not RE_ENABLE_I18N */
! 101: #endif /* not _LIBC */
! 102: #ifdef RE_ENABLE_I18N
! 103: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
! 104: re_charset_t *mbcset,
! 105: int *equiv_class_alloc,
! 106: const unsigned char *name);
! 107: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
! 108: re_charset_t *mbcset,
! 109: int *char_class_alloc,
! 110: const unsigned char *class_name,
! 111: reg_syntax_t syntax);
! 112: #else /* not RE_ENABLE_I18N */
! 113: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
! 114: const unsigned char *name);
! 115: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
! 116: const unsigned char *class_name,
! 117: reg_syntax_t syntax);
! 118: #endif /* not RE_ENABLE_I18N */
! 119: static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
! 120: static void free_bin_tree (bin_tree_t *tree);
! 121: static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
! 122: re_token_type_t type, int index);
! 123: static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
! 124:
! 125: /* This table gives an error message for each of the error codes listed
! 126: in regex.h. Obviously the order here has to be same as there.
! 127: POSIX doesn't require that we do anything for REG_NOERROR,
! 128: but why not be nice? */
! 129:
! 130: const char __re_error_msgid[] attribute_hidden =
! 131: {
! 132: #define REG_NOERROR_IDX 0
! 133: gettext_noop ("Success") /* REG_NOERROR */
! 134: "\0"
! 135: #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
! 136: gettext_noop ("No match") /* REG_NOMATCH */
! 137: "\0"
! 138: #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
! 139: gettext_noop ("Invalid regular expression") /* REG_BADPAT */
! 140: "\0"
! 141: #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
! 142: gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
! 143: "\0"
! 144: #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
! 145: gettext_noop ("Invalid character class name") /* REG_ECTYPE */
! 146: "\0"
! 147: #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
! 148: gettext_noop ("Trailing backslash") /* REG_EESCAPE */
! 149: "\0"
! 150: #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
! 151: gettext_noop ("Invalid back reference") /* REG_ESUBREG */
! 152: "\0"
! 153: #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
! 154: gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
! 155: "\0"
! 156: #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
! 157: gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
! 158: "\0"
! 159: #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
! 160: gettext_noop ("Unmatched \\{") /* REG_EBRACE */
! 161: "\0"
! 162: #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
! 163: gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
! 164: "\0"
! 165: #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
! 166: gettext_noop ("Invalid range end") /* REG_ERANGE */
! 167: "\0"
! 168: #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
! 169: gettext_noop ("Memory exhausted") /* REG_ESPACE */
! 170: "\0"
! 171: #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
! 172: gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
! 173: "\0"
! 174: #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
! 175: gettext_noop ("Premature end of regular expression") /* REG_EEND */
! 176: "\0"
! 177: #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
! 178: gettext_noop ("Regular expression too big") /* REG_ESIZE */
! 179: "\0"
! 180: #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
! 181: gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
! 182: };
! 183:
! 184: const size_t __re_error_msgid_idx[] attribute_hidden =
! 185: {
! 186: REG_NOERROR_IDX,
! 187: REG_NOMATCH_IDX,
! 188: REG_BADPAT_IDX,
! 189: REG_ECOLLATE_IDX,
! 190: REG_ECTYPE_IDX,
! 191: REG_EESCAPE_IDX,
! 192: REG_ESUBREG_IDX,
! 193: REG_EBRACK_IDX,
! 194: REG_EPAREN_IDX,
! 195: REG_EBRACE_IDX,
! 196: REG_BADBR_IDX,
! 197: REG_ERANGE_IDX,
! 198: REG_ESPACE_IDX,
! 199: REG_BADRPT_IDX,
! 200: REG_EEND_IDX,
! 201: REG_ESIZE_IDX,
! 202: REG_ERPAREN_IDX
! 203: };
! 204:
! 205: /* Entry points for GNU code. */
! 206:
! 207: /* re_compile_pattern is the GNU regular expression compiler: it
! 208: compiles PATTERN (of length LENGTH) and puts the result in BUFP.
! 209: Returns 0 if the pattern was valid, otherwise an error string.
! 210:
! 211: Assumes the `allocated' (and perhaps `buffer') and `translate' fields
! 212: are set in BUFP on entry. */
! 213:
! 214: const char *
! 215: re_compile_pattern (pattern, length, bufp)
! 216: const char *pattern;
! 217: size_t length;
! 218: struct re_pattern_buffer *bufp;
! 219: {
! 220: reg_errcode_t ret;
! 221:
! 222: /* And GNU code determines whether or not to get register information
! 223: by passing null for the REGS argument to re_match, etc., not by
! 224: setting no_sub. */
! 225: bufp->no_sub = 0;
! 226:
! 227: /* Match anchors at newline. */
! 228: bufp->newline_anchor = 1;
! 229:
! 230: ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
! 231:
! 232: if (!ret)
! 233: return NULL;
! 234: return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
! 235: }
! 236: #ifdef _LIBC
! 237: weak_alias (__re_compile_pattern, re_compile_pattern)
! 238: #endif
! 239:
! 240: /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
! 241: also be assigned to arbitrarily: each pattern buffer stores its own
! 242: syntax, so it can be changed between regex compilations. */
! 243: /* This has no initializer because initialized variables in Emacs
! 244: become read-only after dumping. */
! 245: reg_syntax_t re_syntax_options;
! 246:
! 247:
! 248: /* Specify the precise syntax of regexps for compilation. This provides
! 249: for compatibility for various utilities which historically have
! 250: different, incompatible syntaxes.
! 251:
! 252: The argument SYNTAX is a bit mask comprised of the various bits
! 253: defined in regex.h. We return the old syntax. */
! 254:
! 255: reg_syntax_t
! 256: re_set_syntax (syntax)
! 257: reg_syntax_t syntax;
! 258: {
! 259: reg_syntax_t ret = re_syntax_options;
! 260:
! 261: re_syntax_options = syntax;
! 262: return ret;
! 263: }
! 264: #ifdef _LIBC
! 265: weak_alias (__re_set_syntax, re_set_syntax)
! 266: #endif
! 267:
! 268: int
! 269: re_compile_fastmap (bufp)
! 270: struct re_pattern_buffer *bufp;
! 271: {
! 272: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
! 273: char *fastmap = bufp->fastmap;
! 274:
! 275: memset (fastmap, '\0', sizeof (char) * SBC_MAX);
! 276: re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
! 277: if (dfa->init_state != dfa->init_state_word)
! 278: re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
! 279: if (dfa->init_state != dfa->init_state_nl)
! 280: re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
! 281: if (dfa->init_state != dfa->init_state_begbuf)
! 282: re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
! 283: bufp->fastmap_accurate = 1;
! 284: return 0;
! 285: }
! 286: #ifdef _LIBC
! 287: weak_alias (__re_compile_fastmap, re_compile_fastmap)
! 288: #endif
! 289:
! 290: static inline void
! 291: re_set_fastmap (char *fastmap, int icase, int ch)
! 292: {
! 293: fastmap[ch] = 1;
! 294: if (icase)
! 295: fastmap[tolower (ch)] = 1;
! 296: }
! 297:
! 298: /* Helper function for re_compile_fastmap.
! 299: Compile fastmap for the initial_state INIT_STATE. */
! 300:
! 301: static void
! 302: re_compile_fastmap_iter (bufp, init_state, fastmap)
! 303: regex_t *bufp;
! 304: const re_dfastate_t *init_state;
! 305: char *fastmap;
! 306: {
! 307: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
! 308: int node_cnt;
! 309: int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
! 310: for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
! 311: {
! 312: int node = init_state->nodes.elems[node_cnt];
! 313: re_token_type_t type = dfa->nodes[node].type;
! 314:
! 315: if (type == CHARACTER)
! 316: re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
! 317: else if (type == SIMPLE_BRACKET)
! 318: {
! 319: int i, j, ch;
! 320: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
! 321: for (j = 0; j < UINT_BITS; ++j, ++ch)
! 322: if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
! 323: re_set_fastmap (fastmap, icase, ch);
! 324: }
! 325: #ifdef RE_ENABLE_I18N
! 326: else if (type == COMPLEX_BRACKET)
! 327: {
! 328: int i;
! 329: re_charset_t *cset = dfa->nodes[node].opr.mbcset;
! 330: if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
! 331: || cset->nranges || cset->nchar_classes)
! 332: {
! 333: # ifdef _LIBC
! 334: if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
! 335: {
! 336: /* In this case we want to catch the bytes which are
! 337: the first byte of any collation elements.
! 338: e.g. In da_DK, we want to catch 'a' since "aa"
! 339: is a valid collation element, and don't catch
! 340: 'b' since 'b' is the only collation element
! 341: which starts from 'b'. */
! 342: int j, ch;
! 343: const int32_t *table = (const int32_t *)
! 344: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
! 345: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
! 346: for (j = 0; j < UINT_BITS; ++j, ++ch)
! 347: if (table[ch] < 0)
! 348: re_set_fastmap (fastmap, icase, ch);
! 349: }
! 350: # else
! 351: if (MB_CUR_MAX > 1)
! 352: for (i = 0; i < SBC_MAX; ++i)
! 353: if (__btowc (i) == WEOF)
! 354: re_set_fastmap (fastmap, icase, i);
! 355: # endif /* not _LIBC */
! 356: }
! 357: for (i = 0; i < cset->nmbchars; ++i)
! 358: {
! 359: char buf[256];
! 360: mbstate_t state;
! 361: memset (&state, '\0', sizeof (state));
! 362: __wcrtomb (buf, cset->mbchars[i], &state);
! 363: re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
! 364: }
! 365: }
! 366: #endif /* RE_ENABLE_I18N */
! 367: else if (type == END_OF_RE || type == OP_PERIOD)
! 368: {
! 369: memset (fastmap, '\1', sizeof (char) * SBC_MAX);
! 370: if (type == END_OF_RE)
! 371: bufp->can_be_null = 1;
! 372: return;
! 373: }
! 374: }
! 375: }
! 376:
! 377: /* Entry point for POSIX code. */
! 378: /* regcomp takes a regular expression as a string and compiles it.
! 379:
! 380: PREG is a regex_t *. We do not expect any fields to be initialized,
! 381: since POSIX says we shouldn't. Thus, we set
! 382:
! 383: `buffer' to the compiled pattern;
! 384: `used' to the length of the compiled pattern;
! 385: `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
! 386: REG_EXTENDED bit in CFLAGS is set; otherwise, to
! 387: RE_SYNTAX_POSIX_BASIC;
! 388: `newline_anchor' to REG_NEWLINE being set in CFLAGS;
! 389: `fastmap' to an allocated space for the fastmap;
! 390: `fastmap_accurate' to zero;
! 391: `re_nsub' to the number of subexpressions in PATTERN.
! 392:
! 393: PATTERN is the address of the pattern string.
! 394:
! 395: CFLAGS is a series of bits which affect compilation.
! 396:
! 397: If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
! 398: use POSIX basic syntax.
! 399:
! 400: If REG_NEWLINE is set, then . and [^...] don't match newline.
! 401: Also, regexec will try a match beginning after every newline.
! 402:
! 403: If REG_ICASE is set, then we considers upper- and lowercase
! 404: versions of letters to be equivalent when matching.
! 405:
! 406: If REG_NOSUB is set, then when PREG is passed to regexec, that
! 407: routine will report only success or failure, and nothing about the
! 408: registers.
! 409:
! 410: It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
! 411: the return codes and their meanings.) */
! 412:
! 413: int
! 414: regcomp (preg, pattern, cflags)
! 415: regex_t *__restrict preg;
! 416: const char *__restrict pattern;
! 417: int cflags;
! 418: {
! 419: reg_errcode_t ret;
! 420: reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
! 421: : RE_SYNTAX_POSIX_BASIC);
! 422:
! 423: preg->buffer = NULL;
! 424: preg->allocated = 0;
! 425: preg->used = 0;
! 426:
! 427: /* Try to allocate space for the fastmap. */
! 428: preg->fastmap = re_malloc (char, SBC_MAX);
! 429: if (BE (preg->fastmap == NULL, 0))
! 430: return REG_ESPACE;
! 431:
! 432: syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
! 433:
! 434: /* If REG_NEWLINE is set, newlines are treated differently. */
! 435: if (cflags & REG_NEWLINE)
! 436: { /* REG_NEWLINE implies neither . nor [^...] match newline. */
! 437: syntax &= ~RE_DOT_NEWLINE;
! 438: syntax |= RE_HAT_LISTS_NOT_NEWLINE;
! 439: /* It also changes the matching behavior. */
! 440: preg->newline_anchor = 1;
! 441: }
! 442: else
! 443: preg->newline_anchor = 0;
! 444: preg->no_sub = !!(cflags & REG_NOSUB);
! 445: preg->translate = NULL;
! 446:
! 447: ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
! 448:
! 449: /* POSIX doesn't distinguish between an unmatched open-group and an
! 450: unmatched close-group: both are REG_EPAREN. */
! 451: if (ret == REG_ERPAREN)
! 452: ret = REG_EPAREN;
! 453:
! 454: /* We have already checked preg->fastmap != NULL. */
! 455: if (BE (ret == REG_NOERROR, 1))
! 456: /* Compute the fastmap now, since regexec cannot modify the pattern
! 457: buffer. This function nevers fails in this implementation. */
! 458: (void) re_compile_fastmap (preg);
! 459: else
! 460: {
! 461: /* Some error occurred while compiling the expression. */
! 462: re_free (preg->fastmap);
! 463: preg->fastmap = NULL;
! 464: }
! 465:
! 466: return (int) ret;
! 467: }
! 468: #ifdef _LIBC
! 469: weak_alias (__regcomp, regcomp)
! 470: #endif
! 471:
! 472: /* Returns a message corresponding to an error code, ERRCODE, returned
! 473: from either regcomp or regexec. We don't use PREG here. */
! 474:
! 475: size_t
! 476: regerror (errcode, preg, errbuf, errbuf_size)
! 477: int errcode;
! 478: const regex_t *preg;
! 479: char *errbuf;
! 480: size_t errbuf_size;
! 481: {
! 482: const char *msg;
! 483: size_t msg_size;
! 484:
! 485: if (BE (errcode < 0
! 486: || errcode >= (int) (sizeof (__re_error_msgid_idx)
! 487: / sizeof (__re_error_msgid_idx[0])), 0))
! 488: /* Only error codes returned by the rest of the code should be passed
! 489: to this routine. If we are given anything else, or if other regex
! 490: code generates an invalid error code, then the program has a bug.
! 491: Dump core so we can fix it. */
! 492: abort ();
! 493:
! 494: msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
! 495:
! 496: msg_size = strlen (msg) + 1; /* Includes the null. */
! 497:
! 498: if (BE (errbuf_size != 0, 1))
! 499: {
! 500: if (BE (msg_size > errbuf_size, 0))
! 501: {
! 502: #if defined HAVE_MEMPCPY || defined _LIBC
! 503: *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
! 504: #else
! 505: memcpy (errbuf, msg, errbuf_size - 1);
! 506: errbuf[errbuf_size - 1] = 0;
! 507: #endif
! 508: }
! 509: else
! 510: memcpy (errbuf, msg, msg_size);
! 511: }
! 512:
! 513: return msg_size;
! 514: }
! 515: #ifdef _LIBC
! 516: weak_alias (__regerror, regerror)
! 517: #endif
! 518:
! 519:
! 520: static void
! 521: free_dfa_content (re_dfa_t *dfa)
! 522: {
! 523: int i, j;
! 524:
! 525: re_free (dfa->subexps);
! 526:
! 527: for (i = 0; i < dfa->nodes_len; ++i)
! 528: {
! 529: re_token_t *node = dfa->nodes + i;
! 530: #ifdef RE_ENABLE_I18N
! 531: if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
! 532: free_charset (node->opr.mbcset);
! 533: else
! 534: #endif /* RE_ENABLE_I18N */
! 535: if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
! 536: re_free (node->opr.sbcset);
! 537: }
! 538: re_free (dfa->nexts);
! 539: for (i = 0; i < dfa->nodes_len; ++i)
! 540: {
! 541: if (dfa->eclosures != NULL)
! 542: re_node_set_free (dfa->eclosures + i);
! 543: if (dfa->inveclosures != NULL)
! 544: re_node_set_free (dfa->inveclosures + i);
! 545: if (dfa->edests != NULL)
! 546: re_node_set_free (dfa->edests + i);
! 547: }
! 548: re_free (dfa->edests);
! 549: re_free (dfa->eclosures);
! 550: re_free (dfa->inveclosures);
! 551: re_free (dfa->nodes);
! 552:
! 553: for (i = 0; i <= dfa->state_hash_mask; ++i)
! 554: {
! 555: struct re_state_table_entry *entry = dfa->state_table + i;
! 556: for (j = 0; j < entry->num; ++j)
! 557: {
! 558: re_dfastate_t *state = entry->array[j];
! 559: free_state (state);
! 560: }
! 561: re_free (entry->array);
! 562: }
! 563: re_free (dfa->state_table);
! 564:
! 565: if (dfa->word_char != NULL)
! 566: re_free (dfa->word_char);
! 567: #ifdef DEBUG
! 568: re_free (dfa->re_str);
! 569: #endif
! 570:
! 571: re_free (dfa);
! 572: }
! 573:
! 574:
! 575: /* Free dynamically allocated space used by PREG. */
! 576:
! 577: void
! 578: regfree (preg)
! 579: regex_t *preg;
! 580: {
! 581: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 582: if (BE (dfa != NULL, 1))
! 583: free_dfa_content (dfa);
! 584:
! 585: re_free (preg->fastmap);
! 586: }
! 587: #ifdef _LIBC
! 588: weak_alias (__regfree, regfree)
! 589: #endif
! 590:
! 591: /* Entry points compatible with 4.2 BSD regex library. We don't define
! 592: them unless specifically requested. */
! 593:
! 594: #if defined _REGEX_RE_COMP || defined _LIBC
! 595:
! 596: /* BSD has one and only one pattern buffer. */
! 597: static struct re_pattern_buffer re_comp_buf;
! 598:
! 599: char *
! 600: # ifdef _LIBC
! 601: /* Make these definitions weak in libc, so POSIX programs can redefine
! 602: these names if they don't use our functions, and still use
! 603: regcomp/regexec above without link errors. */
! 604: weak_function
! 605: # endif
! 606: re_comp (s)
! 607: const char *s;
! 608: {
! 609: reg_errcode_t ret;
! 610: char *fastmap;
! 611:
! 612: if (!s)
! 613: {
! 614: if (!re_comp_buf.buffer)
! 615: return gettext ("No previous regular expression");
! 616: return 0;
! 617: }
! 618:
! 619: if (re_comp_buf.buffer)
! 620: {
! 621: fastmap = re_comp_buf.fastmap;
! 622: re_comp_buf.fastmap = NULL;
! 623: __regfree (&re_comp_buf);
! 624: memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
! 625: re_comp_buf.fastmap = fastmap;
! 626: }
! 627:
! 628: if (re_comp_buf.fastmap == NULL)
! 629: {
! 630: re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
! 631: if (re_comp_buf.fastmap == NULL)
! 632: return (char *) gettext (__re_error_msgid
! 633: + __re_error_msgid_idx[(int) REG_ESPACE]);
! 634: }
! 635:
! 636: /* Since `re_exec' always passes NULL for the `regs' argument, we
! 637: don't need to initialize the pattern buffer fields which affect it. */
! 638:
! 639: /* Match anchors at newlines. */
! 640: re_comp_buf.newline_anchor = 1;
! 641:
! 642: ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
! 643:
! 644: if (!ret)
! 645: return NULL;
! 646:
! 647: /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
! 648: return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
! 649: }
! 650:
! 651: #ifdef _LIBC
! 652: libc_freeres_fn (free_mem)
! 653: {
! 654: __regfree (&re_comp_buf);
! 655: }
! 656: #endif
! 657:
! 658: #endif /* _REGEX_RE_COMP */
! 659:
! 660: /* Internal entry point.
! 661: Compile the regular expression PATTERN, whose length is LENGTH.
! 662: SYNTAX indicate regular expression's syntax. */
! 663:
! 664: static reg_errcode_t
! 665: re_compile_internal (preg, pattern, length, syntax)
! 666: regex_t *preg;
! 667: const char * pattern;
! 668: int length;
! 669: reg_syntax_t syntax;
! 670: {
! 671: reg_errcode_t err = REG_NOERROR;
! 672: re_dfa_t *dfa;
! 673: re_string_t regexp;
! 674:
! 675: /* Initialize the pattern buffer. */
! 676: preg->fastmap_accurate = 0;
! 677: preg->syntax = syntax;
! 678: preg->not_bol = preg->not_eol = 0;
! 679: preg->used = 0;
! 680: preg->re_nsub = 0;
! 681: preg->can_be_null = 0;
! 682: preg->regs_allocated = REGS_UNALLOCATED;
! 683:
! 684: /* Initialize the dfa. */
! 685: dfa = (re_dfa_t *) preg->buffer;
! 686: if (preg->allocated < sizeof (re_dfa_t))
! 687: {
! 688: /* If zero allocated, but buffer is non-null, try to realloc
! 689: enough space. This loses if buffer's address is bogus, but
! 690: that is the user's responsibility. If ->buffer is NULL this
! 691: is a simple allocation. */
! 692: dfa = re_realloc (preg->buffer, re_dfa_t, 1);
! 693: if (dfa == NULL)
! 694: return REG_ESPACE;
! 695: preg->allocated = sizeof (re_dfa_t);
! 696: }
! 697: preg->buffer = (unsigned char *) dfa;
! 698: preg->used = sizeof (re_dfa_t);
! 699:
! 700: err = init_dfa (dfa, length);
! 701: if (BE (err != REG_NOERROR, 0))
! 702: {
! 703: re_free (dfa);
! 704: preg->buffer = NULL;
! 705: preg->allocated = 0;
! 706: return err;
! 707: }
! 708: #ifdef DEBUG
! 709: dfa->re_str = re_malloc (char, length + 1);
! 710: strncpy (dfa->re_str, pattern, length + 1);
! 711: #endif
! 712:
! 713: err = re_string_construct (®exp, pattern, length, preg->translate,
! 714: syntax & RE_ICASE);
! 715: if (BE (err != REG_NOERROR, 0))
! 716: {
! 717: re_free (dfa);
! 718: preg->buffer = NULL;
! 719: preg->allocated = 0;
! 720: return err;
! 721: }
! 722:
! 723: /* Parse the regular expression, and build a structure tree. */
! 724: preg->re_nsub = 0;
! 725: dfa->str_tree = parse (®exp, preg, syntax, &err);
! 726: if (BE (dfa->str_tree == NULL, 0))
! 727: goto re_compile_internal_free_return;
! 728:
! 729: /* Analyze the tree and collect information which is necessary to
! 730: create the dfa. */
! 731: err = analyze (dfa);
! 732: if (BE (err != REG_NOERROR, 0))
! 733: goto re_compile_internal_free_return;
! 734:
! 735: /* Then create the initial state of the dfa. */
! 736: err = create_initial_state (dfa);
! 737:
! 738: /* Release work areas. */
! 739: free_workarea_compile (preg);
! 740: re_string_destruct (®exp);
! 741:
! 742: if (BE (err != REG_NOERROR, 0))
! 743: {
! 744: re_compile_internal_free_return:
! 745: free_dfa_content (dfa);
! 746: preg->buffer = NULL;
! 747: preg->allocated = 0;
! 748: }
! 749:
! 750: return err;
! 751: }
! 752:
! 753: /* Initialize DFA. We use the length of the regular expression PAT_LEN
! 754: as the initial length of some arrays. */
! 755:
! 756: static reg_errcode_t
! 757: init_dfa (dfa, pat_len)
! 758: re_dfa_t *dfa;
! 759: int pat_len;
! 760: {
! 761: int table_size;
! 762:
! 763: memset (dfa, '\0', sizeof (re_dfa_t));
! 764:
! 765: dfa->nodes_alloc = pat_len + 1;
! 766: dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
! 767:
! 768: dfa->states_alloc = pat_len + 1;
! 769:
! 770: /* table_size = 2 ^ ceil(log pat_len) */
! 771: for (table_size = 1; table_size > 0; table_size <<= 1)
! 772: if (table_size > pat_len)
! 773: break;
! 774:
! 775: dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
! 776: dfa->state_hash_mask = table_size - 1;
! 777:
! 778: dfa->subexps_alloc = 1;
! 779: dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
! 780: dfa->word_char = NULL;
! 781:
! 782: if (BE (dfa->nodes == NULL || dfa->state_table == NULL
! 783: || dfa->subexps == NULL, 0))
! 784: {
! 785: /* We don't bother to free anything which was allocated. Very
! 786: soon the process will go down anyway. */
! 787: dfa->subexps = NULL;
! 788: dfa->state_table = NULL;
! 789: dfa->nodes = NULL;
! 790: return REG_ESPACE;
! 791: }
! 792: return REG_NOERROR;
! 793: }
! 794:
! 795: /* Initialize WORD_CHAR table, which indicate which character is
! 796: "word". In this case "word" means that it is the word construction
! 797: character used by some operators like "\<", "\>", etc. */
! 798:
! 799: static reg_errcode_t
! 800: init_word_char (dfa)
! 801: re_dfa_t *dfa;
! 802: {
! 803: int i, j, ch;
! 804: dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
! 805: if (BE (dfa->word_char == NULL, 0))
! 806: return REG_ESPACE;
! 807: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
! 808: for (j = 0; j < UINT_BITS; ++j, ++ch)
! 809: if (isalnum (ch) || ch == '_')
! 810: dfa->word_char[i] |= 1 << j;
! 811: return REG_NOERROR;
! 812: }
! 813:
! 814: /* Free the work area which are only used while compiling. */
! 815:
! 816: static void
! 817: free_workarea_compile (preg)
! 818: regex_t *preg;
! 819: {
! 820: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 821: free_bin_tree (dfa->str_tree);
! 822: dfa->str_tree = NULL;
! 823: re_free (dfa->org_indices);
! 824: dfa->org_indices = NULL;
! 825: }
! 826:
! 827: /* Create initial states for all contexts. */
! 828:
! 829: static reg_errcode_t
! 830: create_initial_state (dfa)
! 831: re_dfa_t *dfa;
! 832: {
! 833: int first, i;
! 834: reg_errcode_t err;
! 835: re_node_set init_nodes;
! 836:
! 837: /* Initial states have the epsilon closure of the node which is
! 838: the first node of the regular expression. */
! 839: first = dfa->str_tree->first;
! 840: dfa->init_node = first;
! 841: err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
! 842: if (BE (err != REG_NOERROR, 0))
! 843: return err;
! 844:
! 845: /* The back-references which are in initial states can epsilon transit,
! 846: since in this case all of the subexpressions can be null.
! 847: Then we add epsilon closures of the nodes which are the next nodes of
! 848: the back-references. */
! 849: if (dfa->nbackref > 0)
! 850: for (i = 0; i < init_nodes.nelem; ++i)
! 851: {
! 852: int node_idx = init_nodes.elems[i];
! 853: re_token_type_t type = dfa->nodes[node_idx].type;
! 854:
! 855: int clexp_idx;
! 856: if (type != OP_BACK_REF)
! 857: continue;
! 858: for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
! 859: {
! 860: re_token_t *clexp_node;
! 861: clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
! 862: if (clexp_node->type == OP_CLOSE_SUBEXP
! 863: && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
! 864: break;
! 865: }
! 866: if (clexp_idx == init_nodes.nelem)
! 867: continue;
! 868:
! 869: if (type == OP_BACK_REF)
! 870: {
! 871: int dest_idx = dfa->edests[node_idx].elems[0];
! 872: if (!re_node_set_contains (&init_nodes, dest_idx))
! 873: {
! 874: re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
! 875: i = 0;
! 876: }
! 877: }
! 878: }
! 879:
! 880: /* It must be the first time to invoke acquire_state. */
! 881: dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
! 882: /* We don't check ERR here, since the initial state must not be NULL. */
! 883: if (BE (dfa->init_state == NULL, 0))
! 884: return err;
! 885: if (dfa->init_state->has_constraint)
! 886: {
! 887: dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
! 888: CONTEXT_WORD);
! 889: dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
! 890: CONTEXT_NEWLINE);
! 891: dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
! 892: &init_nodes,
! 893: CONTEXT_NEWLINE
! 894: | CONTEXT_BEGBUF);
! 895: if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
! 896: || dfa->init_state_begbuf == NULL, 0))
! 897: return err;
! 898: }
! 899: else
! 900: dfa->init_state_word = dfa->init_state_nl
! 901: = dfa->init_state_begbuf = dfa->init_state;
! 902:
! 903: re_node_set_free (&init_nodes);
! 904: return REG_NOERROR;
! 905: }
! 906:
! 907: /* Analyze the structure tree, and calculate "first", "next", "edest",
! 908: "eclosure", and "inveclosure". */
! 909:
! 910: static reg_errcode_t
! 911: analyze (dfa)
! 912: re_dfa_t *dfa;
! 913: {
! 914: int i;
! 915: reg_errcode_t ret;
! 916:
! 917: /* Allocate arrays. */
! 918: dfa->nexts = re_malloc (int, dfa->nodes_alloc);
! 919: dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
! 920: dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
! 921: dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
! 922: dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
! 923: if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
! 924: || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
! 925: return REG_ESPACE;
! 926: /* Initialize them. */
! 927: for (i = 0; i < dfa->nodes_len; ++i)
! 928: {
! 929: dfa->nexts[i] = -1;
! 930: re_node_set_init_empty (dfa->edests + i);
! 931: re_node_set_init_empty (dfa->eclosures + i);
! 932: re_node_set_init_empty (dfa->inveclosures + i);
! 933: }
! 934:
! 935: ret = analyze_tree (dfa, dfa->str_tree);
! 936: if (BE (ret == REG_NOERROR, 1))
! 937: {
! 938: ret = calc_eclosure (dfa);
! 939: if (ret == REG_NOERROR)
! 940: calc_inveclosure (dfa);
! 941: }
! 942: return ret;
! 943: }
! 944:
! 945: /* Helper functions for analyze.
! 946: This function calculate "first", "next", and "edest" for the subtree
! 947: whose root is NODE. */
! 948:
! 949: static reg_errcode_t
! 950: analyze_tree (dfa, node)
! 951: re_dfa_t *dfa;
! 952: bin_tree_t *node;
! 953: {
! 954: reg_errcode_t ret;
! 955: if (node->first == -1)
! 956: calc_first (dfa, node);
! 957: if (node->next == -1)
! 958: calc_next (dfa, node);
! 959: if (node->eclosure.nelem == 0)
! 960: calc_epsdest (dfa, node);
! 961: /* Calculate "first" etc. for the left child. */
! 962: if (node->left != NULL)
! 963: {
! 964: ret = analyze_tree (dfa, node->left);
! 965: if (BE (ret != REG_NOERROR, 0))
! 966: return ret;
! 967: }
! 968: /* Calculate "first" etc. for the right child. */
! 969: if (node->right != NULL)
! 970: {
! 971: ret = analyze_tree (dfa, node->right);
! 972: if (BE (ret != REG_NOERROR, 0))
! 973: return ret;
! 974: }
! 975: return REG_NOERROR;
! 976: }
! 977:
! 978: /* Calculate "first" for the node NODE. */
! 979: static void
! 980: calc_first (dfa, node)
! 981: re_dfa_t *dfa;
! 982: bin_tree_t *node;
! 983: {
! 984: int idx, type;
! 985: idx = node->node_idx;
! 986: type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
! 987:
! 988: switch (type)
! 989: {
! 990: #ifdef DEBUG
! 991: case OP_OPEN_BRACKET:
! 992: case OP_CLOSE_BRACKET:
! 993: case OP_OPEN_DUP_NUM:
! 994: case OP_CLOSE_DUP_NUM:
! 995: case OP_NON_MATCH_LIST:
! 996: case OP_OPEN_COLL_ELEM:
! 997: case OP_CLOSE_COLL_ELEM:
! 998: case OP_OPEN_EQUIV_CLASS:
! 999: case OP_CLOSE_EQUIV_CLASS:
! 1000: case OP_OPEN_CHAR_CLASS:
! 1001: case OP_CLOSE_CHAR_CLASS:
! 1002: /* These must not be appeared here. */
! 1003: assert (0);
! 1004: #endif
! 1005: case END_OF_RE:
! 1006: case CHARACTER:
! 1007: case OP_PERIOD:
! 1008: case OP_DUP_ASTERISK:
! 1009: case OP_DUP_QUESTION:
! 1010: #ifdef RE_ENABLE_I18N
! 1011: case COMPLEX_BRACKET:
! 1012: #endif /* RE_ENABLE_I18N */
! 1013: case SIMPLE_BRACKET:
! 1014: case OP_BACK_REF:
! 1015: case ANCHOR:
! 1016: case OP_OPEN_SUBEXP:
! 1017: case OP_CLOSE_SUBEXP:
! 1018: node->first = idx;
! 1019: break;
! 1020: case OP_DUP_PLUS:
! 1021: #ifdef DEBUG
! 1022: assert (node->left != NULL);
! 1023: #endif
! 1024: if (node->left->first == -1)
! 1025: calc_first (dfa, node->left);
! 1026: node->first = node->left->first;
! 1027: break;
! 1028: case OP_ALT:
! 1029: node->first = idx;
! 1030: break;
! 1031: /* else fall through */
! 1032: default:
! 1033: #ifdef DEBUG
! 1034: assert (node->left != NULL);
! 1035: #endif
! 1036: if (node->left->first == -1)
! 1037: calc_first (dfa, node->left);
! 1038: node->first = node->left->first;
! 1039: break;
! 1040: }
! 1041: }
! 1042:
! 1043: /* Calculate "next" for the node NODE. */
! 1044:
! 1045: static void
! 1046: calc_next (dfa, node)
! 1047: re_dfa_t *dfa;
! 1048: bin_tree_t *node;
! 1049: {
! 1050: int idx, type;
! 1051: bin_tree_t *parent = node->parent;
! 1052: if (parent == NULL)
! 1053: {
! 1054: node->next = -1;
! 1055: idx = node->node_idx;
! 1056: if (node->type == 0)
! 1057: dfa->nexts[idx] = node->next;
! 1058: return;
! 1059: }
! 1060:
! 1061: idx = parent->node_idx;
! 1062: type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
! 1063:
! 1064: switch (type)
! 1065: {
! 1066: case OP_DUP_ASTERISK:
! 1067: case OP_DUP_PLUS:
! 1068: node->next = idx;
! 1069: break;
! 1070: case CONCAT:
! 1071: if (parent->left == node)
! 1072: {
! 1073: if (parent->right->first == -1)
! 1074: calc_first (dfa, parent->right);
! 1075: node->next = parent->right->first;
! 1076: break;
! 1077: }
! 1078: /* else fall through */
! 1079: default:
! 1080: if (parent->next == -1)
! 1081: calc_next (dfa, parent);
! 1082: node->next = parent->next;
! 1083: break;
! 1084: }
! 1085: idx = node->node_idx;
! 1086: if (node->type == 0)
! 1087: dfa->nexts[idx] = node->next;
! 1088: }
! 1089:
! 1090: /* Calculate "edest" for the node NODE. */
! 1091:
! 1092: static void
! 1093: calc_epsdest (dfa, node)
! 1094: re_dfa_t *dfa;
! 1095: bin_tree_t *node;
! 1096: {
! 1097: int idx;
! 1098: idx = node->node_idx;
! 1099: if (node->type == 0)
! 1100: {
! 1101: if (dfa->nodes[idx].type == OP_DUP_ASTERISK
! 1102: || dfa->nodes[idx].type == OP_DUP_PLUS
! 1103: || dfa->nodes[idx].type == OP_DUP_QUESTION)
! 1104: {
! 1105: if (node->left->first == -1)
! 1106: calc_first (dfa, node->left);
! 1107: if (node->next == -1)
! 1108: calc_next (dfa, node);
! 1109: re_node_set_init_2 (dfa->edests + idx, node->left->first,
! 1110: node->next);
! 1111: }
! 1112: else if (dfa->nodes[idx].type == OP_ALT)
! 1113: {
! 1114: int left, right;
! 1115: if (node->left != NULL)
! 1116: {
! 1117: if (node->left->first == -1)
! 1118: calc_first (dfa, node->left);
! 1119: left = node->left->first;
! 1120: }
! 1121: else
! 1122: {
! 1123: if (node->next == -1)
! 1124: calc_next (dfa, node);
! 1125: left = node->next;
! 1126: }
! 1127: if (node->right != NULL)
! 1128: {
! 1129: if (node->right->first == -1)
! 1130: calc_first (dfa, node->right);
! 1131: right = node->right->first;
! 1132: }
! 1133: else
! 1134: {
! 1135: if (node->next == -1)
! 1136: calc_next (dfa, node);
! 1137: right = node->next;
! 1138: }
! 1139: re_node_set_init_2 (dfa->edests + idx, left, right);
! 1140: }
! 1141: else if (dfa->nodes[idx].type == ANCHOR
! 1142: || dfa->nodes[idx].type == OP_OPEN_SUBEXP
! 1143: || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
! 1144: || dfa->nodes[idx].type == OP_BACK_REF)
! 1145: re_node_set_init_1 (dfa->edests + idx, node->next);
! 1146: }
! 1147: }
! 1148:
! 1149: /* Duplicate the epsilon closure of the node ROOT_NODE.
! 1150: Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
! 1151: to their own constraint. */
! 1152:
! 1153: static reg_errcode_t
! 1154: duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
! 1155: init_constraint)
! 1156: re_dfa_t *dfa;
! 1157: int top_org_node, top_clone_node, root_node;
! 1158: unsigned int init_constraint;
! 1159: {
! 1160: reg_errcode_t err;
! 1161: int org_node, clone_node, ret;
! 1162: unsigned int constraint = init_constraint;
! 1163: for (org_node = top_org_node, clone_node = top_clone_node;;)
! 1164: {
! 1165: int org_dest, clone_dest;
! 1166: if (dfa->nodes[org_node].type == OP_BACK_REF)
! 1167: {
! 1168: /* If the back reference epsilon-transit, its destination must
! 1169: also have the constraint. Then duplicate the epsilon closure
! 1170: of the destination of the back reference, and store it in
! 1171: edests of the back reference. */
! 1172: org_dest = dfa->nexts[org_node];
! 1173: re_node_set_empty (dfa->edests + clone_node);
! 1174: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
! 1175: if (BE (err != REG_NOERROR, 0))
! 1176: return err;
! 1177: dfa->nexts[clone_node] = dfa->nexts[org_node];
! 1178: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
! 1179: if (BE (ret < 0, 0))
! 1180: return REG_ESPACE;
! 1181: }
! 1182: else if (dfa->edests[org_node].nelem == 0)
! 1183: {
! 1184: /* In case of the node can't epsilon-transit, don't duplicate the
! 1185: destination and store the original destination as the
! 1186: destination of the node. */
! 1187: dfa->nexts[clone_node] = dfa->nexts[org_node];
! 1188: break;
! 1189: }
! 1190: else if (dfa->edests[org_node].nelem == 1)
! 1191: {
! 1192: /* In case of the node can epsilon-transit, and it has only one
! 1193: destination. */
! 1194: org_dest = dfa->edests[org_node].elems[0];
! 1195: re_node_set_empty (dfa->edests + clone_node);
! 1196: if (dfa->nodes[org_node].type == ANCHOR)
! 1197: {
! 1198: /* In case of the node has another constraint, append it. */
! 1199: if (org_node == root_node && clone_node != org_node)
! 1200: {
! 1201: /* ...but if the node is root_node itself, it means the
! 1202: epsilon closure have a loop, then tie it to the
! 1203: destination of the root_node. */
! 1204: ret = re_node_set_insert (dfa->edests + clone_node,
! 1205: org_dest);
! 1206: if (BE (ret < 0, 0))
! 1207: return REG_ESPACE;
! 1208: break;
! 1209: }
! 1210: constraint |= dfa->nodes[org_node].opr.ctx_type;
! 1211: }
! 1212: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
! 1213: if (BE (err != REG_NOERROR, 0))
! 1214: return err;
! 1215: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
! 1216: if (BE (ret < 0, 0))
! 1217: return REG_ESPACE;
! 1218: }
! 1219: else /* dfa->edests[org_node].nelem == 2 */
! 1220: {
! 1221: /* In case of the node can epsilon-transit, and it has two
! 1222: destinations. E.g. '|', '*', '+', '?'. */
! 1223: org_dest = dfa->edests[org_node].elems[0];
! 1224: re_node_set_empty (dfa->edests + clone_node);
! 1225: /* Search for a duplicated node which satisfies the constraint. */
! 1226: clone_dest = search_duplicated_node (dfa, org_dest, constraint);
! 1227: if (clone_dest == -1)
! 1228: {
! 1229: /* There are no such a duplicated node, create a new one. */
! 1230: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
! 1231: if (BE (err != REG_NOERROR, 0))
! 1232: return err;
! 1233: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
! 1234: if (BE (ret < 0, 0))
! 1235: return REG_ESPACE;
! 1236: err = duplicate_node_closure (dfa, org_dest, clone_dest,
! 1237: root_node, constraint);
! 1238: if (BE (err != REG_NOERROR, 0))
! 1239: return err;
! 1240: }
! 1241: else
! 1242: {
! 1243: /* There are a duplicated node which satisfy the constraint,
! 1244: use it to avoid infinite loop. */
! 1245: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
! 1246: if (BE (ret < 0, 0))
! 1247: return REG_ESPACE;
! 1248: }
! 1249:
! 1250: org_dest = dfa->edests[org_node].elems[1];
! 1251: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
! 1252: if (BE (err != REG_NOERROR, 0))
! 1253: return err;
! 1254: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
! 1255: if (BE (ret < 0, 0))
! 1256: return REG_ESPACE;
! 1257: }
! 1258: org_node = org_dest;
! 1259: clone_node = clone_dest;
! 1260: }
! 1261: return REG_NOERROR;
! 1262: }
! 1263:
! 1264: /* Search for a node which is duplicated from the node ORG_NODE, and
! 1265: satisfies the constraint CONSTRAINT. */
! 1266:
! 1267: static int
! 1268: search_duplicated_node (dfa, org_node, constraint)
! 1269: re_dfa_t *dfa;
! 1270: int org_node;
! 1271: unsigned int constraint;
! 1272: {
! 1273: int idx;
! 1274: for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
! 1275: {
! 1276: if (org_node == dfa->org_indices[idx]
! 1277: && constraint == dfa->nodes[idx].constraint)
! 1278: return idx; /* Found. */
! 1279: }
! 1280: return -1; /* Not found. */
! 1281: }
! 1282:
! 1283: /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
! 1284: The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
! 1285: otherwise return the error code. */
! 1286:
! 1287: static reg_errcode_t
! 1288: duplicate_node (new_idx, dfa, org_idx, constraint)
! 1289: re_dfa_t *dfa;
! 1290: int *new_idx, org_idx;
! 1291: unsigned int constraint;
! 1292: {
! 1293: re_token_t dup;
! 1294: int dup_idx;
! 1295:
! 1296: dup = dfa->nodes[org_idx];
! 1297: dup_idx = re_dfa_add_node (dfa, dup, 1);
! 1298: if (BE (dup_idx == -1, 0))
! 1299: return REG_ESPACE;
! 1300: dfa->nodes[dup_idx].constraint = constraint;
! 1301: if (dfa->nodes[org_idx].type == ANCHOR)
! 1302: dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
! 1303: dfa->nodes[dup_idx].duplicated = 1;
! 1304: re_node_set_init_empty (dfa->edests + dup_idx);
! 1305: re_node_set_init_empty (dfa->eclosures + dup_idx);
! 1306: re_node_set_init_empty (dfa->inveclosures + dup_idx);
! 1307:
! 1308: /* Store the index of the original node. */
! 1309: dfa->org_indices[dup_idx] = org_idx;
! 1310: *new_idx = dup_idx;
! 1311: return REG_NOERROR;
! 1312: }
! 1313:
! 1314: static void
! 1315: calc_inveclosure (dfa)
! 1316: re_dfa_t *dfa;
! 1317: {
! 1318: int src, idx, dest;
! 1319: for (src = 0; src < dfa->nodes_len; ++src)
! 1320: {
! 1321: for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
! 1322: {
! 1323: dest = dfa->eclosures[src].elems[idx];
! 1324: re_node_set_insert (dfa->inveclosures + dest, src);
! 1325: }
! 1326: }
! 1327: }
! 1328:
! 1329: /* Calculate "eclosure" for all the node in DFA. */
! 1330:
! 1331: static reg_errcode_t
! 1332: calc_eclosure (dfa)
! 1333: re_dfa_t *dfa;
! 1334: {
! 1335: int node_idx, incomplete;
! 1336: #ifdef DEBUG
! 1337: assert (dfa->nodes_len > 0);
! 1338: #endif
! 1339: incomplete = 0;
! 1340: /* For each nodes, calculate epsilon closure. */
! 1341: for (node_idx = 0; ; ++node_idx)
! 1342: {
! 1343: reg_errcode_t err;
! 1344: re_node_set eclosure_elem;
! 1345: if (node_idx == dfa->nodes_len)
! 1346: {
! 1347: if (!incomplete)
! 1348: break;
! 1349: incomplete = 0;
! 1350: node_idx = 0;
! 1351: }
! 1352:
! 1353: #ifdef DEBUG
! 1354: assert (dfa->eclosures[node_idx].nelem != -1);
! 1355: #endif
! 1356: /* If we have already calculated, skip it. */
! 1357: if (dfa->eclosures[node_idx].nelem != 0)
! 1358: continue;
! 1359: /* Calculate epsilon closure of `node_idx'. */
! 1360: err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
! 1361: if (BE (err != REG_NOERROR, 0))
! 1362: return err;
! 1363:
! 1364: if (dfa->eclosures[node_idx].nelem == 0)
! 1365: {
! 1366: incomplete = 1;
! 1367: re_node_set_free (&eclosure_elem);
! 1368: }
! 1369: }
! 1370: return REG_NOERROR;
! 1371: }
! 1372:
! 1373: /* Calculate epsilon closure of NODE. */
! 1374:
! 1375: static reg_errcode_t
! 1376: calc_eclosure_iter (new_set, dfa, node, root)
! 1377: re_node_set *new_set;
! 1378: re_dfa_t *dfa;
! 1379: int node, root;
! 1380: {
! 1381: reg_errcode_t err;
! 1382: unsigned int constraint;
! 1383: int i, incomplete;
! 1384: re_node_set eclosure;
! 1385: incomplete = 0;
! 1386: err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
! 1387: if (BE (err != REG_NOERROR, 0))
! 1388: return err;
! 1389:
! 1390: /* This indicates that we are calculating this node now.
! 1391: We reference this value to avoid infinite loop. */
! 1392: dfa->eclosures[node].nelem = -1;
! 1393:
! 1394: constraint = ((dfa->nodes[node].type == ANCHOR)
! 1395: ? dfa->nodes[node].opr.ctx_type : 0);
! 1396: /* If the current node has constraints, duplicate all nodes.
! 1397: Since they must inherit the constraints. */
! 1398: if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
! 1399: {
! 1400: int org_node, cur_node;
! 1401: org_node = cur_node = node;
! 1402: err = duplicate_node_closure (dfa, node, node, node, constraint);
! 1403: if (BE (err != REG_NOERROR, 0))
! 1404: return err;
! 1405: }
! 1406:
! 1407: /* Expand each epsilon destination nodes. */
! 1408: if (IS_EPSILON_NODE(dfa->nodes[node].type))
! 1409: for (i = 0; i < dfa->edests[node].nelem; ++i)
! 1410: {
! 1411: re_node_set eclosure_elem;
! 1412: int edest = dfa->edests[node].elems[i];
! 1413: /* If calculating the epsilon closure of `edest' is in progress,
! 1414: return intermediate result. */
! 1415: if (dfa->eclosures[edest].nelem == -1)
! 1416: {
! 1417: incomplete = 1;
! 1418: continue;
! 1419: }
! 1420: /* If we haven't calculated the epsilon closure of `edest' yet,
! 1421: calculate now. Otherwise use calculated epsilon closure. */
! 1422: if (dfa->eclosures[edest].nelem == 0)
! 1423: {
! 1424: err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
! 1425: if (BE (err != REG_NOERROR, 0))
! 1426: return err;
! 1427: }
! 1428: else
! 1429: eclosure_elem = dfa->eclosures[edest];
! 1430: /* Merge the epsilon closure of `edest'. */
! 1431: re_node_set_merge (&eclosure, &eclosure_elem);
! 1432: /* If the epsilon closure of `edest' is incomplete,
! 1433: the epsilon closure of this node is also incomplete. */
! 1434: if (dfa->eclosures[edest].nelem == 0)
! 1435: {
! 1436: incomplete = 1;
! 1437: re_node_set_free (&eclosure_elem);
! 1438: }
! 1439: }
! 1440:
! 1441: /* Epsilon closures include itself. */
! 1442: re_node_set_insert (&eclosure, node);
! 1443: if (incomplete && !root)
! 1444: dfa->eclosures[node].nelem = 0;
! 1445: else
! 1446: dfa->eclosures[node] = eclosure;
! 1447: *new_set = eclosure;
! 1448: return REG_NOERROR;
! 1449: }
! 1450:
! 1451: /* Functions for token which are used in the parser. */
! 1452:
! 1453: /* Fetch a token from INPUT.
! 1454: We must not use this function inside bracket expressions. */
! 1455:
! 1456: static re_token_t
! 1457: fetch_token (input, syntax)
! 1458: re_string_t *input;
! 1459: reg_syntax_t syntax;
! 1460: {
! 1461: re_token_t token;
! 1462: int consumed_byte;
! 1463: consumed_byte = peek_token (&token, input, syntax);
! 1464: re_string_skip_bytes (input, consumed_byte);
! 1465: return token;
! 1466: }
! 1467:
! 1468: /* Peek a token from INPUT, and return the length of the token.
! 1469: We must not use this function inside bracket expressions. */
! 1470:
! 1471: static int
! 1472: peek_token (token, input, syntax)
! 1473: re_token_t *token;
! 1474: re_string_t *input;
! 1475: reg_syntax_t syntax;
! 1476: {
! 1477: unsigned char c;
! 1478:
! 1479: if (re_string_eoi (input))
! 1480: {
! 1481: token->type = END_OF_RE;
! 1482: return 0;
! 1483: }
! 1484:
! 1485: c = re_string_peek_byte (input, 0);
! 1486: token->opr.c = c;
! 1487:
! 1488: #ifdef RE_ENABLE_I18N
! 1489: token->mb_partial = 0;
! 1490: if (MB_CUR_MAX > 1 &&
! 1491: !re_string_first_byte (input, re_string_cur_idx (input)))
! 1492: {
! 1493: token->type = CHARACTER;
! 1494: token->mb_partial = 1;
! 1495: return 1;
! 1496: }
! 1497: #endif
! 1498: if (c == '\\')
! 1499: {
! 1500: unsigned char c2;
! 1501: if (re_string_cur_idx (input) + 1 >= re_string_length (input))
! 1502: {
! 1503: token->type = BACK_SLASH;
! 1504: return 1;
! 1505: }
! 1506:
! 1507: c2 = re_string_peek_byte_case (input, 1);
! 1508: token->opr.c = c2;
! 1509: token->type = CHARACTER;
! 1510: switch (c2)
! 1511: {
! 1512: case '|':
! 1513: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
! 1514: token->type = OP_ALT;
! 1515: break;
! 1516: case '1': case '2': case '3': case '4': case '5':
! 1517: case '6': case '7': case '8': case '9':
! 1518: if (!(syntax & RE_NO_BK_REFS))
! 1519: {
! 1520: token->type = OP_BACK_REF;
! 1521: token->opr.idx = c2 - '0';
! 1522: }
! 1523: break;
! 1524: case '<':
! 1525: if (!(syntax & RE_NO_GNU_OPS))
! 1526: {
! 1527: token->type = ANCHOR;
! 1528: token->opr.idx = WORD_FIRST;
! 1529: }
! 1530: break;
! 1531: case '>':
! 1532: if (!(syntax & RE_NO_GNU_OPS))
! 1533: {
! 1534: token->type = ANCHOR;
! 1535: token->opr.idx = WORD_LAST;
! 1536: }
! 1537: break;
! 1538: case 'b':
! 1539: if (!(syntax & RE_NO_GNU_OPS))
! 1540: {
! 1541: token->type = ANCHOR;
! 1542: token->opr.idx = WORD_DELIM;
! 1543: }
! 1544: break;
! 1545: case 'B':
! 1546: if (!(syntax & RE_NO_GNU_OPS))
! 1547: {
! 1548: token->type = ANCHOR;
! 1549: token->opr.idx = INSIDE_WORD;
! 1550: }
! 1551: break;
! 1552: case 'w':
! 1553: if (!(syntax & RE_NO_GNU_OPS))
! 1554: token->type = OP_WORD;
! 1555: break;
! 1556: case 'W':
! 1557: if (!(syntax & RE_NO_GNU_OPS))
! 1558: token->type = OP_NOTWORD;
! 1559: break;
! 1560: case '`':
! 1561: if (!(syntax & RE_NO_GNU_OPS))
! 1562: {
! 1563: token->type = ANCHOR;
! 1564: token->opr.idx = BUF_FIRST;
! 1565: }
! 1566: break;
! 1567: case '\'':
! 1568: if (!(syntax & RE_NO_GNU_OPS))
! 1569: {
! 1570: token->type = ANCHOR;
! 1571: token->opr.idx = BUF_LAST;
! 1572: }
! 1573: break;
! 1574: case '(':
! 1575: if (!(syntax & RE_NO_BK_PARENS))
! 1576: token->type = OP_OPEN_SUBEXP;
! 1577: break;
! 1578: case ')':
! 1579: if (!(syntax & RE_NO_BK_PARENS))
! 1580: token->type = OP_CLOSE_SUBEXP;
! 1581: break;
! 1582: case '+':
! 1583: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
! 1584: token->type = OP_DUP_PLUS;
! 1585: break;
! 1586: case '?':
! 1587: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
! 1588: token->type = OP_DUP_QUESTION;
! 1589: break;
! 1590: case '{':
! 1591: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
! 1592: token->type = OP_OPEN_DUP_NUM;
! 1593: break;
! 1594: case '}':
! 1595: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
! 1596: token->type = OP_CLOSE_DUP_NUM;
! 1597: break;
! 1598: default:
! 1599: break;
! 1600: }
! 1601: return 2;
! 1602: }
! 1603:
! 1604: token->type = CHARACTER;
! 1605: switch (c)
! 1606: {
! 1607: case '\n':
! 1608: if (syntax & RE_NEWLINE_ALT)
! 1609: token->type = OP_ALT;
! 1610: break;
! 1611: case '|':
! 1612: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
! 1613: token->type = OP_ALT;
! 1614: break;
! 1615: case '*':
! 1616: token->type = OP_DUP_ASTERISK;
! 1617: break;
! 1618: case '+':
! 1619: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
! 1620: token->type = OP_DUP_PLUS;
! 1621: break;
! 1622: case '?':
! 1623: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
! 1624: token->type = OP_DUP_QUESTION;
! 1625: break;
! 1626: case '{':
! 1627: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
! 1628: token->type = OP_OPEN_DUP_NUM;
! 1629: break;
! 1630: case '}':
! 1631: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
! 1632: token->type = OP_CLOSE_DUP_NUM;
! 1633: break;
! 1634: case '(':
! 1635: if (syntax & RE_NO_BK_PARENS)
! 1636: token->type = OP_OPEN_SUBEXP;
! 1637: break;
! 1638: case ')':
! 1639: if (syntax & RE_NO_BK_PARENS)
! 1640: token->type = OP_CLOSE_SUBEXP;
! 1641: break;
! 1642: case '[':
! 1643: token->type = OP_OPEN_BRACKET;
! 1644: break;
! 1645: case '.':
! 1646: token->type = OP_PERIOD;
! 1647: break;
! 1648: case '^':
! 1649: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
! 1650: re_string_cur_idx (input) != 0)
! 1651: {
! 1652: char prev = re_string_peek_byte (input, -1);
! 1653: if (prev != '|' && prev != '(' &&
! 1654: (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
! 1655: break;
! 1656: }
! 1657: token->type = ANCHOR;
! 1658: token->opr.idx = LINE_FIRST;
! 1659: break;
! 1660: case '$':
! 1661: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
! 1662: re_string_cur_idx (input) + 1 != re_string_length (input))
! 1663: {
! 1664: re_token_t next;
! 1665: re_string_skip_bytes (input, 1);
! 1666: peek_token (&next, input, syntax);
! 1667: re_string_skip_bytes (input, -1);
! 1668: if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
! 1669: break;
! 1670: }
! 1671: token->type = ANCHOR;
! 1672: token->opr.idx = LINE_LAST;
! 1673: break;
! 1674: default:
! 1675: break;
! 1676: }
! 1677: return 1;
! 1678: }
! 1679:
! 1680: /* Peek a token from INPUT, and return the length of the token.
! 1681: We must not use this function out of bracket expressions. */
! 1682:
! 1683: static int
! 1684: peek_token_bracket (token, input, syntax)
! 1685: re_token_t *token;
! 1686: re_string_t *input;
! 1687: reg_syntax_t syntax;
! 1688: {
! 1689: unsigned char c;
! 1690: if (re_string_eoi (input))
! 1691: {
! 1692: token->type = END_OF_RE;
! 1693: return 0;
! 1694: }
! 1695: c = re_string_peek_byte (input, 0);
! 1696: token->opr.c = c;
! 1697:
! 1698: #ifdef RE_ENABLE_I18N
! 1699: if (MB_CUR_MAX > 1 &&
! 1700: !re_string_first_byte (input, re_string_cur_idx (input)))
! 1701: {
! 1702: token->type = CHARACTER;
! 1703: return 1;
! 1704: }
! 1705: #endif /* RE_ENABLE_I18N */
! 1706:
! 1707: if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
! 1708: {
! 1709: /* In this case, '\' escape a character. */
! 1710: unsigned char c2;
! 1711: re_string_skip_bytes (input, 1);
! 1712: c2 = re_string_peek_byte (input, 0);
! 1713: token->opr.c = c2;
! 1714: token->type = CHARACTER;
! 1715: return 1;
! 1716: }
! 1717: if (c == '[') /* '[' is a special char in a bracket exps. */
! 1718: {
! 1719: unsigned char c2;
! 1720: int token_len;
! 1721: c2 = re_string_peek_byte (input, 1);
! 1722: token->opr.c = c2;
! 1723: token_len = 2;
! 1724: switch (c2)
! 1725: {
! 1726: case '.':
! 1727: token->type = OP_OPEN_COLL_ELEM;
! 1728: break;
! 1729: case '=':
! 1730: token->type = OP_OPEN_EQUIV_CLASS;
! 1731: break;
! 1732: case ':':
! 1733: if (syntax & RE_CHAR_CLASSES)
! 1734: {
! 1735: token->type = OP_OPEN_CHAR_CLASS;
! 1736: break;
! 1737: }
! 1738: /* else fall through. */
! 1739: default:
! 1740: token->type = CHARACTER;
! 1741: token->opr.c = c;
! 1742: token_len = 1;
! 1743: break;
! 1744: }
! 1745: return token_len;
! 1746: }
! 1747: switch (c)
! 1748: {
! 1749: case '-':
! 1750: token->type = OP_CHARSET_RANGE;
! 1751: break;
! 1752: case ']':
! 1753: token->type = OP_CLOSE_BRACKET;
! 1754: break;
! 1755: case '^':
! 1756: token->type = OP_NON_MATCH_LIST;
! 1757: break;
! 1758: default:
! 1759: token->type = CHARACTER;
! 1760: }
! 1761: return 1;
! 1762: }
! 1763:
! 1764: /* Functions for parser. */
! 1765:
! 1766: /* Entry point of the parser.
! 1767: Parse the regular expression REGEXP and return the structure tree.
! 1768: If an error is occured, ERR is set by error code, and return NULL.
! 1769: This function build the following tree, from regular expression <reg_exp>:
! 1770: CAT
! 1771: / \
! 1772: / \
! 1773: <reg_exp> EOR
! 1774:
! 1775: CAT means concatenation.
! 1776: EOR means end of regular expression. */
! 1777:
! 1778: static bin_tree_t *
! 1779: parse (regexp, preg, syntax, err)
! 1780: re_string_t *regexp;
! 1781: regex_t *preg;
! 1782: reg_syntax_t syntax;
! 1783: reg_errcode_t *err;
! 1784: {
! 1785: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 1786: bin_tree_t *tree, *eor, *root;
! 1787: re_token_t current_token;
! 1788: int new_idx;
! 1789: current_token = fetch_token (regexp, syntax);
! 1790: tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
! 1791: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 1792: return NULL;
! 1793: new_idx = re_dfa_add_node (dfa, current_token, 0);
! 1794: eor = create_tree (NULL, NULL, 0, new_idx);
! 1795: if (tree != NULL)
! 1796: root = create_tree (tree, eor, CONCAT, 0);
! 1797: else
! 1798: root = eor;
! 1799: if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
! 1800: {
! 1801: *err = REG_ESPACE;
! 1802: return NULL;
! 1803: }
! 1804: return root;
! 1805: }
! 1806:
! 1807: /* This function build the following tree, from regular expression
! 1808: <branch1>|<branch2>:
! 1809: ALT
! 1810: / \
! 1811: / \
! 1812: <branch1> <branch2>
! 1813:
! 1814: ALT means alternative, which represents the operator `|'. */
! 1815:
! 1816: static bin_tree_t *
! 1817: parse_reg_exp (regexp, preg, token, syntax, nest, err)
! 1818: re_string_t *regexp;
! 1819: regex_t *preg;
! 1820: re_token_t *token;
! 1821: reg_syntax_t syntax;
! 1822: int nest;
! 1823: reg_errcode_t *err;
! 1824: {
! 1825: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 1826: bin_tree_t *tree, *branch = NULL;
! 1827: int new_idx;
! 1828: tree = parse_branch (regexp, preg, token, syntax, nest, err);
! 1829: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 1830: return NULL;
! 1831:
! 1832: while (token->type == OP_ALT)
! 1833: {
! 1834: re_token_t alt_token = *token;
! 1835: new_idx = re_dfa_add_node (dfa, alt_token, 0);
! 1836: *token = fetch_token (regexp, syntax);
! 1837: if (token->type != OP_ALT && token->type != END_OF_RE
! 1838: && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
! 1839: {
! 1840: branch = parse_branch (regexp, preg, token, syntax, nest, err);
! 1841: if (BE (*err != REG_NOERROR && branch == NULL, 0))
! 1842: {
! 1843: free_bin_tree (tree);
! 1844: return NULL;
! 1845: }
! 1846: }
! 1847: else
! 1848: branch = NULL;
! 1849: tree = create_tree (tree, branch, 0, new_idx);
! 1850: if (BE (new_idx == -1 || tree == NULL, 0))
! 1851: {
! 1852: *err = REG_ESPACE;
! 1853: return NULL;
! 1854: }
! 1855: dfa->has_plural_match = 1;
! 1856: }
! 1857: return tree;
! 1858: }
! 1859:
! 1860: /* This function build the following tree, from regular expression
! 1861: <exp1><exp2>:
! 1862: CAT
! 1863: / \
! 1864: / \
! 1865: <exp1> <exp2>
! 1866:
! 1867: CAT means concatenation. */
! 1868:
! 1869: static bin_tree_t *
! 1870: parse_branch (regexp, preg, token, syntax, nest, err)
! 1871: re_string_t *regexp;
! 1872: regex_t *preg;
! 1873: re_token_t *token;
! 1874: reg_syntax_t syntax;
! 1875: int nest;
! 1876: reg_errcode_t *err;
! 1877: {
! 1878: bin_tree_t *tree, *exp;
! 1879: tree = parse_expression (regexp, preg, token, syntax, nest, err);
! 1880: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 1881: return NULL;
! 1882:
! 1883: while (token->type != OP_ALT && token->type != END_OF_RE
! 1884: && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
! 1885: {
! 1886: exp = parse_expression (regexp, preg, token, syntax, nest, err);
! 1887: if (BE (*err != REG_NOERROR && exp == NULL, 0))
! 1888: {
! 1889: free_bin_tree (tree);
! 1890: return NULL;
! 1891: }
! 1892: if (tree != NULL && exp != NULL)
! 1893: {
! 1894: tree = create_tree (tree, exp, CONCAT, 0);
! 1895: if (tree == NULL)
! 1896: {
! 1897: *err = REG_ESPACE;
! 1898: return NULL;
! 1899: }
! 1900: }
! 1901: else if (tree == NULL)
! 1902: tree = exp;
! 1903: /* Otherwise exp == NULL, we don't need to create new tree. */
! 1904: }
! 1905: return tree;
! 1906: }
! 1907:
! 1908: /* This function build the following tree, from regular expression a*:
! 1909: *
! 1910: |
! 1911: a
! 1912: */
! 1913:
! 1914: static bin_tree_t *
! 1915: parse_expression (regexp, preg, token, syntax, nest, err)
! 1916: re_string_t *regexp;
! 1917: regex_t *preg;
! 1918: re_token_t *token;
! 1919: reg_syntax_t syntax;
! 1920: int nest;
! 1921: reg_errcode_t *err;
! 1922: {
! 1923: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 1924: bin_tree_t *tree;
! 1925: int new_idx;
! 1926: switch (token->type)
! 1927: {
! 1928: case CHARACTER:
! 1929: new_idx = re_dfa_add_node (dfa, *token, 0);
! 1930: tree = create_tree (NULL, NULL, 0, new_idx);
! 1931: if (BE (new_idx == -1 || tree == NULL, 0))
! 1932: {
! 1933: *err = REG_ESPACE;
! 1934: return NULL;
! 1935: }
! 1936: #ifdef RE_ENABLE_I18N
! 1937: if (MB_CUR_MAX > 1)
! 1938: {
! 1939: while (!re_string_eoi (regexp)
! 1940: && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
! 1941: {
! 1942: bin_tree_t *mbc_remain;
! 1943: *token = fetch_token (regexp, syntax);
! 1944: new_idx = re_dfa_add_node (dfa, *token, 0);
! 1945: mbc_remain = create_tree (NULL, NULL, 0, new_idx);
! 1946: tree = create_tree (tree, mbc_remain, CONCAT, 0);
! 1947: if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
! 1948: {
! 1949: *err = REG_ESPACE;
! 1950: return NULL;
! 1951: }
! 1952: }
! 1953: }
! 1954: #endif
! 1955: break;
! 1956: case OP_OPEN_SUBEXP:
! 1957: tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
! 1958: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 1959: return NULL;
! 1960: break;
! 1961: case OP_OPEN_BRACKET:
! 1962: tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
! 1963: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 1964: return NULL;
! 1965: break;
! 1966: case OP_BACK_REF:
! 1967: if (BE (preg->re_nsub < token->opr.idx
! 1968: || dfa->subexps[token->opr.idx - 1].end == -1, 0))
! 1969: {
! 1970: *err = REG_ESUBREG;
! 1971: return NULL;
! 1972: }
! 1973: dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
! 1974: new_idx = re_dfa_add_node (dfa, *token, 0);
! 1975: tree = create_tree (NULL, NULL, 0, new_idx);
! 1976: if (BE (new_idx == -1 || tree == NULL, 0))
! 1977: {
! 1978: *err = REG_ESPACE;
! 1979: return NULL;
! 1980: }
! 1981: ++dfa->nbackref;
! 1982: dfa->has_mb_node = 1;
! 1983: break;
! 1984: case OP_DUP_ASTERISK:
! 1985: case OP_DUP_PLUS:
! 1986: case OP_DUP_QUESTION:
! 1987: case OP_OPEN_DUP_NUM:
! 1988: if (syntax & RE_CONTEXT_INVALID_OPS)
! 1989: {
! 1990: *err = REG_BADRPT;
! 1991: return NULL;
! 1992: }
! 1993: else if (syntax & RE_CONTEXT_INDEP_OPS)
! 1994: {
! 1995: *token = fetch_token (regexp, syntax);
! 1996: return parse_expression (regexp, preg, token, syntax, nest, err);
! 1997: }
! 1998: /* else fall through */
! 1999: case OP_CLOSE_SUBEXP:
! 2000: if ((token->type == OP_CLOSE_SUBEXP) &&
! 2001: !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
! 2002: {
! 2003: *err = REG_ERPAREN;
! 2004: return NULL;
! 2005: }
! 2006: /* else fall through */
! 2007: case OP_CLOSE_DUP_NUM:
! 2008: /* We treat it as a normal character. */
! 2009:
! 2010: /* Then we can these characters as normal characters. */
! 2011: token->type = CHARACTER;
! 2012: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2013: tree = create_tree (NULL, NULL, 0, new_idx);
! 2014: if (BE (new_idx == -1 || tree == NULL, 0))
! 2015: {
! 2016: *err = REG_ESPACE;
! 2017: return NULL;
! 2018: }
! 2019: break;
! 2020: case ANCHOR:
! 2021: if (dfa->word_char == NULL)
! 2022: {
! 2023: *err = init_word_char (dfa);
! 2024: if (BE (*err != REG_NOERROR, 0))
! 2025: return NULL;
! 2026: }
! 2027: if (token->opr.ctx_type == WORD_DELIM)
! 2028: {
! 2029: bin_tree_t *tree_first, *tree_last;
! 2030: int idx_first, idx_last;
! 2031: token->opr.ctx_type = WORD_FIRST;
! 2032: idx_first = re_dfa_add_node (dfa, *token, 0);
! 2033: tree_first = create_tree (NULL, NULL, 0, idx_first);
! 2034: token->opr.ctx_type = WORD_LAST;
! 2035: idx_last = re_dfa_add_node (dfa, *token, 0);
! 2036: tree_last = create_tree (NULL, NULL, 0, idx_last);
! 2037: token->type = OP_ALT;
! 2038: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2039: tree = create_tree (tree_first, tree_last, 0, new_idx);
! 2040: if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
! 2041: || tree_first == NULL || tree_last == NULL
! 2042: || tree == NULL, 0))
! 2043: {
! 2044: *err = REG_ESPACE;
! 2045: return NULL;
! 2046: }
! 2047: }
! 2048: else
! 2049: {
! 2050: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2051: tree = create_tree (NULL, NULL, 0, new_idx);
! 2052: if (BE (new_idx == -1 || tree == NULL, 0))
! 2053: {
! 2054: *err = REG_ESPACE;
! 2055: return NULL;
! 2056: }
! 2057: }
! 2058: /* We must return here, since ANCHORs can't be followed
! 2059: by repetition operators.
! 2060: eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
! 2061: it must not be "<ANCHOR(^)><REPEAT(*)>". */
! 2062: *token = fetch_token (regexp, syntax);
! 2063: return tree;
! 2064: case OP_PERIOD:
! 2065: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2066: tree = create_tree (NULL, NULL, 0, new_idx);
! 2067: if (BE (new_idx == -1 || tree == NULL, 0))
! 2068: {
! 2069: *err = REG_ESPACE;
! 2070: return NULL;
! 2071: }
! 2072: if (MB_CUR_MAX > 1)
! 2073: dfa->has_mb_node = 1;
! 2074: break;
! 2075: case OP_WORD:
! 2076: tree = build_word_op (dfa, 0, err);
! 2077: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 2078: return NULL;
! 2079: break;
! 2080: case OP_NOTWORD:
! 2081: tree = build_word_op (dfa, 1, err);
! 2082: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 2083: return NULL;
! 2084: break;
! 2085: case OP_ALT:
! 2086: case END_OF_RE:
! 2087: return NULL;
! 2088: case BACK_SLASH:
! 2089: *err = REG_EESCAPE;
! 2090: return NULL;
! 2091: default:
! 2092: /* Must not happen? */
! 2093: #ifdef DEBUG
! 2094: assert (0);
! 2095: #endif
! 2096: return NULL;
! 2097: }
! 2098: *token = fetch_token (regexp, syntax);
! 2099:
! 2100: while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
! 2101: || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
! 2102: {
! 2103: tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
! 2104: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 2105: return NULL;
! 2106: dfa->has_plural_match = 1;
! 2107: }
! 2108:
! 2109: return tree;
! 2110: }
! 2111:
! 2112: /* This function build the following tree, from regular expression
! 2113: (<reg_exp>):
! 2114: SUBEXP
! 2115: |
! 2116: <reg_exp>
! 2117: */
! 2118:
! 2119: static bin_tree_t *
! 2120: parse_sub_exp (regexp, preg, token, syntax, nest, err)
! 2121: re_string_t *regexp;
! 2122: regex_t *preg;
! 2123: re_token_t *token;
! 2124: reg_syntax_t syntax;
! 2125: int nest;
! 2126: reg_errcode_t *err;
! 2127: {
! 2128: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
! 2129: bin_tree_t *tree, *left_par, *right_par;
! 2130: size_t cur_nsub;
! 2131: int new_idx;
! 2132: cur_nsub = preg->re_nsub++;
! 2133: if (dfa->subexps_alloc < preg->re_nsub)
! 2134: {
! 2135: re_subexp_t *new_array;
! 2136: dfa->subexps_alloc *= 2;
! 2137: new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
! 2138: if (BE (new_array == NULL, 0))
! 2139: {
! 2140: dfa->subexps_alloc /= 2;
! 2141: *err = REG_ESPACE;
! 2142: return NULL;
! 2143: }
! 2144: dfa->subexps = new_array;
! 2145: }
! 2146: dfa->subexps[cur_nsub].start = dfa->nodes_len;
! 2147: dfa->subexps[cur_nsub].end = -1;
! 2148:
! 2149: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2150: left_par = create_tree (NULL, NULL, 0, new_idx);
! 2151: if (BE (new_idx == -1 || left_par == NULL, 0))
! 2152: {
! 2153: *err = REG_ESPACE;
! 2154: return NULL;
! 2155: }
! 2156: dfa->nodes[new_idx].opr.idx = cur_nsub;
! 2157: *token = fetch_token (regexp, syntax);
! 2158:
! 2159: /* The subexpression may be a null string. */
! 2160: if (token->type == OP_CLOSE_SUBEXP)
! 2161: tree = NULL;
! 2162: else
! 2163: {
! 2164: tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
! 2165: if (BE (*err != REG_NOERROR && tree == NULL, 0))
! 2166: return NULL;
! 2167: }
! 2168: if (BE (token->type != OP_CLOSE_SUBEXP, 0))
! 2169: {
! 2170: free_bin_tree (tree);
! 2171: *err = REG_BADPAT;
! 2172: return NULL;
! 2173: }
! 2174: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2175: dfa->subexps[cur_nsub].end = dfa->nodes_len;
! 2176: right_par = create_tree (NULL, NULL, 0, new_idx);
! 2177: tree = ((tree == NULL) ? right_par
! 2178: : create_tree (tree, right_par, CONCAT, 0));
! 2179: tree = create_tree (left_par, tree, CONCAT, 0);
! 2180: if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
! 2181: {
! 2182: *err = REG_ESPACE;
! 2183: return NULL;
! 2184: }
! 2185: dfa->nodes[new_idx].opr.idx = cur_nsub;
! 2186:
! 2187: return tree;
! 2188: }
! 2189:
! 2190: /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
! 2191:
! 2192: static bin_tree_t *
! 2193: parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
! 2194: bin_tree_t *dup_elem;
! 2195: re_string_t *regexp;
! 2196: re_dfa_t *dfa;
! 2197: re_token_t *token;
! 2198: reg_syntax_t syntax;
! 2199: reg_errcode_t *err;
! 2200: {
! 2201: re_token_t dup_token;
! 2202: bin_tree_t *tree = dup_elem, *work_tree;
! 2203: int new_idx, start_idx = re_string_cur_idx (regexp);
! 2204: re_token_t start_token = *token;
! 2205: if (token->type == OP_OPEN_DUP_NUM)
! 2206: {
! 2207: int i;
! 2208: int end = 0;
! 2209: int start = fetch_number (regexp, token, syntax);
! 2210: bin_tree_t *elem;
! 2211: if (start == -1)
! 2212: {
! 2213: if (token->type == CHARACTER && token->opr.c == ',')
! 2214: start = 0; /* We treat "{,m}" as "{0,m}". */
! 2215: else
! 2216: {
! 2217: *err = REG_BADBR; /* <re>{} is invalid. */
! 2218: return NULL;
! 2219: }
! 2220: }
! 2221: if (BE (start != -2, 1))
! 2222: {
! 2223: /* We treat "{n}" as "{n,n}". */
! 2224: end = ((token->type == OP_CLOSE_DUP_NUM) ? start
! 2225: : ((token->type == CHARACTER && token->opr.c == ',')
! 2226: ? fetch_number (regexp, token, syntax) : -2));
! 2227: }
! 2228: if (BE (start == -2 || end == -2, 0))
! 2229: {
! 2230: /* Invalid sequence. */
! 2231: if (token->type == OP_CLOSE_DUP_NUM)
! 2232: goto parse_dup_op_invalid_interval;
! 2233: else
! 2234: goto parse_dup_op_ebrace;
! 2235: }
! 2236: if (BE (start == 0 && end == 0, 0))
! 2237: {
! 2238: /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
! 2239: *token = fetch_token (regexp, syntax);
! 2240: free_bin_tree (dup_elem);
! 2241: return NULL;
! 2242: }
! 2243:
! 2244: /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
! 2245: elem = tree;
! 2246: for (i = 0; i < start; ++i)
! 2247: if (i != 0)
! 2248: {
! 2249: work_tree = duplicate_tree (elem, dfa);
! 2250: tree = create_tree (tree, work_tree, CONCAT, 0);
! 2251: if (BE (work_tree == NULL || tree == NULL, 0))
! 2252: goto parse_dup_op_espace;
! 2253: }
! 2254:
! 2255: if (end == -1)
! 2256: {
! 2257: /* We treat "<re>{0,}" as "<re>*". */
! 2258: dup_token.type = OP_DUP_ASTERISK;
! 2259: if (start > 0)
! 2260: {
! 2261: elem = duplicate_tree (elem, dfa);
! 2262: new_idx = re_dfa_add_node (dfa, dup_token, 0);
! 2263: work_tree = create_tree (elem, NULL, 0, new_idx);
! 2264: tree = create_tree (tree, work_tree, CONCAT, 0);
! 2265: if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
! 2266: || tree == NULL, 0))
! 2267: goto parse_dup_op_espace;
! 2268: }
! 2269: else
! 2270: {
! 2271: new_idx = re_dfa_add_node (dfa, dup_token, 0);
! 2272: tree = create_tree (elem, NULL, 0, new_idx);
! 2273: if (BE (new_idx == -1 || tree == NULL, 0))
! 2274: goto parse_dup_op_espace;
! 2275: }
! 2276: }
! 2277: else if (end - start > 0)
! 2278: {
! 2279: /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
! 2280: dup_token.type = OP_DUP_QUESTION;
! 2281: if (start > 0)
! 2282: {
! 2283: elem = duplicate_tree (elem, dfa);
! 2284: new_idx = re_dfa_add_node (dfa, dup_token, 0);
! 2285: elem = create_tree (elem, NULL, 0, new_idx);
! 2286: tree = create_tree (tree, elem, CONCAT, 0);
! 2287: if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
! 2288: goto parse_dup_op_espace;
! 2289: }
! 2290: else
! 2291: {
! 2292: new_idx = re_dfa_add_node (dfa, dup_token, 0);
! 2293: tree = elem = create_tree (elem, NULL, 0, new_idx);
! 2294: if (BE (new_idx == -1 || tree == NULL, 0))
! 2295: goto parse_dup_op_espace;
! 2296: }
! 2297: for (i = 1; i < end - start; ++i)
! 2298: {
! 2299: work_tree = duplicate_tree (elem, dfa);
! 2300: tree = create_tree (tree, work_tree, CONCAT, 0);
! 2301: if (BE (work_tree == NULL || tree == NULL, 0))
! 2302: {
! 2303: *err = REG_ESPACE;
! 2304: return NULL;
! 2305: }
! 2306: }
! 2307: }
! 2308: }
! 2309: else
! 2310: {
! 2311: new_idx = re_dfa_add_node (dfa, *token, 0);
! 2312: tree = create_tree (tree, NULL, 0, new_idx);
! 2313: if (BE (new_idx == -1 || tree == NULL, 0))
! 2314: {
! 2315: *err = REG_ESPACE;
! 2316: return NULL;
! 2317: }
! 2318: }
! 2319: *token = fetch_token (regexp, syntax);
! 2320: return tree;
! 2321:
! 2322: parse_dup_op_espace:
! 2323: free_bin_tree (tree);
! 2324: *err = REG_ESPACE;
! 2325: return NULL;
! 2326:
! 2327: parse_dup_op_ebrace:
! 2328: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
! 2329: {
! 2330: *err = REG_EBRACE;
! 2331: return NULL;
! 2332: }
! 2333: goto parse_dup_op_rollback;
! 2334: parse_dup_op_invalid_interval:
! 2335: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
! 2336: {
! 2337: *err = REG_BADBR;
! 2338: return NULL;
! 2339: }
! 2340: parse_dup_op_rollback:
! 2341: re_string_set_index (regexp, start_idx);
! 2342: *token = start_token;
! 2343: token->type = CHARACTER;
! 2344: return dup_elem;
! 2345: }
! 2346:
! 2347: /* Size of the names for collating symbol/equivalence_class/character_class.
! 2348: I'm not sure, but maybe enough. */
! 2349: #define BRACKET_NAME_BUF_SIZE 32
! 2350:
! 2351: #ifndef _LIBC
! 2352: /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
! 2353: Build the range expression which starts from START_ELEM, and ends
! 2354: at END_ELEM. The result are written to MBCSET and SBCSET.
! 2355: RANGE_ALLOC is the allocated size of mbcset->range_starts, and
! 2356: mbcset->range_ends, is a pointer argument sinse we may
! 2357: update it. */
! 2358:
! 2359: static reg_errcode_t
! 2360: # ifdef RE_ENABLE_I18N
! 2361: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
! 2362: re_charset_t *mbcset;
! 2363: int *range_alloc;
! 2364: # else /* not RE_ENABLE_I18N */
! 2365: build_range_exp (sbcset, start_elem, end_elem)
! 2366: # endif /* not RE_ENABLE_I18N */
! 2367: re_bitset_ptr_t sbcset;
! 2368: bracket_elem_t *start_elem, *end_elem;
! 2369: {
! 2370: unsigned int start_ch, end_ch;
! 2371: /* Equivalence Classes and Character Classes can't be a range start/end. */
! 2372: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
! 2373: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
! 2374: 0))
! 2375: return REG_ERANGE;
! 2376:
! 2377: /* We can handle no multi character collating elements without libc
! 2378: support. */
! 2379: if (BE ((start_elem->type == COLL_SYM
! 2380: && strlen ((char *) start_elem->opr.name) > 1)
! 2381: || (end_elem->type == COLL_SYM
! 2382: && strlen ((char *) end_elem->opr.name) > 1), 0))
! 2383: return REG_ECOLLATE;
! 2384:
! 2385: # ifdef RE_ENABLE_I18N
! 2386: {
! 2387: wchar_t wc, start_wc, end_wc;
! 2388: wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
! 2389:
! 2390: start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
! 2391: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
! 2392: : 0));
! 2393: end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
! 2394: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
! 2395: : 0));
! 2396: start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
! 2397: ? __btowc (start_ch) : start_elem->opr.wch);
! 2398: end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
! 2399: ? __btowc (end_ch) : end_elem->opr.wch);
! 2400: cmp_buf[0] = start_wc;
! 2401: cmp_buf[4] = end_wc;
! 2402: if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
! 2403: return REG_ERANGE;
! 2404:
! 2405: /* Check the space of the arrays. */
! 2406: if (*range_alloc == mbcset->nranges)
! 2407: {
! 2408: /* There are not enough space, need realloc. */
! 2409: wchar_t *new_array_start, *new_array_end;
! 2410: int new_nranges;
! 2411:
! 2412: /* +1 in case of mbcset->nranges is 0. */
! 2413: new_nranges = 2 * mbcset->nranges + 1;
! 2414: /* Use realloc since mbcset->range_starts and mbcset->range_ends
! 2415: are NULL if *range_alloc == 0. */
! 2416: new_array_start = re_realloc (mbcset->range_starts, wchar_t,
! 2417: new_nranges);
! 2418: new_array_end = re_realloc (mbcset->range_ends, wchar_t,
! 2419: new_nranges);
! 2420:
! 2421: if (BE (new_array_start == NULL || new_array_end == NULL, 0))
! 2422: return REG_ESPACE;
! 2423:
! 2424: mbcset->range_starts = new_array_start;
! 2425: mbcset->range_ends = new_array_end;
! 2426: *range_alloc = new_nranges;
! 2427: }
! 2428:
! 2429: mbcset->range_starts[mbcset->nranges] = start_wc;
! 2430: mbcset->range_ends[mbcset->nranges++] = end_wc;
! 2431:
! 2432: /* Build the table for single byte characters. */
! 2433: for (wc = 0; wc <= SBC_MAX; ++wc)
! 2434: {
! 2435: cmp_buf[2] = wc;
! 2436: if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
! 2437: && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
! 2438: bitset_set (sbcset, wc);
! 2439: }
! 2440: }
! 2441: # else /* not RE_ENABLE_I18N */
! 2442: {
! 2443: unsigned int ch;
! 2444: start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
! 2445: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
! 2446: : 0));
! 2447: end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
! 2448: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
! 2449: : 0));
! 2450: if (start_ch > end_ch)
! 2451: return REG_ERANGE;
! 2452: /* Build the table for single byte characters. */
! 2453: for (ch = 0; ch <= SBC_MAX; ++ch)
! 2454: if (start_ch <= ch && ch <= end_ch)
! 2455: bitset_set (sbcset, ch);
! 2456: }
! 2457: # endif /* not RE_ENABLE_I18N */
! 2458: return REG_NOERROR;
! 2459: }
! 2460: #endif /* not _LIBC */
! 2461:
! 2462: #ifndef _LIBC
! 2463: /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
! 2464: Build the collating element which is represented by NAME.
! 2465: The result are written to MBCSET and SBCSET.
! 2466: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
! 2467: pointer argument since we may update it. */
! 2468:
! 2469: static reg_errcode_t
! 2470: # ifdef RE_ENABLE_I18N
! 2471: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
! 2472: re_charset_t *mbcset;
! 2473: int *coll_sym_alloc;
! 2474: # else /* not RE_ENABLE_I18N */
! 2475: build_collating_symbol (sbcset, name)
! 2476: # endif /* not RE_ENABLE_I18N */
! 2477: re_bitset_ptr_t sbcset;
! 2478: const unsigned char *name;
! 2479: {
! 2480: size_t name_len = strlen ((const char *) name);
! 2481: if (BE (name_len != 1, 0))
! 2482: return REG_ECOLLATE;
! 2483: else
! 2484: {
! 2485: bitset_set (sbcset, name[0]);
! 2486: return REG_NOERROR;
! 2487: }
! 2488: }
! 2489: #endif /* not _LIBC */
! 2490:
! 2491: /* This function parse bracket expression like "[abc]", "[a-c]",
! 2492: "[[.a-a.]]" etc. */
! 2493:
! 2494: static bin_tree_t *
! 2495: parse_bracket_exp (regexp, dfa, token, syntax, err)
! 2496: re_string_t *regexp;
! 2497: re_dfa_t *dfa;
! 2498: re_token_t *token;
! 2499: reg_syntax_t syntax;
! 2500: reg_errcode_t *err;
! 2501: {
! 2502: #ifdef _LIBC
! 2503: const unsigned char *collseqmb;
! 2504: const char *collseqwc;
! 2505: uint32_t nrules;
! 2506: int32_t table_size;
! 2507: const int32_t *symb_table;
! 2508: const unsigned char *extra;
! 2509:
! 2510: /* Local function for parse_bracket_exp used in _LIBC environement.
! 2511: Seek the collating symbol entry correspondings to NAME.
! 2512: Return the index of the symbol in the SYMB_TABLE. */
! 2513:
! 2514: static inline int32_t
! 2515: seek_collating_symbol_entry (name, name_len)
! 2516: const unsigned char *name;
! 2517: size_t name_len;
! 2518: {
! 2519: int32_t hash = elem_hash ((const char *) name, name_len);
! 2520: int32_t elem = hash % table_size;
! 2521: int32_t second = hash % (table_size - 2);
! 2522: while (symb_table[2 * elem] != 0)
! 2523: {
! 2524: /* First compare the hashing value. */
! 2525: if (symb_table[2 * elem] == hash
! 2526: /* Compare the length of the name. */
! 2527: && name_len == extra[symb_table[2 * elem + 1]]
! 2528: /* Compare the name. */
! 2529: && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
! 2530: name_len) == 0)
! 2531: {
! 2532: /* Yep, this is the entry. */
! 2533: break;
! 2534: }
! 2535:
! 2536: /* Next entry. */
! 2537: elem += second;
! 2538: }
! 2539: return elem;
! 2540: }
! 2541:
! 2542: /* Local function for parse_bracket_exp used in _LIBC environement.
! 2543: Look up the collation sequence value of BR_ELEM.
! 2544: Return the value if succeeded, UINT_MAX otherwise. */
! 2545:
! 2546: static inline unsigned int
! 2547: lookup_collation_sequence_value (br_elem)
! 2548: bracket_elem_t *br_elem;
! 2549: {
! 2550: if (br_elem->type == SB_CHAR)
! 2551: {
! 2552: /*
! 2553: if (MB_CUR_MAX == 1)
! 2554: */
! 2555: if (nrules == 0)
! 2556: return collseqmb[br_elem->opr.ch];
! 2557: else
! 2558: {
! 2559: wint_t wc = __btowc (br_elem->opr.ch);
! 2560: return collseq_table_lookup (collseqwc, wc);
! 2561: }
! 2562: }
! 2563: else if (br_elem->type == MB_CHAR)
! 2564: {
! 2565: return collseq_table_lookup (collseqwc, br_elem->opr.wch);
! 2566: }
! 2567: else if (br_elem->type == COLL_SYM)
! 2568: {
! 2569: size_t sym_name_len = strlen ((char *) br_elem->opr.name);
! 2570: if (nrules != 0)
! 2571: {
! 2572: int32_t elem, idx;
! 2573: elem = seek_collating_symbol_entry (br_elem->opr.name,
! 2574: sym_name_len);
! 2575: if (symb_table[2 * elem] != 0)
! 2576: {
! 2577: /* We found the entry. */
! 2578: idx = symb_table[2 * elem + 1];
! 2579: /* Skip the name of collating element name. */
! 2580: idx += 1 + extra[idx];
! 2581: /* Skip the byte sequence of the collating element. */
! 2582: idx += 1 + extra[idx];
! 2583: /* Adjust for the alignment. */
! 2584: idx = (idx + 3) & ~3;
! 2585: /* Skip the multibyte collation sequence value. */
! 2586: idx += sizeof (unsigned int);
! 2587: /* Skip the wide char sequence of the collating element. */
! 2588: idx += sizeof (unsigned int) *
! 2589: (1 + *(unsigned int *) (extra + idx));
! 2590: /* Return the collation sequence value. */
! 2591: return *(unsigned int *) (extra + idx);
! 2592: }
! 2593: else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
! 2594: {
! 2595: /* No valid character. Match it as a single byte
! 2596: character. */
! 2597: return collseqmb[br_elem->opr.name[0]];
! 2598: }
! 2599: }
! 2600: else if (sym_name_len == 1)
! 2601: return collseqmb[br_elem->opr.name[0]];
! 2602: }
! 2603: return UINT_MAX;
! 2604: }
! 2605:
! 2606: /* Local function for parse_bracket_exp used in _LIBC environement.
! 2607: Build the range expression which starts from START_ELEM, and ends
! 2608: at END_ELEM. The result are written to MBCSET and SBCSET.
! 2609: RANGE_ALLOC is the allocated size of mbcset->range_starts, and
! 2610: mbcset->range_ends, is a pointer argument sinse we may
! 2611: update it. */
! 2612:
! 2613: static inline reg_errcode_t
! 2614: # ifdef RE_ENABLE_I18N
! 2615: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
! 2616: re_charset_t *mbcset;
! 2617: int *range_alloc;
! 2618: # else /* not RE_ENABLE_I18N */
! 2619: build_range_exp (sbcset, start_elem, end_elem)
! 2620: # endif /* not RE_ENABLE_I18N */
! 2621: re_bitset_ptr_t sbcset;
! 2622: bracket_elem_t *start_elem, *end_elem;
! 2623: {
! 2624: unsigned int ch;
! 2625: uint32_t start_collseq;
! 2626: uint32_t end_collseq;
! 2627:
! 2628: # ifdef RE_ENABLE_I18N
! 2629: /* Check the space of the arrays. */
! 2630: if (*range_alloc == mbcset->nranges)
! 2631: {
! 2632: /* There are not enough space, need realloc. */
! 2633: uint32_t *new_array_start;
! 2634: uint32_t *new_array_end;
! 2635: int new_nranges;
! 2636:
! 2637: /* +1 in case of mbcset->nranges is 0. */
! 2638: new_nranges = 2 * mbcset->nranges + 1;
! 2639: /* Use realloc since mbcset->range_starts and mbcset->range_ends
! 2640: are NULL if *range_alloc == 0. */
! 2641: new_array_start = re_realloc (mbcset->range_starts, uint32_t,
! 2642: new_nranges);
! 2643: new_array_end = re_realloc (mbcset->range_ends, uint32_t,
! 2644: new_nranges);
! 2645:
! 2646: if (BE (new_array_start == NULL || new_array_end == NULL, 0))
! 2647: return REG_ESPACE;
! 2648:
! 2649: mbcset->range_starts = new_array_start;
! 2650: mbcset->range_ends = new_array_end;
! 2651: *range_alloc = new_nranges;
! 2652: }
! 2653: # endif /* RE_ENABLE_I18N */
! 2654:
! 2655: /* Equivalence Classes and Character Classes can't be a range
! 2656: start/end. */
! 2657: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
! 2658: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
! 2659: 0))
! 2660: return REG_ERANGE;
! 2661:
! 2662: start_collseq = lookup_collation_sequence_value (start_elem);
! 2663: end_collseq = lookup_collation_sequence_value (end_elem);
! 2664: /* Check start/end collation sequence values. */
! 2665: if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
! 2666: return REG_ECOLLATE;
! 2667: if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
! 2668: return REG_ERANGE;
! 2669:
! 2670: # ifdef RE_ENABLE_I18N
! 2671: /* Got valid collation sequence values, add them as a new entry. */
! 2672: mbcset->range_starts[mbcset->nranges] = start_collseq;
! 2673: mbcset->range_ends[mbcset->nranges++] = end_collseq;
! 2674: # endif /* RE_ENABLE_I18N */
! 2675:
! 2676: /* Build the table for single byte characters. */
! 2677: for (ch = 0; ch <= SBC_MAX; ch++)
! 2678: {
! 2679: uint32_t ch_collseq;
! 2680: /*
! 2681: if (MB_CUR_MAX == 1)
! 2682: */
! 2683: if (nrules == 0)
! 2684: ch_collseq = collseqmb[ch];
! 2685: else
! 2686: ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
! 2687: if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
! 2688: bitset_set (sbcset, ch);
! 2689: }
! 2690: return REG_NOERROR;
! 2691: }
! 2692:
! 2693: /* Local function for parse_bracket_exp used in _LIBC environement.
! 2694: Build the collating element which is represented by NAME.
! 2695: The result are written to MBCSET and SBCSET.
! 2696: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
! 2697: pointer argument sinse we may update it. */
! 2698:
! 2699: static inline reg_errcode_t
! 2700: # ifdef RE_ENABLE_I18N
! 2701: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
! 2702: re_charset_t *mbcset;
! 2703: int *coll_sym_alloc;
! 2704: # else /* not RE_ENABLE_I18N */
! 2705: build_collating_symbol (sbcset, name)
! 2706: # endif /* not RE_ENABLE_I18N */
! 2707: re_bitset_ptr_t sbcset;
! 2708: const unsigned char *name;
! 2709: {
! 2710: int32_t elem, idx;
! 2711: size_t name_len = strlen ((const char *) name);
! 2712: if (nrules != 0)
! 2713: {
! 2714: elem = seek_collating_symbol_entry (name, name_len);
! 2715: if (symb_table[2 * elem] != 0)
! 2716: {
! 2717: /* We found the entry. */
! 2718: idx = symb_table[2 * elem + 1];
! 2719: /* Skip the name of collating element name. */
! 2720: idx += 1 + extra[idx];
! 2721: }
! 2722: else if (symb_table[2 * elem] == 0 && name_len == 1)
! 2723: {
! 2724: /* No valid character, treat it as a normal
! 2725: character. */
! 2726: bitset_set (sbcset, name[0]);
! 2727: return REG_NOERROR;
! 2728: }
! 2729: else
! 2730: return REG_ECOLLATE;
! 2731:
! 2732: # ifdef RE_ENABLE_I18N
! 2733: /* Got valid collation sequence, add it as a new entry. */
! 2734: /* Check the space of the arrays. */
! 2735: if (*coll_sym_alloc == mbcset->ncoll_syms)
! 2736: {
! 2737: /* Not enough, realloc it. */
! 2738: /* +1 in case of mbcset->ncoll_syms is 0. */
! 2739: *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
! 2740: /* Use realloc since mbcset->coll_syms is NULL
! 2741: if *alloc == 0. */
! 2742: mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
! 2743: *coll_sym_alloc);
! 2744: if (BE (mbcset->coll_syms == NULL, 0))
! 2745: return REG_ESPACE;
! 2746: }
! 2747: mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
! 2748: # endif /* RE_ENABLE_I18N */
! 2749: return REG_NOERROR;
! 2750: }
! 2751: else
! 2752: {
! 2753: if (BE (name_len != 1, 0))
! 2754: return REG_ECOLLATE;
! 2755: else
! 2756: {
! 2757: bitset_set (sbcset, name[0]);
! 2758: return REG_NOERROR;
! 2759: }
! 2760: }
! 2761: }
! 2762: #endif
! 2763:
! 2764: re_token_t br_token;
! 2765: re_bitset_ptr_t sbcset;
! 2766: #ifdef RE_ENABLE_I18N
! 2767: re_charset_t *mbcset;
! 2768: int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
! 2769: int equiv_class_alloc = 0, char_class_alloc = 0;
! 2770: #else /* not RE_ENABLE_I18N */
! 2771: int non_match = 0;
! 2772: #endif /* not RE_ENABLE_I18N */
! 2773: bin_tree_t *work_tree;
! 2774: int token_len, new_idx;
! 2775: #ifdef _LIBC
! 2776: collseqmb = (const unsigned char *)
! 2777: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
! 2778: nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
! 2779: if (nrules)
! 2780: {
! 2781: /*
! 2782: if (MB_CUR_MAX > 1)
! 2783: */
! 2784: collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
! 2785: table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
! 2786: symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
! 2787: _NL_COLLATE_SYMB_TABLEMB);
! 2788: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
! 2789: _NL_COLLATE_SYMB_EXTRAMB);
! 2790: }
! 2791: #endif
! 2792: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
! 2793: #ifdef RE_ENABLE_I18N
! 2794: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
! 2795: #endif /* RE_ENABLE_I18N */
! 2796: #ifdef RE_ENABLE_I18N
! 2797: if (BE (sbcset == NULL || mbcset == NULL, 0))
! 2798: #else
! 2799: if (BE (sbcset == NULL, 0))
! 2800: #endif /* RE_ENABLE_I18N */
! 2801: {
! 2802: *err = REG_ESPACE;
! 2803: return NULL;
! 2804: }
! 2805:
! 2806: token_len = peek_token_bracket (token, regexp, syntax);
! 2807: if (BE (token->type == END_OF_RE, 0))
! 2808: {
! 2809: *err = REG_BADPAT;
! 2810: goto parse_bracket_exp_free_return;
! 2811: }
! 2812: if (token->type == OP_NON_MATCH_LIST)
! 2813: {
! 2814: #ifdef RE_ENABLE_I18N
! 2815: int i;
! 2816: mbcset->non_match = 1;
! 2817: #else /* not RE_ENABLE_I18N */
! 2818: non_match = 1;
! 2819: #endif /* not RE_ENABLE_I18N */
! 2820: if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
! 2821: bitset_set (sbcset, '\0');
! 2822: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
! 2823: token_len = peek_token_bracket (token, regexp, syntax);
! 2824: if (BE (token->type == END_OF_RE, 0))
! 2825: {
! 2826: *err = REG_BADPAT;
! 2827: goto parse_bracket_exp_free_return;
! 2828: }
! 2829: #ifdef RE_ENABLE_I18N
! 2830: if (MB_CUR_MAX > 1)
! 2831: for (i = 0; i < SBC_MAX; ++i)
! 2832: if (__btowc (i) == WEOF)
! 2833: bitset_set (sbcset, i);
! 2834: #endif /* RE_ENABLE_I18N */
! 2835: }
! 2836:
! 2837: /* We treat the first ']' as a normal character. */
! 2838: if (token->type == OP_CLOSE_BRACKET)
! 2839: token->type = CHARACTER;
! 2840:
! 2841: while (1)
! 2842: {
! 2843: bracket_elem_t start_elem, end_elem;
! 2844: unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
! 2845: unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
! 2846: reg_errcode_t ret;
! 2847: int token_len2 = 0, is_range_exp = 0;
! 2848: re_token_t token2;
! 2849:
! 2850: start_elem.opr.name = start_name_buf;
! 2851: ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
! 2852: syntax);
! 2853: if (BE (ret != REG_NOERROR, 0))
! 2854: {
! 2855: *err = ret;
! 2856: goto parse_bracket_exp_free_return;
! 2857: }
! 2858:
! 2859: token_len = peek_token_bracket (token, regexp, syntax);
! 2860: if (BE (token->type == END_OF_RE, 0))
! 2861: {
! 2862: *err = REG_BADPAT;
! 2863: goto parse_bracket_exp_free_return;
! 2864: }
! 2865: if (token->type == OP_CHARSET_RANGE)
! 2866: {
! 2867: re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
! 2868: token_len2 = peek_token_bracket (&token2, regexp, syntax);
! 2869: if (BE (token->type == END_OF_RE, 0))
! 2870: {
! 2871: *err = REG_BADPAT;
! 2872: goto parse_bracket_exp_free_return;
! 2873: }
! 2874: if (token2.type == OP_CLOSE_BRACKET)
! 2875: {
! 2876: /* We treat the last '-' as a normal character. */
! 2877: re_string_skip_bytes (regexp, -token_len);
! 2878: token->type = CHARACTER;
! 2879: }
! 2880: else
! 2881: is_range_exp = 1;
! 2882: }
! 2883:
! 2884: if (is_range_exp == 1)
! 2885: {
! 2886: end_elem.opr.name = end_name_buf;
! 2887: ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
! 2888: dfa, syntax);
! 2889: if (BE (ret != REG_NOERROR, 0))
! 2890: {
! 2891: *err = ret;
! 2892: goto parse_bracket_exp_free_return;
! 2893: }
! 2894:
! 2895: token_len = peek_token_bracket (token, regexp, syntax);
! 2896: if (BE (token->type == END_OF_RE, 0))
! 2897: {
! 2898: *err = REG_BADPAT;
! 2899: goto parse_bracket_exp_free_return;
! 2900: }
! 2901: *err = build_range_exp (sbcset,
! 2902: #ifdef RE_ENABLE_I18N
! 2903: mbcset, &range_alloc,
! 2904: #endif /* RE_ENABLE_I18N */
! 2905: &start_elem, &end_elem);
! 2906: if (BE (*err != REG_NOERROR, 0))
! 2907: goto parse_bracket_exp_free_return;
! 2908: }
! 2909: else
! 2910: {
! 2911: switch (start_elem.type)
! 2912: {
! 2913: case SB_CHAR:
! 2914: bitset_set (sbcset, start_elem.opr.ch);
! 2915: break;
! 2916: #ifdef RE_ENABLE_I18N
! 2917: case MB_CHAR:
! 2918: /* Check whether the array has enough space. */
! 2919: if (mbchar_alloc == mbcset->nmbchars)
! 2920: {
! 2921: /* Not enough, realloc it. */
! 2922: /* +1 in case of mbcset->nmbchars is 0. */
! 2923: mbchar_alloc = 2 * mbcset->nmbchars + 1;
! 2924: /* Use realloc since array is NULL if *alloc == 0. */
! 2925: mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
! 2926: mbchar_alloc);
! 2927: if (BE (mbcset->mbchars == NULL, 0))
! 2928: goto parse_bracket_exp_espace;
! 2929: }
! 2930: mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
! 2931: break;
! 2932: #endif /* RE_ENABLE_I18N */
! 2933: case EQUIV_CLASS:
! 2934: *err = build_equiv_class (sbcset,
! 2935: #ifdef RE_ENABLE_I18N
! 2936: mbcset, &equiv_class_alloc,
! 2937: #endif /* RE_ENABLE_I18N */
! 2938: start_elem.opr.name);
! 2939: if (BE (*err != REG_NOERROR, 0))
! 2940: goto parse_bracket_exp_free_return;
! 2941: break;
! 2942: case COLL_SYM:
! 2943: *err = build_collating_symbol (sbcset,
! 2944: #ifdef RE_ENABLE_I18N
! 2945: mbcset, &coll_sym_alloc,
! 2946: #endif /* RE_ENABLE_I18N */
! 2947: start_elem.opr.name);
! 2948: if (BE (*err != REG_NOERROR, 0))
! 2949: goto parse_bracket_exp_free_return;
! 2950: break;
! 2951: case CHAR_CLASS:
! 2952: *err = build_charclass (sbcset,
! 2953: #ifdef RE_ENABLE_I18N
! 2954: mbcset, &char_class_alloc,
! 2955: #endif /* RE_ENABLE_I18N */
! 2956: start_elem.opr.name, syntax);
! 2957: if (BE (*err != REG_NOERROR, 0))
! 2958: goto parse_bracket_exp_free_return;
! 2959: break;
! 2960: default:
! 2961: assert (0);
! 2962: break;
! 2963: }
! 2964: }
! 2965: if (token->type == OP_CLOSE_BRACKET)
! 2966: break;
! 2967: }
! 2968:
! 2969: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
! 2970:
! 2971: /* If it is non-matching list. */
! 2972: #ifdef RE_ENABLE_I18N
! 2973: if (mbcset->non_match)
! 2974: #else /* not RE_ENABLE_I18N */
! 2975: if (non_match)
! 2976: #endif /* not RE_ENABLE_I18N */
! 2977: bitset_not (sbcset);
! 2978:
! 2979: /* Build a tree for simple bracket. */
! 2980: br_token.type = SIMPLE_BRACKET;
! 2981: br_token.opr.sbcset = sbcset;
! 2982: new_idx = re_dfa_add_node (dfa, br_token, 0);
! 2983: work_tree = create_tree (NULL, NULL, 0, new_idx);
! 2984: if (BE (new_idx == -1 || work_tree == NULL, 0))
! 2985: goto parse_bracket_exp_espace;
! 2986:
! 2987: #ifdef RE_ENABLE_I18N
! 2988: if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
! 2989: || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
! 2990: || mbcset->non_match)))
! 2991: {
! 2992: re_token_t alt_token;
! 2993: bin_tree_t *mbc_tree;
! 2994: /* Build a tree for complex bracket. */
! 2995: br_token.type = COMPLEX_BRACKET;
! 2996: br_token.opr.mbcset = mbcset;
! 2997: dfa->has_mb_node = 1;
! 2998: new_idx = re_dfa_add_node (dfa, br_token, 0);
! 2999: mbc_tree = create_tree (NULL, NULL, 0, new_idx);
! 3000: if (BE (new_idx == -1 || mbc_tree == NULL, 0))
! 3001: goto parse_bracket_exp_espace;
! 3002: /* Then join them by ALT node. */
! 3003: dfa->has_plural_match = 1;
! 3004: alt_token.type = OP_ALT;
! 3005: new_idx = re_dfa_add_node (dfa, alt_token, 0);
! 3006: work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
! 3007: if (BE (new_idx != -1 && mbc_tree != NULL, 1))
! 3008: return work_tree;
! 3009: }
! 3010: else
! 3011: {
! 3012: free_charset (mbcset);
! 3013: return work_tree;
! 3014: }
! 3015: #else /* not RE_ENABLE_I18N */
! 3016: return work_tree;
! 3017: #endif /* not RE_ENABLE_I18N */
! 3018:
! 3019: parse_bracket_exp_espace:
! 3020: *err = REG_ESPACE;
! 3021: parse_bracket_exp_free_return:
! 3022: re_free (sbcset);
! 3023: #ifdef RE_ENABLE_I18N
! 3024: free_charset (mbcset);
! 3025: #endif /* RE_ENABLE_I18N */
! 3026: return NULL;
! 3027: }
! 3028:
! 3029: /* Parse an element in the bracket expression. */
! 3030:
! 3031: static reg_errcode_t
! 3032: parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
! 3033: bracket_elem_t *elem;
! 3034: re_string_t *regexp;
! 3035: re_token_t *token;
! 3036: int token_len;
! 3037: re_dfa_t *dfa;
! 3038: reg_syntax_t syntax;
! 3039: {
! 3040: #ifdef RE_ENABLE_I18N
! 3041: int cur_char_size;
! 3042: cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
! 3043: if (cur_char_size > 1)
! 3044: {
! 3045: elem->type = MB_CHAR;
! 3046: elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
! 3047: re_string_skip_bytes (regexp, cur_char_size);
! 3048: return REG_NOERROR;
! 3049: }
! 3050: #endif /* RE_ENABLE_I18N */
! 3051: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
! 3052: if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
! 3053: || token->type == OP_OPEN_EQUIV_CLASS)
! 3054: return parse_bracket_symbol (elem, regexp, token);
! 3055: elem->type = SB_CHAR;
! 3056: elem->opr.ch = token->opr.c;
! 3057: return REG_NOERROR;
! 3058: }
! 3059:
! 3060: /* Parse a bracket symbol in the bracket expression. Bracket symbols are
! 3061: such as [:<character_class>:], [.<collating_element>.], and
! 3062: [=<equivalent_class>=]. */
! 3063:
! 3064: static reg_errcode_t
! 3065: parse_bracket_symbol (elem, regexp, token)
! 3066: bracket_elem_t *elem;
! 3067: re_string_t *regexp;
! 3068: re_token_t *token;
! 3069: {
! 3070: unsigned char ch, delim = token->opr.c;
! 3071: int i = 0;
! 3072: for (;; ++i)
! 3073: {
! 3074: if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
! 3075: return REG_EBRACK;
! 3076: if (token->type == OP_OPEN_CHAR_CLASS)
! 3077: ch = re_string_fetch_byte_case (regexp);
! 3078: else
! 3079: ch = re_string_fetch_byte (regexp);
! 3080: if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
! 3081: break;
! 3082: elem->opr.name[i] = ch;
! 3083: }
! 3084: re_string_skip_bytes (regexp, 1);
! 3085: elem->opr.name[i] = '\0';
! 3086: switch (token->type)
! 3087: {
! 3088: case OP_OPEN_COLL_ELEM:
! 3089: elem->type = COLL_SYM;
! 3090: break;
! 3091: case OP_OPEN_EQUIV_CLASS:
! 3092: elem->type = EQUIV_CLASS;
! 3093: break;
! 3094: case OP_OPEN_CHAR_CLASS:
! 3095: elem->type = CHAR_CLASS;
! 3096: break;
! 3097: default:
! 3098: break;
! 3099: }
! 3100: return REG_NOERROR;
! 3101: }
! 3102:
! 3103: /* Helper function for parse_bracket_exp.
! 3104: Build the equivalence class which is represented by NAME.
! 3105: The result are written to MBCSET and SBCSET.
! 3106: EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
! 3107: is a pointer argument sinse we may update it. */
! 3108:
! 3109: static reg_errcode_t
! 3110: #ifdef RE_ENABLE_I18N
! 3111: build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
! 3112: re_charset_t *mbcset;
! 3113: int *equiv_class_alloc;
! 3114: #else /* not RE_ENABLE_I18N */
! 3115: build_equiv_class (sbcset, name)
! 3116: #endif /* not RE_ENABLE_I18N */
! 3117: re_bitset_ptr_t sbcset;
! 3118: const unsigned char *name;
! 3119: {
! 3120: #if defined _LIBC && defined RE_ENABLE_I18N
! 3121: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
! 3122: if (nrules != 0)
! 3123: {
! 3124: const int32_t *table, *indirect;
! 3125: const unsigned char *weights, *extra, *cp;
! 3126: unsigned char char_buf[2];
! 3127: int32_t idx1, idx2;
! 3128: unsigned int ch;
! 3129: size_t len;
! 3130: /* This #include defines a local function! */
! 3131: # include <locale/weight.h>
! 3132: /* Calculate the index for equivalence class. */
! 3133: cp = name;
! 3134: table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
! 3135: weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
! 3136: _NL_COLLATE_WEIGHTMB);
! 3137: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
! 3138: _NL_COLLATE_EXTRAMB);
! 3139: indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
! 3140: _NL_COLLATE_INDIRECTMB);
! 3141: idx1 = findidx (&cp);
! 3142: if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
! 3143: /* This isn't a valid character. */
! 3144: return REG_ECOLLATE;
! 3145:
! 3146: /* Build single byte matcing table for this equivalence class. */
! 3147: char_buf[1] = (unsigned char) '\0';
! 3148: len = weights[idx1];
! 3149: for (ch = 0; ch < SBC_MAX; ++ch)
! 3150: {
! 3151: char_buf[0] = ch;
! 3152: cp = char_buf;
! 3153: idx2 = findidx (&cp);
! 3154: /*
! 3155: idx2 = table[ch];
! 3156: */
! 3157: if (idx2 == 0)
! 3158: /* This isn't a valid character. */
! 3159: continue;
! 3160: if (len == weights[idx2])
! 3161: {
! 3162: int cnt = 0;
! 3163: while (cnt <= len &&
! 3164: weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
! 3165: ++cnt;
! 3166:
! 3167: if (cnt > len)
! 3168: bitset_set (sbcset, ch);
! 3169: }
! 3170: }
! 3171: /* Check whether the array has enough space. */
! 3172: if (*equiv_class_alloc == mbcset->nequiv_classes)
! 3173: {
! 3174: /* Not enough, realloc it. */
! 3175: /* +1 in case of mbcset->nequiv_classes is 0. */
! 3176: *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
! 3177: /* Use realloc since the array is NULL if *alloc == 0. */
! 3178: mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
! 3179: *equiv_class_alloc);
! 3180: if (BE (mbcset->equiv_classes == NULL, 0))
! 3181: return REG_ESPACE;
! 3182: }
! 3183: mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
! 3184: }
! 3185: else
! 3186: #endif /* _LIBC && RE_ENABLE_I18N */
! 3187: {
! 3188: if (BE (strlen ((const char *) name) != 1, 0))
! 3189: return REG_ECOLLATE;
! 3190: bitset_set (sbcset, *name);
! 3191: }
! 3192: return REG_NOERROR;
! 3193: }
! 3194:
! 3195: /* Helper function for parse_bracket_exp.
! 3196: Build the character class which is represented by NAME.
! 3197: The result are written to MBCSET and SBCSET.
! 3198: CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
! 3199: is a pointer argument sinse we may update it. */
! 3200:
! 3201: static reg_errcode_t
! 3202: #ifdef RE_ENABLE_I18N
! 3203: build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
! 3204: re_charset_t *mbcset;
! 3205: int *char_class_alloc;
! 3206: #else /* not RE_ENABLE_I18N */
! 3207: build_charclass (sbcset, class_name, syntax)
! 3208: #endif /* not RE_ENABLE_I18N */
! 3209: re_bitset_ptr_t sbcset;
! 3210: const unsigned char *class_name;
! 3211: reg_syntax_t syntax;
! 3212: {
! 3213: int i;
! 3214: const char *name = (const char *) class_name;
! 3215:
! 3216: /* In case of REG_ICASE "upper" and "lower" match the both of
! 3217: upper and lower cases. */
! 3218: if ((syntax & RE_ICASE)
! 3219: && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
! 3220: name = "alpha";
! 3221:
! 3222: #ifdef RE_ENABLE_I18N
! 3223: /* Check the space of the arrays. */
! 3224: if (*char_class_alloc == mbcset->nchar_classes)
! 3225: {
! 3226: /* Not enough, realloc it. */
! 3227: /* +1 in case of mbcset->nchar_classes is 0. */
! 3228: *char_class_alloc = 2 * mbcset->nchar_classes + 1;
! 3229: /* Use realloc since array is NULL if *alloc == 0. */
! 3230: mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
! 3231: *char_class_alloc);
! 3232: if (BE (mbcset->char_classes == NULL, 0))
! 3233: return REG_ESPACE;
! 3234: }
! 3235: mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
! 3236: #endif /* RE_ENABLE_I18N */
! 3237:
! 3238: #define BUILD_CHARCLASS_LOOP(ctype_func)\
! 3239: for (i = 0; i < SBC_MAX; ++i) \
! 3240: { \
! 3241: if (ctype_func (i)) \
! 3242: bitset_set (sbcset, i); \
! 3243: }
! 3244:
! 3245: if (strcmp (name, "alnum") == 0)
! 3246: BUILD_CHARCLASS_LOOP (isalnum)
! 3247: else if (strcmp (name, "cntrl") == 0)
! 3248: BUILD_CHARCLASS_LOOP (iscntrl)
! 3249: else if (strcmp (name, "lower") == 0)
! 3250: BUILD_CHARCLASS_LOOP (islower)
! 3251: else if (strcmp (name, "space") == 0)
! 3252: BUILD_CHARCLASS_LOOP (isspace)
! 3253: else if (strcmp (name, "alpha") == 0)
! 3254: BUILD_CHARCLASS_LOOP (isalpha)
! 3255: else if (strcmp (name, "digit") == 0)
! 3256: BUILD_CHARCLASS_LOOP (isdigit)
! 3257: else if (strcmp (name, "print") == 0)
! 3258: BUILD_CHARCLASS_LOOP (isprint)
! 3259: else if (strcmp (name, "upper") == 0)
! 3260: BUILD_CHARCLASS_LOOP (isupper)
! 3261: else if (strcmp (name, "blank") == 0)
! 3262: BUILD_CHARCLASS_LOOP (isblank)
! 3263: else if (strcmp (name, "graph") == 0)
! 3264: BUILD_CHARCLASS_LOOP (isgraph)
! 3265: else if (strcmp (name, "punct") == 0)
! 3266: BUILD_CHARCLASS_LOOP (ispunct)
! 3267: else if (strcmp (name, "xdigit") == 0)
! 3268: BUILD_CHARCLASS_LOOP (isxdigit)
! 3269: else
! 3270: return REG_ECTYPE;
! 3271:
! 3272: return REG_NOERROR;
! 3273: }
! 3274:
! 3275: static bin_tree_t *
! 3276: build_word_op (dfa, not, err)
! 3277: re_dfa_t *dfa;
! 3278: int not;
! 3279: reg_errcode_t *err;
! 3280: {
! 3281: re_bitset_ptr_t sbcset;
! 3282: #ifdef RE_ENABLE_I18N
! 3283: re_charset_t *mbcset;
! 3284: int alloc = 0;
! 3285: #else /* not RE_ENABLE_I18N */
! 3286: int non_match = 0;
! 3287: #endif /* not RE_ENABLE_I18N */
! 3288: reg_errcode_t ret;
! 3289: re_token_t br_token;
! 3290: bin_tree_t *tree;
! 3291: int new_idx;
! 3292:
! 3293: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
! 3294: #ifdef RE_ENABLE_I18N
! 3295: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
! 3296: #endif /* RE_ENABLE_I18N */
! 3297:
! 3298: #ifdef RE_ENABLE_I18N
! 3299: if (BE (sbcset == NULL || mbcset == NULL, 0))
! 3300: #else /* not RE_ENABLE_I18N */
! 3301: if (BE (sbcset == NULL, 0))
! 3302: #endif /* not RE_ENABLE_I18N */
! 3303: {
! 3304: *err = REG_ESPACE;
! 3305: return NULL;
! 3306: }
! 3307:
! 3308: if (not)
! 3309: {
! 3310: #ifdef RE_ENABLE_I18N
! 3311: int i;
! 3312: /*
! 3313: if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
! 3314: bitset_set(cset->sbcset, '\0');
! 3315: */
! 3316: mbcset->non_match = 1;
! 3317: if (MB_CUR_MAX > 1)
! 3318: for (i = 0; i < SBC_MAX; ++i)
! 3319: if (__btowc (i) == WEOF)
! 3320: bitset_set (sbcset, i);
! 3321: #else /* not RE_ENABLE_I18N */
! 3322: non_match = 1;
! 3323: #endif /* not RE_ENABLE_I18N */
! 3324: }
! 3325:
! 3326: /* We don't care the syntax in this case. */
! 3327: ret = build_charclass (sbcset,
! 3328: #ifdef RE_ENABLE_I18N
! 3329: mbcset, &alloc,
! 3330: #endif /* RE_ENABLE_I18N */
! 3331: (const unsigned char *) "alpha", 0);
! 3332:
! 3333: if (BE (ret != REG_NOERROR, 0))
! 3334: {
! 3335: re_free (sbcset);
! 3336: #ifdef RE_ENABLE_I18N
! 3337: free_charset (mbcset);
! 3338: #endif /* RE_ENABLE_I18N */
! 3339: *err = ret;
! 3340: return NULL;
! 3341: }
! 3342: /* \w match '_' also. */
! 3343: bitset_set (sbcset, '_');
! 3344:
! 3345: /* If it is non-matching list. */
! 3346: #ifdef RE_ENABLE_I18N
! 3347: if (mbcset->non_match)
! 3348: #else /* not RE_ENABLE_I18N */
! 3349: if (non_match)
! 3350: #endif /* not RE_ENABLE_I18N */
! 3351: bitset_not (sbcset);
! 3352:
! 3353: /* Build a tree for simple bracket. */
! 3354: br_token.type = SIMPLE_BRACKET;
! 3355: br_token.opr.sbcset = sbcset;
! 3356: new_idx = re_dfa_add_node (dfa, br_token, 0);
! 3357: tree = create_tree (NULL, NULL, 0, new_idx);
! 3358: if (BE (new_idx == -1 || tree == NULL, 0))
! 3359: goto build_word_op_espace;
! 3360:
! 3361: #ifdef RE_ENABLE_I18N
! 3362: if (MB_CUR_MAX > 1)
! 3363: {
! 3364: re_token_t alt_token;
! 3365: bin_tree_t *mbc_tree;
! 3366: /* Build a tree for complex bracket. */
! 3367: br_token.type = COMPLEX_BRACKET;
! 3368: br_token.opr.mbcset = mbcset;
! 3369: dfa->has_mb_node = 1;
! 3370: new_idx = re_dfa_add_node (dfa, br_token, 0);
! 3371: mbc_tree = create_tree (NULL, NULL, 0, new_idx);
! 3372: if (BE (new_idx == -1 || mbc_tree == NULL, 0))
! 3373: goto build_word_op_espace;
! 3374: /* Then join them by ALT node. */
! 3375: alt_token.type = OP_ALT;
! 3376: new_idx = re_dfa_add_node (dfa, alt_token, 0);
! 3377: tree = create_tree (tree, mbc_tree, 0, new_idx);
! 3378: if (BE (new_idx != -1 && mbc_tree != NULL, 1))
! 3379: return tree;
! 3380: }
! 3381: else
! 3382: {
! 3383: free_charset (mbcset);
! 3384: return tree;
! 3385: }
! 3386: #else /* not RE_ENABLE_I18N */
! 3387: return tree;
! 3388: #endif /* not RE_ENABLE_I18N */
! 3389:
! 3390: build_word_op_espace:
! 3391: re_free (sbcset);
! 3392: #ifdef RE_ENABLE_I18N
! 3393: free_charset (mbcset);
! 3394: #endif /* RE_ENABLE_I18N */
! 3395: *err = REG_ESPACE;
! 3396: return NULL;
! 3397: }
! 3398:
! 3399: /* This is intended for the expressions like "a{1,3}".
! 3400: Fetch a number from `input', and return the number.
! 3401: Return -1, if the number field is empty like "{,1}".
! 3402: Return -2, If an error is occured. */
! 3403:
! 3404: static int
! 3405: fetch_number (input, token, syntax)
! 3406: re_string_t *input;
! 3407: re_token_t *token;
! 3408: reg_syntax_t syntax;
! 3409: {
! 3410: int num = -1;
! 3411: unsigned char c;
! 3412: while (1)
! 3413: {
! 3414: *token = fetch_token (input, syntax);
! 3415: c = token->opr.c;
! 3416: if (BE (token->type == END_OF_RE, 0))
! 3417: return -2;
! 3418: if (token->type == OP_CLOSE_DUP_NUM || c == ',')
! 3419: break;
! 3420: num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
! 3421: ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
! 3422: num = (num > RE_DUP_MAX) ? -2 : num;
! 3423: }
! 3424: return num;
! 3425: }
! 3426:
! 3427: #ifdef RE_ENABLE_I18N
! 3428: static void
! 3429: free_charset (re_charset_t *cset)
! 3430: {
! 3431: re_free (cset->mbchars);
! 3432: # ifdef _LIBC
! 3433: re_free (cset->coll_syms);
! 3434: re_free (cset->equiv_classes);
! 3435: re_free (cset->range_starts);
! 3436: re_free (cset->range_ends);
! 3437: # endif
! 3438: re_free (cset->char_classes);
! 3439: re_free (cset);
! 3440: }
! 3441: #endif /* RE_ENABLE_I18N */
! 3442:
! 3443: /* Functions for binary tree operation. */
! 3444:
! 3445: /* Create a node of tree.
! 3446: Note: This function automatically free left and right if malloc fails. */
! 3447:
! 3448: static bin_tree_t *
! 3449: create_tree (left, right, type, index)
! 3450: bin_tree_t *left;
! 3451: bin_tree_t *right;
! 3452: re_token_type_t type;
! 3453: int index;
! 3454: {
! 3455: bin_tree_t *tree;
! 3456: tree = re_malloc (bin_tree_t, 1);
! 3457: if (BE (tree == NULL, 0))
! 3458: {
! 3459: free_bin_tree (left);
! 3460: free_bin_tree (right);
! 3461: return NULL;
! 3462: }
! 3463: tree->parent = NULL;
! 3464: tree->left = left;
! 3465: tree->right = right;
! 3466: tree->type = type;
! 3467: tree->node_idx = index;
! 3468: tree->first = -1;
! 3469: tree->next = -1;
! 3470: re_node_set_init_empty (&tree->eclosure);
! 3471:
! 3472: if (left != NULL)
! 3473: left->parent = tree;
! 3474: if (right != NULL)
! 3475: right->parent = tree;
! 3476: return tree;
! 3477: }
! 3478:
! 3479: /* Free the sub tree pointed by TREE. */
! 3480:
! 3481: static void
! 3482: free_bin_tree (tree)
! 3483: bin_tree_t *tree;
! 3484: {
! 3485: if (tree == NULL)
! 3486: return;
! 3487: /*re_node_set_free (&tree->eclosure);*/
! 3488: free_bin_tree (tree->left);
! 3489: free_bin_tree (tree->right);
! 3490: re_free (tree);
! 3491: }
! 3492:
! 3493: /* Duplicate the node SRC, and return new node. */
! 3494:
! 3495: static bin_tree_t *
! 3496: duplicate_tree (src, dfa)
! 3497: const bin_tree_t *src;
! 3498: re_dfa_t *dfa;
! 3499: {
! 3500: bin_tree_t *left = NULL, *right = NULL, *new_tree;
! 3501: int new_node_idx;
! 3502: /* Since node indies must be according to Post-order of the tree,
! 3503: we must duplicate the left at first. */
! 3504: if (src->left != NULL)
! 3505: {
! 3506: left = duplicate_tree (src->left, dfa);
! 3507: if (left == NULL)
! 3508: return NULL;
! 3509: }
! 3510:
! 3511: /* Secondaly, duplicate the right. */
! 3512: if (src->right != NULL)
! 3513: {
! 3514: right = duplicate_tree (src->right, dfa);
! 3515: if (right == NULL)
! 3516: {
! 3517: free_bin_tree (left);
! 3518: return NULL;
! 3519: }
! 3520: }
! 3521:
! 3522: /* At last, duplicate itself. */
! 3523: if (src->type == NON_TYPE)
! 3524: {
! 3525: new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
! 3526: dfa->nodes[new_node_idx].duplicated = 1;
! 3527: if (BE (new_node_idx == -1, 0))
! 3528: {
! 3529: free_bin_tree (left);
! 3530: free_bin_tree (right);
! 3531: return NULL;
! 3532: }
! 3533: }
! 3534: else
! 3535: new_node_idx = src->type;
! 3536:
! 3537: new_tree = create_tree (left, right, src->type, new_node_idx);
! 3538: if (BE (new_tree == NULL, 0))
! 3539: {
! 3540: free_bin_tree (left);
! 3541: free_bin_tree (right);
! 3542: }
! 3543: return new_tree;
! 3544: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>