embedaddon/smartmontools/regex/regcomp.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / smartmontools / regex / regcomp.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 01:17:36 2013 UTC (10 years, 11 months ago) by misho
Branches: smartmontools, elwix, MAIN
CVS tags: v6_2, v6_1p0, v6_1, HEAD

6.1

1: /* Extended regular expression matching and search library. 2: Copyright (C) 2002, 2003 Free Software Foundation, Inc. 3: This file is part of the GNU C Library. 4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5: 6: The GNU C Library is free software; you can redistribute it and/or 7: modify it under the terms of the GNU Lesser General Public 8: License as published by the Free Software Foundation; either 9: version 2.1 of the License, or (at your option) any later version. 10: 11: The GNU C Library is distributed in the hope that it will be useful, 12: but WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: Lesser General Public License for more details. 15: 16: You should have received a copy of the GNU Lesser General Public 17: License along with the GNU C Library; if not, write to the Free 18: Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19: MA 02110-1301 USA. */ 20: 21: static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 22: int length, reg_syntax_t syntax); 23: static void re_compile_fastmap_iter (regex_t *bufp, 24: const re_dfastate_t *init_state, 25: char *fastmap); 26: static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len); 27: static reg_errcode_t init_word_char (re_dfa_t *dfa); 28: #ifdef RE_ENABLE_I18N 29: static void free_charset (re_charset_t *cset); 30: #endif /* RE_ENABLE_I18N */ 31: static void free_workarea_compile (regex_t *preg); 32: static reg_errcode_t create_initial_state (re_dfa_t *dfa); 33: static reg_errcode_t analyze (re_dfa_t *dfa); 34: static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); 35: static void calc_first (re_dfa_t *dfa, bin_tree_t *node); 36: static void calc_next (re_dfa_t *dfa, bin_tree_t *node); 37: static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node); 38: static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node, 39: int top_clone_node, int root_node, 40: unsigned int constraint); 41: static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx, 42: unsigned int constraint); 43: static int search_duplicated_node (re_dfa_t *dfa, int org_node, 44: unsigned int constraint); 45: static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 46: static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 47: int node, int root); 48: static void calc_inveclosure (re_dfa_t *dfa); 49: static int fetch_number (re_string_t *input, re_token_t *token, 50: reg_syntax_t syntax); 51: static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax); 52: static int peek_token (re_token_t *token, re_string_t *input, 53: reg_syntax_t syntax); 54: static int peek_token_bracket (re_token_t *token, re_string_t *input, 55: reg_syntax_t syntax); 56: static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 57: reg_syntax_t syntax, reg_errcode_t *err); 58: static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 59: re_token_t *token, reg_syntax_t syntax, 60: int nest, reg_errcode_t *err); 61: static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 62: re_token_t *token, reg_syntax_t syntax, 63: int nest, reg_errcode_t *err); 64: static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 65: re_token_t *token, reg_syntax_t syntax, 66: int nest, reg_errcode_t *err); 67: static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 68: re_token_t *token, reg_syntax_t syntax, 69: int nest, reg_errcode_t *err); 70: static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 71: re_dfa_t *dfa, re_token_t *token, 72: reg_syntax_t syntax, reg_errcode_t *err); 73: static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 74: re_token_t *token, reg_syntax_t syntax, 75: reg_errcode_t *err); 76: static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 77: re_string_t *regexp, 78: re_token_t *token, int token_len, 79: re_dfa_t *dfa, 80: reg_syntax_t syntax); 81: static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 82: re_string_t *regexp, 83: re_token_t *token); 84: #ifndef _LIBC 85: # ifdef RE_ENABLE_I18N 86: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 87: re_charset_t *mbcset, int *range_alloc, 88: bracket_elem_t *start_elem, 89: bracket_elem_t *end_elem); 90: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 91: re_charset_t *mbcset, 92: int *coll_sym_alloc, 93: const unsigned char *name); 94: # else /* not RE_ENABLE_I18N */ 95: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset, 96: bracket_elem_t *start_elem, 97: bracket_elem_t *end_elem); 98: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset, 99: const unsigned char *name); 100: # endif /* not RE_ENABLE_I18N */ 101: #endif /* not _LIBC */ 102: #ifdef RE_ENABLE_I18N 103: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 104: re_charset_t *mbcset, 105: int *equiv_class_alloc, 106: const unsigned char *name); 107: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset, 108: re_charset_t *mbcset, 109: int *char_class_alloc, 110: const unsigned char *class_name, 111: reg_syntax_t syntax); 112: #else /* not RE_ENABLE_I18N */ 113: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset, 114: const unsigned char *name); 115: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset, 116: const unsigned char *class_name, 117: reg_syntax_t syntax); 118: #endif /* not RE_ENABLE_I18N */ 119: static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err); 120: static void free_bin_tree (bin_tree_t *tree); 121: static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right, 122: re_token_type_t type, int index); 123: static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 124: 125: /* This table gives an error message for each of the error codes listed 126: in regex.h. Obviously the order here has to be same as there. 127: POSIX doesn't require that we do anything for REG_NOERROR, 128: but why not be nice? */ 129: 130: const char __re_error_msgid[] attribute_hidden = 131: { 132: #define REG_NOERROR_IDX 0 133: gettext_noop ("Success") /* REG_NOERROR */ 134: "\0" 135: #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 136: gettext_noop ("No match") /* REG_NOMATCH */ 137: "\0" 138: #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 139: gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 140: "\0" 141: #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 142: gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 143: "\0" 144: #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 145: gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 146: "\0" 147: #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 148: gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 149: "\0" 150: #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 151: gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 152: "\0" 153: #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 154: gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 155: "\0" 156: #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 157: gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 158: "\0" 159: #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 160: gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 161: "\0" 162: #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 163: gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 164: "\0" 165: #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 166: gettext_noop ("Invalid range end") /* REG_ERANGE */ 167: "\0" 168: #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 169: gettext_noop ("Memory exhausted") /* REG_ESPACE */ 170: "\0" 171: #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 172: gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 173: "\0" 174: #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 175: gettext_noop ("Premature end of regular expression") /* REG_EEND */ 176: "\0" 177: #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 178: gettext_noop ("Regular expression too big") /* REG_ESIZE */ 179: "\0" 180: #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 181: gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 182: }; 183: 184: const size_t __re_error_msgid_idx[] attribute_hidden = 185: { 186: REG_NOERROR_IDX, 187: REG_NOMATCH_IDX, 188: REG_BADPAT_IDX, 189: REG_ECOLLATE_IDX, 190: REG_ECTYPE_IDX, 191: REG_EESCAPE_IDX, 192: REG_ESUBREG_IDX, 193: REG_EBRACK_IDX, 194: REG_EPAREN_IDX, 195: REG_EBRACE_IDX, 196: REG_BADBR_IDX, 197: REG_ERANGE_IDX, 198: REG_ESPACE_IDX, 199: REG_BADRPT_IDX, 200: REG_EEND_IDX, 201: REG_ESIZE_IDX, 202: REG_ERPAREN_IDX 203: }; 204: 205: /* Entry points for GNU code. */ 206: 207: /* re_compile_pattern is the GNU regular expression compiler: it 208: compiles PATTERN (of length LENGTH) and puts the result in BUFP. 209: Returns 0 if the pattern was valid, otherwise an error string. 210: 211: Assumes the `allocated' (and perhaps `buffer') and `translate' fields 212: are set in BUFP on entry. */ 213: 214: const char * 215: re_compile_pattern (pattern, length, bufp) 216: const char *pattern; 217: size_t length; 218: struct re_pattern_buffer *bufp; 219: { 220: reg_errcode_t ret; 221: 222: /* And GNU code determines whether or not to get register information 223: by passing null for the REGS argument to re_match, etc., not by 224: setting no_sub. */ 225: bufp->no_sub = 0; 226: 227: /* Match anchors at newline. */ 228: bufp->newline_anchor = 1; 229: 230: ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 231: 232: if (!ret) 233: return NULL; 234: return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 235: } 236: #ifdef _LIBC 237: weak_alias (__re_compile_pattern, re_compile_pattern) 238: #endif 239: 240: /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 241: also be assigned to arbitrarily: each pattern buffer stores its own 242: syntax, so it can be changed between regex compilations. */ 243: /* This has no initializer because initialized variables in Emacs 244: become read-only after dumping. */ 245: reg_syntax_t re_syntax_options; 246: 247: 248: /* Specify the precise syntax of regexps for compilation. This provides 249: for compatibility for various utilities which historically have 250: different, incompatible syntaxes. 251: 252: The argument SYNTAX is a bit mask comprised of the various bits 253: defined in regex.h. We return the old syntax. */ 254: 255: reg_syntax_t 256: re_set_syntax (syntax) 257: reg_syntax_t syntax; 258: { 259: reg_syntax_t ret = re_syntax_options; 260: 261: re_syntax_options = syntax; 262: return ret; 263: } 264: #ifdef _LIBC 265: weak_alias (__re_set_syntax, re_set_syntax) 266: #endif 267: 268: int 269: re_compile_fastmap (bufp) 270: struct re_pattern_buffer *bufp; 271: { 272: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 273: char *fastmap = bufp->fastmap; 274: 275: memset (fastmap, '\0', sizeof (char) * SBC_MAX); 276: re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 277: if (dfa->init_state != dfa->init_state_word) 278: re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 279: if (dfa->init_state != dfa->init_state_nl) 280: re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 281: if (dfa->init_state != dfa->init_state_begbuf) 282: re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 283: bufp->fastmap_accurate = 1; 284: return 0; 285: } 286: #ifdef _LIBC 287: weak_alias (__re_compile_fastmap, re_compile_fastmap) 288: #endif 289: 290: static inline void 291: re_set_fastmap (char *fastmap, int icase, int ch) 292: { 293: fastmap[ch] = 1; 294: if (icase) 295: fastmap[tolower (ch)] = 1; 296: } 297: 298: /* Helper function for re_compile_fastmap. 299: Compile fastmap for the initial_state INIT_STATE. */ 300: 301: static void 302: re_compile_fastmap_iter (bufp, init_state, fastmap) 303: regex_t *bufp; 304: const re_dfastate_t *init_state; 305: char *fastmap; 306: { 307: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 308: int node_cnt; 309: int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE)); 310: for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 311: { 312: int node = init_state->nodes.elems[node_cnt]; 313: re_token_type_t type = dfa->nodes[node].type; 314: 315: if (type == CHARACTER) 316: re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 317: else if (type == SIMPLE_BRACKET) 318: { 319: int i, j, ch; 320: for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 321: for (j = 0; j < UINT_BITS; ++j, ++ch) 322: if (dfa->nodes[node].opr.sbcset[i] & (1 << j)) 323: re_set_fastmap (fastmap, icase, ch); 324: } 325: #ifdef RE_ENABLE_I18N 326: else if (type == COMPLEX_BRACKET) 327: { 328: int i; 329: re_charset_t *cset = dfa->nodes[node].opr.mbcset; 330: if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes 331: || cset->nranges || cset->nchar_classes) 332: { 333: # ifdef _LIBC 334: if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) 335: { 336: /* In this case we want to catch the bytes which are 337: the first byte of any collation elements. 338: e.g. In da_DK, we want to catch 'a' since "aa" 339: is a valid collation element, and don't catch 340: 'b' since 'b' is the only collation element 341: which starts from 'b'. */ 342: int j, ch; 343: const int32_t *table = (const int32_t *) 344: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 345: for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 346: for (j = 0; j < UINT_BITS; ++j, ++ch) 347: if (table[ch] < 0) 348: re_set_fastmap (fastmap, icase, ch); 349: } 350: # else 351: if (MB_CUR_MAX > 1) 352: for (i = 0; i < SBC_MAX; ++i) 353: if (__btowc (i) == WEOF) 354: re_set_fastmap (fastmap, icase, i); 355: # endif /* not _LIBC */ 356: } 357: for (i = 0; i < cset->nmbchars; ++i) 358: { 359: char buf[256]; 360: mbstate_t state; 361: memset (&state, '\0', sizeof (state)); 362: __wcrtomb (buf, cset->mbchars[i], &state); 363: re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 364: } 365: } 366: #endif /* RE_ENABLE_I18N */ 367: else if (type == END_OF_RE || type == OP_PERIOD) 368: { 369: memset (fastmap, '\1', sizeof (char) * SBC_MAX); 370: if (type == END_OF_RE) 371: bufp->can_be_null = 1; 372: return; 373: } 374: } 375: } 376: 377: /* Entry point for POSIX code. */ 378: /* regcomp takes a regular expression as a string and compiles it. 379: 380: PREG is a regex_t *. We do not expect any fields to be initialized, 381: since POSIX says we shouldn't. Thus, we set 382: 383: `buffer' to the compiled pattern; 384: `used' to the length of the compiled pattern; 385: `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 386: REG_EXTENDED bit in CFLAGS is set; otherwise, to 387: RE_SYNTAX_POSIX_BASIC; 388: `newline_anchor' to REG_NEWLINE being set in CFLAGS; 389: `fastmap' to an allocated space for the fastmap; 390: `fastmap_accurate' to zero; 391: `re_nsub' to the number of subexpressions in PATTERN. 392: 393: PATTERN is the address of the pattern string. 394: 395: CFLAGS is a series of bits which affect compilation. 396: 397: If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 398: use POSIX basic syntax. 399: 400: If REG_NEWLINE is set, then . and [^...] don't match newline. 401: Also, regexec will try a match beginning after every newline. 402: 403: If REG_ICASE is set, then we considers upper- and lowercase 404: versions of letters to be equivalent when matching. 405: 406: If REG_NOSUB is set, then when PREG is passed to regexec, that 407: routine will report only success or failure, and nothing about the 408: registers. 409: 410: It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 411: the return codes and their meanings.) */ 412: 413: int 414: regcomp (preg, pattern, cflags) 415: regex_t *__restrict preg; 416: const char *__restrict pattern; 417: int cflags; 418: { 419: reg_errcode_t ret; 420: reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 421: : RE_SYNTAX_POSIX_BASIC); 422: 423: preg->buffer = NULL; 424: preg->allocated = 0; 425: preg->used = 0; 426: 427: /* Try to allocate space for the fastmap. */ 428: preg->fastmap = re_malloc (char, SBC_MAX); 429: if (BE (preg->fastmap == NULL, 0)) 430: return REG_ESPACE; 431: 432: syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 433: 434: /* If REG_NEWLINE is set, newlines are treated differently. */ 435: if (cflags & REG_NEWLINE) 436: { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 437: syntax &= ~RE_DOT_NEWLINE; 438: syntax |= RE_HAT_LISTS_NOT_NEWLINE; 439: /* It also changes the matching behavior. */ 440: preg->newline_anchor = 1; 441: } 442: else 443: preg->newline_anchor = 0; 444: preg->no_sub = !!(cflags & REG_NOSUB); 445: preg->translate = NULL; 446: 447: ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 448: 449: /* POSIX doesn't distinguish between an unmatched open-group and an 450: unmatched close-group: both are REG_EPAREN. */ 451: if (ret == REG_ERPAREN) 452: ret = REG_EPAREN; 453: 454: /* We have already checked preg->fastmap != NULL. */ 455: if (BE (ret == REG_NOERROR, 1)) 456: /* Compute the fastmap now, since regexec cannot modify the pattern 457: buffer. This function nevers fails in this implementation. */ 458: (void) re_compile_fastmap (preg); 459: else 460: { 461: /* Some error occurred while compiling the expression. */ 462: re_free (preg->fastmap); 463: preg->fastmap = NULL; 464: } 465: 466: return (int) ret; 467: } 468: #ifdef _LIBC 469: weak_alias (__regcomp, regcomp) 470: #endif 471: 472: /* Returns a message corresponding to an error code, ERRCODE, returned 473: from either regcomp or regexec. We don't use PREG here. */ 474: 475: size_t 476: regerror (errcode, preg, errbuf, errbuf_size) 477: int errcode; 478: const regex_t *preg; 479: char *errbuf; 480: size_t errbuf_size; 481: { 482: const char *msg; 483: size_t msg_size; 484: 485: if (BE (errcode < 0 486: || errcode >= (int) (sizeof (__re_error_msgid_idx) 487: / sizeof (__re_error_msgid_idx[0])), 0)) 488: /* Only error codes returned by the rest of the code should be passed 489: to this routine. If we are given anything else, or if other regex 490: code generates an invalid error code, then the program has a bug. 491: Dump core so we can fix it. */ 492: abort (); 493: 494: msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 495: 496: msg_size = strlen (msg) + 1; /* Includes the null. */ 497: 498: if (BE (errbuf_size != 0, 1)) 499: { 500: if (BE (msg_size > errbuf_size, 0)) 501: { 502: #if defined HAVE_MEMPCPY || defined _LIBC 503: *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 504: #else 505: memcpy (errbuf, msg, errbuf_size - 1); 506: errbuf[errbuf_size - 1] = 0; 507: #endif 508: } 509: else 510: memcpy (errbuf, msg, msg_size); 511: } 512: 513: return msg_size; 514: } 515: #ifdef _LIBC 516: weak_alias (__regerror, regerror) 517: #endif 518: 519: 520: static void 521: free_dfa_content (re_dfa_t *dfa) 522: { 523: int i, j; 524: 525: re_free (dfa->subexps); 526: 527: for (i = 0; i < dfa->nodes_len; ++i) 528: { 529: re_token_t *node = dfa->nodes + i; 530: #ifdef RE_ENABLE_I18N 531: if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 532: free_charset (node->opr.mbcset); 533: else 534: #endif /* RE_ENABLE_I18N */ 535: if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 536: re_free (node->opr.sbcset); 537: } 538: re_free (dfa->nexts); 539: for (i = 0; i < dfa->nodes_len; ++i) 540: { 541: if (dfa->eclosures != NULL) 542: re_node_set_free (dfa->eclosures + i); 543: if (dfa->inveclosures != NULL) 544: re_node_set_free (dfa->inveclosures + i); 545: if (dfa->edests != NULL) 546: re_node_set_free (dfa->edests + i); 547: } 548: re_free (dfa->edests); 549: re_free (dfa->eclosures); 550: re_free (dfa->inveclosures); 551: re_free (dfa->nodes); 552: 553: for (i = 0; i <= dfa->state_hash_mask; ++i) 554: { 555: struct re_state_table_entry *entry = dfa->state_table + i; 556: for (j = 0; j < entry->num; ++j) 557: { 558: re_dfastate_t *state = entry->array[j]; 559: free_state (state); 560: } 561: re_free (entry->array); 562: } 563: re_free (dfa->state_table); 564: 565: if (dfa->word_char != NULL) 566: re_free (dfa->word_char); 567: #ifdef DEBUG 568: re_free (dfa->re_str); 569: #endif 570: 571: re_free (dfa); 572: } 573: 574: 575: /* Free dynamically allocated space used by PREG. */ 576: 577: void 578: regfree (preg) 579: regex_t *preg; 580: { 581: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 582: if (BE (dfa != NULL, 1)) 583: free_dfa_content (dfa); 584: 585: re_free (preg->fastmap); 586: } 587: #ifdef _LIBC 588: weak_alias (__regfree, regfree) 589: #endif 590: 591: /* Entry points compatible with 4.2 BSD regex library. We don't define 592: them unless specifically requested. */ 593: 594: #if defined _REGEX_RE_COMP || defined _LIBC 595: 596: /* BSD has one and only one pattern buffer. */ 597: static struct re_pattern_buffer re_comp_buf; 598: 599: char * 600: # ifdef _LIBC 601: /* Make these definitions weak in libc, so POSIX programs can redefine 602: these names if they don't use our functions, and still use 603: regcomp/regexec above without link errors. */ 604: weak_function 605: # endif 606: re_comp (s) 607: const char *s; 608: { 609: reg_errcode_t ret; 610: char *fastmap; 611: 612: if (!s) 613: { 614: if (!re_comp_buf.buffer) 615: return gettext ("No previous regular expression"); 616: return 0; 617: } 618: 619: if (re_comp_buf.buffer) 620: { 621: fastmap = re_comp_buf.fastmap; 622: re_comp_buf.fastmap = NULL; 623: __regfree (&re_comp_buf); 624: memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 625: re_comp_buf.fastmap = fastmap; 626: } 627: 628: if (re_comp_buf.fastmap == NULL) 629: { 630: re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 631: if (re_comp_buf.fastmap == NULL) 632: return (char *) gettext (__re_error_msgid 633: + __re_error_msgid_idx[(int) REG_ESPACE]); 634: } 635: 636: /* Since `re_exec' always passes NULL for the `regs' argument, we 637: don't need to initialize the pattern buffer fields which affect it. */ 638: 639: /* Match anchors at newlines. */ 640: re_comp_buf.newline_anchor = 1; 641: 642: ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 643: 644: if (!ret) 645: return NULL; 646: 647: /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 648: return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 649: } 650: 651: #ifdef _LIBC 652: libc_freeres_fn (free_mem) 653: { 654: __regfree (&re_comp_buf); 655: } 656: #endif 657: 658: #endif /* _REGEX_RE_COMP */ 659: 660: /* Internal entry point. 661: Compile the regular expression PATTERN, whose length is LENGTH. 662: SYNTAX indicate regular expression's syntax. */ 663: 664: static reg_errcode_t 665: re_compile_internal (preg, pattern, length, syntax) 666: regex_t *preg; 667: const char * pattern; 668: int length; 669: reg_syntax_t syntax; 670: { 671: reg_errcode_t err = REG_NOERROR; 672: re_dfa_t *dfa; 673: re_string_t regexp; 674: 675: /* Initialize the pattern buffer. */ 676: preg->fastmap_accurate = 0; 677: preg->syntax = syntax; 678: preg->not_bol = preg->not_eol = 0; 679: preg->used = 0; 680: preg->re_nsub = 0; 681: preg->can_be_null = 0; 682: preg->regs_allocated = REGS_UNALLOCATED; 683: 684: /* Initialize the dfa. */ 685: dfa = (re_dfa_t *) preg->buffer; 686: if (preg->allocated < sizeof (re_dfa_t)) 687: { 688: /* If zero allocated, but buffer is non-null, try to realloc 689: enough space. This loses if buffer's address is bogus, but 690: that is the user's responsibility. If ->buffer is NULL this 691: is a simple allocation. */ 692: dfa = re_realloc (preg->buffer, re_dfa_t, 1); 693: if (dfa == NULL) 694: return REG_ESPACE; 695: preg->allocated = sizeof (re_dfa_t); 696: } 697: preg->buffer = (unsigned char *) dfa; 698: preg->used = sizeof (re_dfa_t); 699: 700: err = init_dfa (dfa, length); 701: if (BE (err != REG_NOERROR, 0)) 702: { 703: re_free (dfa); 704: preg->buffer = NULL; 705: preg->allocated = 0; 706: return err; 707: } 708: #ifdef DEBUG 709: dfa->re_str = re_malloc (char, length + 1); 710: strncpy (dfa->re_str, pattern, length + 1); 711: #endif 712: 713: err = re_string_construct (&regexp, pattern, length, preg->translate, 714: syntax & RE_ICASE); 715: if (BE (err != REG_NOERROR, 0)) 716: { 717: re_free (dfa); 718: preg->buffer = NULL; 719: preg->allocated = 0; 720: return err; 721: } 722: 723: /* Parse the regular expression, and build a structure tree. */ 724: preg->re_nsub = 0; 725: dfa->str_tree = parse (&regexp, preg, syntax, &err); 726: if (BE (dfa->str_tree == NULL, 0)) 727: goto re_compile_internal_free_return; 728: 729: /* Analyze the tree and collect information which is necessary to 730: create the dfa. */ 731: err = analyze (dfa); 732: if (BE (err != REG_NOERROR, 0)) 733: goto re_compile_internal_free_return; 734: 735: /* Then create the initial state of the dfa. */ 736: err = create_initial_state (dfa); 737: 738: /* Release work areas. */ 739: free_workarea_compile (preg); 740: re_string_destruct (&regexp); 741: 742: if (BE (err != REG_NOERROR, 0)) 743: { 744: re_compile_internal_free_return: 745: free_dfa_content (dfa); 746: preg->buffer = NULL; 747: preg->allocated = 0; 748: } 749: 750: return err; 751: } 752: 753: /* Initialize DFA. We use the length of the regular expression PAT_LEN 754: as the initial length of some arrays. */ 755: 756: static reg_errcode_t 757: init_dfa (dfa, pat_len) 758: re_dfa_t *dfa; 759: int pat_len; 760: { 761: int table_size; 762: 763: memset (dfa, '\0', sizeof (re_dfa_t)); 764: 765: dfa->nodes_alloc = pat_len + 1; 766: dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 767: 768: dfa->states_alloc = pat_len + 1; 769: 770: /* table_size = 2 ^ ceil(log pat_len) */ 771: for (table_size = 1; table_size > 0; table_size <<= 1) 772: if (table_size > pat_len) 773: break; 774: 775: dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 776: dfa->state_hash_mask = table_size - 1; 777: 778: dfa->subexps_alloc = 1; 779: dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc); 780: dfa->word_char = NULL; 781: 782: if (BE (dfa->nodes == NULL || dfa->state_table == NULL 783: || dfa->subexps == NULL, 0)) 784: { 785: /* We don't bother to free anything which was allocated. Very 786: soon the process will go down anyway. */ 787: dfa->subexps = NULL; 788: dfa->state_table = NULL; 789: dfa->nodes = NULL; 790: return REG_ESPACE; 791: } 792: return REG_NOERROR; 793: } 794: 795: /* Initialize WORD_CHAR table, which indicate which character is 796: "word". In this case "word" means that it is the word construction 797: character used by some operators like "\<", "\>", etc. */ 798: 799: static reg_errcode_t 800: init_word_char (dfa) 801: re_dfa_t *dfa; 802: { 803: int i, j, ch; 804: dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1); 805: if (BE (dfa->word_char == NULL, 0)) 806: return REG_ESPACE; 807: for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 808: for (j = 0; j < UINT_BITS; ++j, ++ch) 809: if (isalnum (ch) || ch == '_') 810: dfa->word_char[i] |= 1 << j; 811: return REG_NOERROR; 812: } 813: 814: /* Free the work area which are only used while compiling. */ 815: 816: static void 817: free_workarea_compile (preg) 818: regex_t *preg; 819: { 820: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 821: free_bin_tree (dfa->str_tree); 822: dfa->str_tree = NULL; 823: re_free (dfa->org_indices); 824: dfa->org_indices = NULL; 825: } 826: 827: /* Create initial states for all contexts. */ 828: 829: static reg_errcode_t 830: create_initial_state (dfa) 831: re_dfa_t *dfa; 832: { 833: int first, i; 834: reg_errcode_t err; 835: re_node_set init_nodes; 836: 837: /* Initial states have the epsilon closure of the node which is 838: the first node of the regular expression. */ 839: first = dfa->str_tree->first; 840: dfa->init_node = first; 841: err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 842: if (BE (err != REG_NOERROR, 0)) 843: return err; 844: 845: /* The back-references which are in initial states can epsilon transit, 846: since in this case all of the subexpressions can be null. 847: Then we add epsilon closures of the nodes which are the next nodes of 848: the back-references. */ 849: if (dfa->nbackref > 0) 850: for (i = 0; i < init_nodes.nelem; ++i) 851: { 852: int node_idx = init_nodes.elems[i]; 853: re_token_type_t type = dfa->nodes[node_idx].type; 854: 855: int clexp_idx; 856: if (type != OP_BACK_REF) 857: continue; 858: for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 859: { 860: re_token_t *clexp_node; 861: clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 862: if (clexp_node->type == OP_CLOSE_SUBEXP 863: && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx) 864: break; 865: } 866: if (clexp_idx == init_nodes.nelem) 867: continue; 868: 869: if (type == OP_BACK_REF) 870: { 871: int dest_idx = dfa->edests[node_idx].elems[0]; 872: if (!re_node_set_contains (&init_nodes, dest_idx)) 873: { 874: re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 875: i = 0; 876: } 877: } 878: } 879: 880: /* It must be the first time to invoke acquire_state. */ 881: dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 882: /* We don't check ERR here, since the initial state must not be NULL. */ 883: if (BE (dfa->init_state == NULL, 0)) 884: return err; 885: if (dfa->init_state->has_constraint) 886: { 887: dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 888: CONTEXT_WORD); 889: dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 890: CONTEXT_NEWLINE); 891: dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 892: &init_nodes, 893: CONTEXT_NEWLINE 894: | CONTEXT_BEGBUF); 895: if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 896: || dfa->init_state_begbuf == NULL, 0)) 897: return err; 898: } 899: else 900: dfa->init_state_word = dfa->init_state_nl 901: = dfa->init_state_begbuf = dfa->init_state; 902: 903: re_node_set_free (&init_nodes); 904: return REG_NOERROR; 905: } 906: 907: /* Analyze the structure tree, and calculate "first", "next", "edest", 908: "eclosure", and "inveclosure". */ 909: 910: static reg_errcode_t 911: analyze (dfa) 912: re_dfa_t *dfa; 913: { 914: int i; 915: reg_errcode_t ret; 916: 917: /* Allocate arrays. */ 918: dfa->nexts = re_malloc (int, dfa->nodes_alloc); 919: dfa->org_indices = re_malloc (int, dfa->nodes_alloc); 920: dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 921: dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 922: dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc); 923: if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 924: || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0)) 925: return REG_ESPACE; 926: /* Initialize them. */ 927: for (i = 0; i < dfa->nodes_len; ++i) 928: { 929: dfa->nexts[i] = -1; 930: re_node_set_init_empty (dfa->edests + i); 931: re_node_set_init_empty (dfa->eclosures + i); 932: re_node_set_init_empty (dfa->inveclosures + i); 933: } 934: 935: ret = analyze_tree (dfa, dfa->str_tree); 936: if (BE (ret == REG_NOERROR, 1)) 937: { 938: ret = calc_eclosure (dfa); 939: if (ret == REG_NOERROR) 940: calc_inveclosure (dfa); 941: } 942: return ret; 943: } 944: 945: /* Helper functions for analyze. 946: This function calculate "first", "next", and "edest" for the subtree 947: whose root is NODE. */ 948: 949: static reg_errcode_t 950: analyze_tree (dfa, node) 951: re_dfa_t *dfa; 952: bin_tree_t *node; 953: { 954: reg_errcode_t ret; 955: if (node->first == -1) 956: calc_first (dfa, node); 957: if (node->next == -1) 958: calc_next (dfa, node); 959: if (node->eclosure.nelem == 0) 960: calc_epsdest (dfa, node); 961: /* Calculate "first" etc. for the left child. */ 962: if (node->left != NULL) 963: { 964: ret = analyze_tree (dfa, node->left); 965: if (BE (ret != REG_NOERROR, 0)) 966: return ret; 967: } 968: /* Calculate "first" etc. for the right child. */ 969: if (node->right != NULL) 970: { 971: ret = analyze_tree (dfa, node->right); 972: if (BE (ret != REG_NOERROR, 0)) 973: return ret; 974: } 975: return REG_NOERROR; 976: } 977: 978: /* Calculate "first" for the node NODE. */ 979: static void 980: calc_first (dfa, node) 981: re_dfa_t *dfa; 982: bin_tree_t *node; 983: { 984: int idx, type; 985: idx = node->node_idx; 986: type = (node->type == 0) ? dfa->nodes[idx].type : node->type; 987: 988: switch (type) 989: { 990: #ifdef DEBUG 991: case OP_OPEN_BRACKET: 992: case OP_CLOSE_BRACKET: 993: case OP_OPEN_DUP_NUM: 994: case OP_CLOSE_DUP_NUM: 995: case OP_NON_MATCH_LIST: 996: case OP_OPEN_COLL_ELEM: 997: case OP_CLOSE_COLL_ELEM: 998: case OP_OPEN_EQUIV_CLASS: 999: case OP_CLOSE_EQUIV_CLASS: 1000: case OP_OPEN_CHAR_CLASS: 1001: case OP_CLOSE_CHAR_CLASS: 1002: /* These must not be appeared here. */ 1003: assert (0); 1004: #endif 1005: case END_OF_RE: 1006: case CHARACTER: 1007: case OP_PERIOD: 1008: case OP_DUP_ASTERISK: 1009: case OP_DUP_QUESTION: 1010: #ifdef RE_ENABLE_I18N 1011: case COMPLEX_BRACKET: 1012: #endif /* RE_ENABLE_I18N */ 1013: case SIMPLE_BRACKET: 1014: case OP_BACK_REF: 1015: case ANCHOR: 1016: case OP_OPEN_SUBEXP: 1017: case OP_CLOSE_SUBEXP: 1018: node->first = idx; 1019: break; 1020: case OP_DUP_PLUS: 1021: #ifdef DEBUG 1022: assert (node->left != NULL); 1023: #endif 1024: if (node->left->first == -1) 1025: calc_first (dfa, node->left); 1026: node->first = node->left->first; 1027: break; 1028: case OP_ALT: 1029: node->first = idx; 1030: break; 1031: /* else fall through */ 1032: default: 1033: #ifdef DEBUG 1034: assert (node->left != NULL); 1035: #endif 1036: if (node->left->first == -1) 1037: calc_first (dfa, node->left); 1038: node->first = node->left->first; 1039: break; 1040: } 1041: } 1042: 1043: /* Calculate "next" for the node NODE. */ 1044: 1045: static void 1046: calc_next (dfa, node) 1047: re_dfa_t *dfa; 1048: bin_tree_t *node; 1049: { 1050: int idx, type; 1051: bin_tree_t *parent = node->parent; 1052: if (parent == NULL) 1053: { 1054: node->next = -1; 1055: idx = node->node_idx; 1056: if (node->type == 0) 1057: dfa->nexts[idx] = node->next; 1058: return; 1059: } 1060: 1061: idx = parent->node_idx; 1062: type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type; 1063: 1064: switch (type) 1065: { 1066: case OP_DUP_ASTERISK: 1067: case OP_DUP_PLUS: 1068: node->next = idx; 1069: break; 1070: case CONCAT: 1071: if (parent->left == node) 1072: { 1073: if (parent->right->first == -1) 1074: calc_first (dfa, parent->right); 1075: node->next = parent->right->first; 1076: break; 1077: } 1078: /* else fall through */ 1079: default: 1080: if (parent->next == -1) 1081: calc_next (dfa, parent); 1082: node->next = parent->next; 1083: break; 1084: } 1085: idx = node->node_idx; 1086: if (node->type == 0) 1087: dfa->nexts[idx] = node->next; 1088: } 1089: 1090: /* Calculate "edest" for the node NODE. */ 1091: 1092: static void 1093: calc_epsdest (dfa, node) 1094: re_dfa_t *dfa; 1095: bin_tree_t *node; 1096: { 1097: int idx; 1098: idx = node->node_idx; 1099: if (node->type == 0) 1100: { 1101: if (dfa->nodes[idx].type == OP_DUP_ASTERISK 1102: || dfa->nodes[idx].type == OP_DUP_PLUS 1103: || dfa->nodes[idx].type == OP_DUP_QUESTION) 1104: { 1105: if (node->left->first == -1) 1106: calc_first (dfa, node->left); 1107: if (node->next == -1) 1108: calc_next (dfa, node); 1109: re_node_set_init_2 (dfa->edests + idx, node->left->first, 1110: node->next); 1111: } 1112: else if (dfa->nodes[idx].type == OP_ALT) 1113: { 1114: int left, right; 1115: if (node->left != NULL) 1116: { 1117: if (node->left->first == -1) 1118: calc_first (dfa, node->left); 1119: left = node->left->first; 1120: } 1121: else 1122: { 1123: if (node->next == -1) 1124: calc_next (dfa, node); 1125: left = node->next; 1126: } 1127: if (node->right != NULL) 1128: { 1129: if (node->right->first == -1) 1130: calc_first (dfa, node->right); 1131: right = node->right->first; 1132: } 1133: else 1134: { 1135: if (node->next == -1) 1136: calc_next (dfa, node); 1137: right = node->next; 1138: } 1139: re_node_set_init_2 (dfa->edests + idx, left, right); 1140: } 1141: else if (dfa->nodes[idx].type == ANCHOR 1142: || dfa->nodes[idx].type == OP_OPEN_SUBEXP 1143: || dfa->nodes[idx].type == OP_CLOSE_SUBEXP 1144: || dfa->nodes[idx].type == OP_BACK_REF) 1145: re_node_set_init_1 (dfa->edests + idx, node->next); 1146: } 1147: } 1148: 1149: /* Duplicate the epsilon closure of the node ROOT_NODE. 1150: Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1151: to their own constraint. */ 1152: 1153: static reg_errcode_t 1154: duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node, 1155: init_constraint) 1156: re_dfa_t *dfa; 1157: int top_org_node, top_clone_node, root_node; 1158: unsigned int init_constraint; 1159: { 1160: reg_errcode_t err; 1161: int org_node, clone_node, ret; 1162: unsigned int constraint = init_constraint; 1163: for (org_node = top_org_node, clone_node = top_clone_node;;) 1164: { 1165: int org_dest, clone_dest; 1166: if (dfa->nodes[org_node].type == OP_BACK_REF) 1167: { 1168: /* If the back reference epsilon-transit, its destination must 1169: also have the constraint. Then duplicate the epsilon closure 1170: of the destination of the back reference, and store it in 1171: edests of the back reference. */ 1172: org_dest = dfa->nexts[org_node]; 1173: re_node_set_empty (dfa->edests + clone_node); 1174: err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1175: if (BE (err != REG_NOERROR, 0)) 1176: return err; 1177: dfa->nexts[clone_node] = dfa->nexts[org_node]; 1178: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1179: if (BE (ret < 0, 0)) 1180: return REG_ESPACE; 1181: } 1182: else if (dfa->edests[org_node].nelem == 0) 1183: { 1184: /* In case of the node can't epsilon-transit, don't duplicate the 1185: destination and store the original destination as the 1186: destination of the node. */ 1187: dfa->nexts[clone_node] = dfa->nexts[org_node]; 1188: break; 1189: } 1190: else if (dfa->edests[org_node].nelem == 1) 1191: { 1192: /* In case of the node can epsilon-transit, and it has only one 1193: destination. */ 1194: org_dest = dfa->edests[org_node].elems[0]; 1195: re_node_set_empty (dfa->edests + clone_node); 1196: if (dfa->nodes[org_node].type == ANCHOR) 1197: { 1198: /* In case of the node has another constraint, append it. */ 1199: if (org_node == root_node && clone_node != org_node) 1200: { 1201: /* ...but if the node is root_node itself, it means the 1202: epsilon closure have a loop, then tie it to the 1203: destination of the root_node. */ 1204: ret = re_node_set_insert (dfa->edests + clone_node, 1205: org_dest); 1206: if (BE (ret < 0, 0)) 1207: return REG_ESPACE; 1208: break; 1209: } 1210: constraint |= dfa->nodes[org_node].opr.ctx_type; 1211: } 1212: err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1213: if (BE (err != REG_NOERROR, 0)) 1214: return err; 1215: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1216: if (BE (ret < 0, 0)) 1217: return REG_ESPACE; 1218: } 1219: else /* dfa->edests[org_node].nelem == 2 */ 1220: { 1221: /* In case of the node can epsilon-transit, and it has two 1222: destinations. E.g. '|', '*', '+', '?'. */ 1223: org_dest = dfa->edests[org_node].elems[0]; 1224: re_node_set_empty (dfa->edests + clone_node); 1225: /* Search for a duplicated node which satisfies the constraint. */ 1226: clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1227: if (clone_dest == -1) 1228: { 1229: /* There are no such a duplicated node, create a new one. */ 1230: err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1231: if (BE (err != REG_NOERROR, 0)) 1232: return err; 1233: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1234: if (BE (ret < 0, 0)) 1235: return REG_ESPACE; 1236: err = duplicate_node_closure (dfa, org_dest, clone_dest, 1237: root_node, constraint); 1238: if (BE (err != REG_NOERROR, 0)) 1239: return err; 1240: } 1241: else 1242: { 1243: /* There are a duplicated node which satisfy the constraint, 1244: use it to avoid infinite loop. */ 1245: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1246: if (BE (ret < 0, 0)) 1247: return REG_ESPACE; 1248: } 1249: 1250: org_dest = dfa->edests[org_node].elems[1]; 1251: err = duplicate_node (&clone_dest, dfa, org_dest, constraint); 1252: if (BE (err != REG_NOERROR, 0)) 1253: return err; 1254: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1255: if (BE (ret < 0, 0)) 1256: return REG_ESPACE; 1257: } 1258: org_node = org_dest; 1259: clone_node = clone_dest; 1260: } 1261: return REG_NOERROR; 1262: } 1263: 1264: /* Search for a node which is duplicated from the node ORG_NODE, and 1265: satisfies the constraint CONSTRAINT. */ 1266: 1267: static int 1268: search_duplicated_node (dfa, org_node, constraint) 1269: re_dfa_t *dfa; 1270: int org_node; 1271: unsigned int constraint; 1272: { 1273: int idx; 1274: for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1275: { 1276: if (org_node == dfa->org_indices[idx] 1277: && constraint == dfa->nodes[idx].constraint) 1278: return idx; /* Found. */ 1279: } 1280: return -1; /* Not found. */ 1281: } 1282: 1283: /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1284: The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded, 1285: otherwise return the error code. */ 1286: 1287: static reg_errcode_t 1288: duplicate_node (new_idx, dfa, org_idx, constraint) 1289: re_dfa_t *dfa; 1290: int *new_idx, org_idx; 1291: unsigned int constraint; 1292: { 1293: re_token_t dup; 1294: int dup_idx; 1295: 1296: dup = dfa->nodes[org_idx]; 1297: dup_idx = re_dfa_add_node (dfa, dup, 1); 1298: if (BE (dup_idx == -1, 0)) 1299: return REG_ESPACE; 1300: dfa->nodes[dup_idx].constraint = constraint; 1301: if (dfa->nodes[org_idx].type == ANCHOR) 1302: dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type; 1303: dfa->nodes[dup_idx].duplicated = 1; 1304: re_node_set_init_empty (dfa->edests + dup_idx); 1305: re_node_set_init_empty (dfa->eclosures + dup_idx); 1306: re_node_set_init_empty (dfa->inveclosures + dup_idx); 1307: 1308: /* Store the index of the original node. */ 1309: dfa->org_indices[dup_idx] = org_idx; 1310: *new_idx = dup_idx; 1311: return REG_NOERROR; 1312: } 1313: 1314: static void 1315: calc_inveclosure (dfa) 1316: re_dfa_t *dfa; 1317: { 1318: int src, idx, dest; 1319: for (src = 0; src < dfa->nodes_len; ++src) 1320: { 1321: for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1322: { 1323: dest = dfa->eclosures[src].elems[idx]; 1324: re_node_set_insert (dfa->inveclosures + dest, src); 1325: } 1326: } 1327: } 1328: 1329: /* Calculate "eclosure" for all the node in DFA. */ 1330: 1331: static reg_errcode_t 1332: calc_eclosure (dfa) 1333: re_dfa_t *dfa; 1334: { 1335: int node_idx, incomplete; 1336: #ifdef DEBUG 1337: assert (dfa->nodes_len > 0); 1338: #endif 1339: incomplete = 0; 1340: /* For each nodes, calculate epsilon closure. */ 1341: for (node_idx = 0; ; ++node_idx) 1342: { 1343: reg_errcode_t err; 1344: re_node_set eclosure_elem; 1345: if (node_idx == dfa->nodes_len) 1346: { 1347: if (!incomplete) 1348: break; 1349: incomplete = 0; 1350: node_idx = 0; 1351: } 1352: 1353: #ifdef DEBUG 1354: assert (dfa->eclosures[node_idx].nelem != -1); 1355: #endif 1356: /* If we have already calculated, skip it. */ 1357: if (dfa->eclosures[node_idx].nelem != 0) 1358: continue; 1359: /* Calculate epsilon closure of `node_idx'. */ 1360: err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1); 1361: if (BE (err != REG_NOERROR, 0)) 1362: return err; 1363: 1364: if (dfa->eclosures[node_idx].nelem == 0) 1365: { 1366: incomplete = 1; 1367: re_node_set_free (&eclosure_elem); 1368: } 1369: } 1370: return REG_NOERROR; 1371: } 1372: 1373: /* Calculate epsilon closure of NODE. */ 1374: 1375: static reg_errcode_t 1376: calc_eclosure_iter (new_set, dfa, node, root) 1377: re_node_set *new_set; 1378: re_dfa_t *dfa; 1379: int node, root; 1380: { 1381: reg_errcode_t err; 1382: unsigned int constraint; 1383: int i, incomplete; 1384: re_node_set eclosure; 1385: incomplete = 0; 1386: err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1387: if (BE (err != REG_NOERROR, 0)) 1388: return err; 1389: 1390: /* This indicates that we are calculating this node now. 1391: We reference this value to avoid infinite loop. */ 1392: dfa->eclosures[node].nelem = -1; 1393: 1394: constraint = ((dfa->nodes[node].type == ANCHOR) 1395: ? dfa->nodes[node].opr.ctx_type : 0); 1396: /* If the current node has constraints, duplicate all nodes. 1397: Since they must inherit the constraints. */ 1398: if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1399: { 1400: int org_node, cur_node; 1401: org_node = cur_node = node; 1402: err = duplicate_node_closure (dfa, node, node, node, constraint); 1403: if (BE (err != REG_NOERROR, 0)) 1404: return err; 1405: } 1406: 1407: /* Expand each epsilon destination nodes. */ 1408: if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1409: for (i = 0; i < dfa->edests[node].nelem; ++i) 1410: { 1411: re_node_set eclosure_elem; 1412: int edest = dfa->edests[node].elems[i]; 1413: /* If calculating the epsilon closure of `edest' is in progress, 1414: return intermediate result. */ 1415: if (dfa->eclosures[edest].nelem == -1) 1416: { 1417: incomplete = 1; 1418: continue; 1419: } 1420: /* If we haven't calculated the epsilon closure of `edest' yet, 1421: calculate now. Otherwise use calculated epsilon closure. */ 1422: if (dfa->eclosures[edest].nelem == 0) 1423: { 1424: err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0); 1425: if (BE (err != REG_NOERROR, 0)) 1426: return err; 1427: } 1428: else 1429: eclosure_elem = dfa->eclosures[edest]; 1430: /* Merge the epsilon closure of `edest'. */ 1431: re_node_set_merge (&eclosure, &eclosure_elem); 1432: /* If the epsilon closure of `edest' is incomplete, 1433: the epsilon closure of this node is also incomplete. */ 1434: if (dfa->eclosures[edest].nelem == 0) 1435: { 1436: incomplete = 1; 1437: re_node_set_free (&eclosure_elem); 1438: } 1439: } 1440: 1441: /* Epsilon closures include itself. */ 1442: re_node_set_insert (&eclosure, node); 1443: if (incomplete && !root) 1444: dfa->eclosures[node].nelem = 0; 1445: else 1446: dfa->eclosures[node] = eclosure; 1447: *new_set = eclosure; 1448: return REG_NOERROR; 1449: } 1450: 1451: /* Functions for token which are used in the parser. */ 1452: 1453: /* Fetch a token from INPUT. 1454: We must not use this function inside bracket expressions. */ 1455: 1456: static re_token_t 1457: fetch_token (input, syntax) 1458: re_string_t *input; 1459: reg_syntax_t syntax; 1460: { 1461: re_token_t token; 1462: int consumed_byte; 1463: consumed_byte = peek_token (&token, input, syntax); 1464: re_string_skip_bytes (input, consumed_byte); 1465: return token; 1466: } 1467: 1468: /* Peek a token from INPUT, and return the length of the token. 1469: We must not use this function inside bracket expressions. */ 1470: 1471: static int 1472: peek_token (token, input, syntax) 1473: re_token_t *token; 1474: re_string_t *input; 1475: reg_syntax_t syntax; 1476: { 1477: unsigned char c; 1478: 1479: if (re_string_eoi (input)) 1480: { 1481: token->type = END_OF_RE; 1482: return 0; 1483: } 1484: 1485: c = re_string_peek_byte (input, 0); 1486: token->opr.c = c; 1487: 1488: #ifdef RE_ENABLE_I18N 1489: token->mb_partial = 0; 1490: if (MB_CUR_MAX > 1 && 1491: !re_string_first_byte (input, re_string_cur_idx (input))) 1492: { 1493: token->type = CHARACTER; 1494: token->mb_partial = 1; 1495: return 1; 1496: } 1497: #endif 1498: if (c == '\\') 1499: { 1500: unsigned char c2; 1501: if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1502: { 1503: token->type = BACK_SLASH; 1504: return 1; 1505: } 1506: 1507: c2 = re_string_peek_byte_case (input, 1); 1508: token->opr.c = c2; 1509: token->type = CHARACTER; 1510: switch (c2) 1511: { 1512: case '|': 1513: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1514: token->type = OP_ALT; 1515: break; 1516: case '1': case '2': case '3': case '4': case '5': 1517: case '6': case '7': case '8': case '9': 1518: if (!(syntax & RE_NO_BK_REFS)) 1519: { 1520: token->type = OP_BACK_REF; 1521: token->opr.idx = c2 - '0'; 1522: } 1523: break; 1524: case '<': 1525: if (!(syntax & RE_NO_GNU_OPS)) 1526: { 1527: token->type = ANCHOR; 1528: token->opr.idx = WORD_FIRST; 1529: } 1530: break; 1531: case '>': 1532: if (!(syntax & RE_NO_GNU_OPS)) 1533: { 1534: token->type = ANCHOR; 1535: token->opr.idx = WORD_LAST; 1536: } 1537: break; 1538: case 'b': 1539: if (!(syntax & RE_NO_GNU_OPS)) 1540: { 1541: token->type = ANCHOR; 1542: token->opr.idx = WORD_DELIM; 1543: } 1544: break; 1545: case 'B': 1546: if (!(syntax & RE_NO_GNU_OPS)) 1547: { 1548: token->type = ANCHOR; 1549: token->opr.idx = INSIDE_WORD; 1550: } 1551: break; 1552: case 'w': 1553: if (!(syntax & RE_NO_GNU_OPS)) 1554: token->type = OP_WORD; 1555: break; 1556: case 'W': 1557: if (!(syntax & RE_NO_GNU_OPS)) 1558: token->type = OP_NOTWORD; 1559: break; 1560: case '`': 1561: if (!(syntax & RE_NO_GNU_OPS)) 1562: { 1563: token->type = ANCHOR; 1564: token->opr.idx = BUF_FIRST; 1565: } 1566: break; 1567: case '\'': 1568: if (!(syntax & RE_NO_GNU_OPS)) 1569: { 1570: token->type = ANCHOR; 1571: token->opr.idx = BUF_LAST; 1572: } 1573: break; 1574: case '(': 1575: if (!(syntax & RE_NO_BK_PARENS)) 1576: token->type = OP_OPEN_SUBEXP; 1577: break; 1578: case ')': 1579: if (!(syntax & RE_NO_BK_PARENS)) 1580: token->type = OP_CLOSE_SUBEXP; 1581: break; 1582: case '+': 1583: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1584: token->type = OP_DUP_PLUS; 1585: break; 1586: case '?': 1587: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1588: token->type = OP_DUP_QUESTION; 1589: break; 1590: case '{': 1591: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1592: token->type = OP_OPEN_DUP_NUM; 1593: break; 1594: case '}': 1595: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1596: token->type = OP_CLOSE_DUP_NUM; 1597: break; 1598: default: 1599: break; 1600: } 1601: return 2; 1602: } 1603: 1604: token->type = CHARACTER; 1605: switch (c) 1606: { 1607: case '\n': 1608: if (syntax & RE_NEWLINE_ALT) 1609: token->type = OP_ALT; 1610: break; 1611: case '|': 1612: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1613: token->type = OP_ALT; 1614: break; 1615: case '*': 1616: token->type = OP_DUP_ASTERISK; 1617: break; 1618: case '+': 1619: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1620: token->type = OP_DUP_PLUS; 1621: break; 1622: case '?': 1623: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1624: token->type = OP_DUP_QUESTION; 1625: break; 1626: case '{': 1627: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1628: token->type = OP_OPEN_DUP_NUM; 1629: break; 1630: case '}': 1631: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1632: token->type = OP_CLOSE_DUP_NUM; 1633: break; 1634: case '(': 1635: if (syntax & RE_NO_BK_PARENS) 1636: token->type = OP_OPEN_SUBEXP; 1637: break; 1638: case ')': 1639: if (syntax & RE_NO_BK_PARENS) 1640: token->type = OP_CLOSE_SUBEXP; 1641: break; 1642: case '[': 1643: token->type = OP_OPEN_BRACKET; 1644: break; 1645: case '.': 1646: token->type = OP_PERIOD; 1647: break; 1648: case '^': 1649: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1650: re_string_cur_idx (input) != 0) 1651: { 1652: char prev = re_string_peek_byte (input, -1); 1653: if (prev != '|' && prev != '(' && 1654: (!(syntax & RE_NEWLINE_ALT) || prev != '\n')) 1655: break; 1656: } 1657: token->type = ANCHOR; 1658: token->opr.idx = LINE_FIRST; 1659: break; 1660: case '$': 1661: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1662: re_string_cur_idx (input) + 1 != re_string_length (input)) 1663: { 1664: re_token_t next; 1665: re_string_skip_bytes (input, 1); 1666: peek_token (&next, input, syntax); 1667: re_string_skip_bytes (input, -1); 1668: if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1669: break; 1670: } 1671: token->type = ANCHOR; 1672: token->opr.idx = LINE_LAST; 1673: break; 1674: default: 1675: break; 1676: } 1677: return 1; 1678: } 1679: 1680: /* Peek a token from INPUT, and return the length of the token. 1681: We must not use this function out of bracket expressions. */ 1682: 1683: static int 1684: peek_token_bracket (token, input, syntax) 1685: re_token_t *token; 1686: re_string_t *input; 1687: reg_syntax_t syntax; 1688: { 1689: unsigned char c; 1690: if (re_string_eoi (input)) 1691: { 1692: token->type = END_OF_RE; 1693: return 0; 1694: } 1695: c = re_string_peek_byte (input, 0); 1696: token->opr.c = c; 1697: 1698: #ifdef RE_ENABLE_I18N 1699: if (MB_CUR_MAX > 1 && 1700: !re_string_first_byte (input, re_string_cur_idx (input))) 1701: { 1702: token->type = CHARACTER; 1703: return 1; 1704: } 1705: #endif /* RE_ENABLE_I18N */ 1706: 1707: if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)) 1708: { 1709: /* In this case, '\' escape a character. */ 1710: unsigned char c2; 1711: re_string_skip_bytes (input, 1); 1712: c2 = re_string_peek_byte (input, 0); 1713: token->opr.c = c2; 1714: token->type = CHARACTER; 1715: return 1; 1716: } 1717: if (c == '[') /* '[' is a special char in a bracket exps. */ 1718: { 1719: unsigned char c2; 1720: int token_len; 1721: c2 = re_string_peek_byte (input, 1); 1722: token->opr.c = c2; 1723: token_len = 2; 1724: switch (c2) 1725: { 1726: case '.': 1727: token->type = OP_OPEN_COLL_ELEM; 1728: break; 1729: case '=': 1730: token->type = OP_OPEN_EQUIV_CLASS; 1731: break; 1732: case ':': 1733: if (syntax & RE_CHAR_CLASSES) 1734: { 1735: token->type = OP_OPEN_CHAR_CLASS; 1736: break; 1737: } 1738: /* else fall through. */ 1739: default: 1740: token->type = CHARACTER; 1741: token->opr.c = c; 1742: token_len = 1; 1743: break; 1744: } 1745: return token_len; 1746: } 1747: switch (c) 1748: { 1749: case '-': 1750: token->type = OP_CHARSET_RANGE; 1751: break; 1752: case ']': 1753: token->type = OP_CLOSE_BRACKET; 1754: break; 1755: case '^': 1756: token->type = OP_NON_MATCH_LIST; 1757: break; 1758: default: 1759: token->type = CHARACTER; 1760: } 1761: return 1; 1762: } 1763: 1764: /* Functions for parser. */ 1765: 1766: /* Entry point of the parser. 1767: Parse the regular expression REGEXP and return the structure tree. 1768: If an error is occured, ERR is set by error code, and return NULL. 1769: This function build the following tree, from regular expression <reg_exp>: 1770: CAT 1771: / \ 1772: / \ 1773: <reg_exp> EOR 1774: 1775: CAT means concatenation. 1776: EOR means end of regular expression. */ 1777: 1778: static bin_tree_t * 1779: parse (regexp, preg, syntax, err) 1780: re_string_t *regexp; 1781: regex_t *preg; 1782: reg_syntax_t syntax; 1783: reg_errcode_t *err; 1784: { 1785: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1786: bin_tree_t *tree, *eor, *root; 1787: re_token_t current_token; 1788: int new_idx; 1789: current_token = fetch_token (regexp, syntax); 1790: tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err); 1791: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1792: return NULL; 1793: new_idx = re_dfa_add_node (dfa, current_token, 0); 1794: eor = create_tree (NULL, NULL, 0, new_idx); 1795: if (tree != NULL) 1796: root = create_tree (tree, eor, CONCAT, 0); 1797: else 1798: root = eor; 1799: if (BE (new_idx == -1 || eor == NULL || root == NULL, 0)) 1800: { 1801: *err = REG_ESPACE; 1802: return NULL; 1803: } 1804: return root; 1805: } 1806: 1807: /* This function build the following tree, from regular expression 1808: <branch1>|<branch2>: 1809: ALT 1810: / \ 1811: / \ 1812: <branch1> <branch2> 1813: 1814: ALT means alternative, which represents the operator `|'. */ 1815: 1816: static bin_tree_t * 1817: parse_reg_exp (regexp, preg, token, syntax, nest, err) 1818: re_string_t *regexp; 1819: regex_t *preg; 1820: re_token_t *token; 1821: reg_syntax_t syntax; 1822: int nest; 1823: reg_errcode_t *err; 1824: { 1825: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1826: bin_tree_t *tree, *branch = NULL; 1827: int new_idx; 1828: tree = parse_branch (regexp, preg, token, syntax, nest, err); 1829: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1830: return NULL; 1831: 1832: while (token->type == OP_ALT) 1833: { 1834: re_token_t alt_token = *token; 1835: new_idx = re_dfa_add_node (dfa, alt_token, 0); 1836: *token = fetch_token (regexp, syntax); 1837: if (token->type != OP_ALT && token->type != END_OF_RE 1838: && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 1839: { 1840: branch = parse_branch (regexp, preg, token, syntax, nest, err); 1841: if (BE (*err != REG_NOERROR && branch == NULL, 0)) 1842: { 1843: free_bin_tree (tree); 1844: return NULL; 1845: } 1846: } 1847: else 1848: branch = NULL; 1849: tree = create_tree (tree, branch, 0, new_idx); 1850: if (BE (new_idx == -1 || tree == NULL, 0)) 1851: { 1852: *err = REG_ESPACE; 1853: return NULL; 1854: } 1855: dfa->has_plural_match = 1; 1856: } 1857: return tree; 1858: } 1859: 1860: /* This function build the following tree, from regular expression 1861: <exp1><exp2>: 1862: CAT 1863: / \ 1864: / \ 1865: <exp1> <exp2> 1866: 1867: CAT means concatenation. */ 1868: 1869: static bin_tree_t * 1870: parse_branch (regexp, preg, token, syntax, nest, err) 1871: re_string_t *regexp; 1872: regex_t *preg; 1873: re_token_t *token; 1874: reg_syntax_t syntax; 1875: int nest; 1876: reg_errcode_t *err; 1877: { 1878: bin_tree_t *tree, *exp; 1879: tree = parse_expression (regexp, preg, token, syntax, nest, err); 1880: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1881: return NULL; 1882: 1883: while (token->type != OP_ALT && token->type != END_OF_RE 1884: && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 1885: { 1886: exp = parse_expression (regexp, preg, token, syntax, nest, err); 1887: if (BE (*err != REG_NOERROR && exp == NULL, 0)) 1888: { 1889: free_bin_tree (tree); 1890: return NULL; 1891: } 1892: if (tree != NULL && exp != NULL) 1893: { 1894: tree = create_tree (tree, exp, CONCAT, 0); 1895: if (tree == NULL) 1896: { 1897: *err = REG_ESPACE; 1898: return NULL; 1899: } 1900: } 1901: else if (tree == NULL) 1902: tree = exp; 1903: /* Otherwise exp == NULL, we don't need to create new tree. */ 1904: } 1905: return tree; 1906: } 1907: 1908: /* This function build the following tree, from regular expression a*: 1909: * 1910: | 1911: a 1912: */ 1913: 1914: static bin_tree_t * 1915: parse_expression (regexp, preg, token, syntax, nest, err) 1916: re_string_t *regexp; 1917: regex_t *preg; 1918: re_token_t *token; 1919: reg_syntax_t syntax; 1920: int nest; 1921: reg_errcode_t *err; 1922: { 1923: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1924: bin_tree_t *tree; 1925: int new_idx; 1926: switch (token->type) 1927: { 1928: case CHARACTER: 1929: new_idx = re_dfa_add_node (dfa, *token, 0); 1930: tree = create_tree (NULL, NULL, 0, new_idx); 1931: if (BE (new_idx == -1 || tree == NULL, 0)) 1932: { 1933: *err = REG_ESPACE; 1934: return NULL; 1935: } 1936: #ifdef RE_ENABLE_I18N 1937: if (MB_CUR_MAX > 1) 1938: { 1939: while (!re_string_eoi (regexp) 1940: && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 1941: { 1942: bin_tree_t *mbc_remain; 1943: *token = fetch_token (regexp, syntax); 1944: new_idx = re_dfa_add_node (dfa, *token, 0); 1945: mbc_remain = create_tree (NULL, NULL, 0, new_idx); 1946: tree = create_tree (tree, mbc_remain, CONCAT, 0); 1947: if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0)) 1948: { 1949: *err = REG_ESPACE; 1950: return NULL; 1951: } 1952: } 1953: } 1954: #endif 1955: break; 1956: case OP_OPEN_SUBEXP: 1957: tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 1958: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1959: return NULL; 1960: break; 1961: case OP_OPEN_BRACKET: 1962: tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 1963: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 1964: return NULL; 1965: break; 1966: case OP_BACK_REF: 1967: if (BE (preg->re_nsub < token->opr.idx 1968: || dfa->subexps[token->opr.idx - 1].end == -1, 0)) 1969: { 1970: *err = REG_ESUBREG; 1971: return NULL; 1972: } 1973: dfa->used_bkref_map |= 1 << (token->opr.idx - 1); 1974: new_idx = re_dfa_add_node (dfa, *token, 0); 1975: tree = create_tree (NULL, NULL, 0, new_idx); 1976: if (BE (new_idx == -1 || tree == NULL, 0)) 1977: { 1978: *err = REG_ESPACE; 1979: return NULL; 1980: } 1981: ++dfa->nbackref; 1982: dfa->has_mb_node = 1; 1983: break; 1984: case OP_DUP_ASTERISK: 1985: case OP_DUP_PLUS: 1986: case OP_DUP_QUESTION: 1987: case OP_OPEN_DUP_NUM: 1988: if (syntax & RE_CONTEXT_INVALID_OPS) 1989: { 1990: *err = REG_BADRPT; 1991: return NULL; 1992: } 1993: else if (syntax & RE_CONTEXT_INDEP_OPS) 1994: { 1995: *token = fetch_token (regexp, syntax); 1996: return parse_expression (regexp, preg, token, syntax, nest, err); 1997: } 1998: /* else fall through */ 1999: case OP_CLOSE_SUBEXP: 2000: if ((token->type == OP_CLOSE_SUBEXP) && 2001: !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2002: { 2003: *err = REG_ERPAREN; 2004: return NULL; 2005: } 2006: /* else fall through */ 2007: case OP_CLOSE_DUP_NUM: 2008: /* We treat it as a normal character. */ 2009: 2010: /* Then we can these characters as normal characters. */ 2011: token->type = CHARACTER; 2012: new_idx = re_dfa_add_node (dfa, *token, 0); 2013: tree = create_tree (NULL, NULL, 0, new_idx); 2014: if (BE (new_idx == -1 || tree == NULL, 0)) 2015: { 2016: *err = REG_ESPACE; 2017: return NULL; 2018: } 2019: break; 2020: case ANCHOR: 2021: if (dfa->word_char == NULL) 2022: { 2023: *err = init_word_char (dfa); 2024: if (BE (*err != REG_NOERROR, 0)) 2025: return NULL; 2026: } 2027: if (token->opr.ctx_type == WORD_DELIM) 2028: { 2029: bin_tree_t *tree_first, *tree_last; 2030: int idx_first, idx_last; 2031: token->opr.ctx_type = WORD_FIRST; 2032: idx_first = re_dfa_add_node (dfa, *token, 0); 2033: tree_first = create_tree (NULL, NULL, 0, idx_first); 2034: token->opr.ctx_type = WORD_LAST; 2035: idx_last = re_dfa_add_node (dfa, *token, 0); 2036: tree_last = create_tree (NULL, NULL, 0, idx_last); 2037: token->type = OP_ALT; 2038: new_idx = re_dfa_add_node (dfa, *token, 0); 2039: tree = create_tree (tree_first, tree_last, 0, new_idx); 2040: if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1 2041: || tree_first == NULL || tree_last == NULL 2042: || tree == NULL, 0)) 2043: { 2044: *err = REG_ESPACE; 2045: return NULL; 2046: } 2047: } 2048: else 2049: { 2050: new_idx = re_dfa_add_node (dfa, *token, 0); 2051: tree = create_tree (NULL, NULL, 0, new_idx); 2052: if (BE (new_idx == -1 || tree == NULL, 0)) 2053: { 2054: *err = REG_ESPACE; 2055: return NULL; 2056: } 2057: } 2058: /* We must return here, since ANCHORs can't be followed 2059: by repetition operators. 2060: eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2061: it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2062: *token = fetch_token (regexp, syntax); 2063: return tree; 2064: case OP_PERIOD: 2065: new_idx = re_dfa_add_node (dfa, *token, 0); 2066: tree = create_tree (NULL, NULL, 0, new_idx); 2067: if (BE (new_idx == -1 || tree == NULL, 0)) 2068: { 2069: *err = REG_ESPACE; 2070: return NULL; 2071: } 2072: if (MB_CUR_MAX > 1) 2073: dfa->has_mb_node = 1; 2074: break; 2075: case OP_WORD: 2076: tree = build_word_op (dfa, 0, err); 2077: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2078: return NULL; 2079: break; 2080: case OP_NOTWORD: 2081: tree = build_word_op (dfa, 1, err); 2082: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2083: return NULL; 2084: break; 2085: case OP_ALT: 2086: case END_OF_RE: 2087: return NULL; 2088: case BACK_SLASH: 2089: *err = REG_EESCAPE; 2090: return NULL; 2091: default: 2092: /* Must not happen? */ 2093: #ifdef DEBUG 2094: assert (0); 2095: #endif 2096: return NULL; 2097: } 2098: *token = fetch_token (regexp, syntax); 2099: 2100: while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2101: || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2102: { 2103: tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2104: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2105: return NULL; 2106: dfa->has_plural_match = 1; 2107: } 2108: 2109: return tree; 2110: } 2111: 2112: /* This function build the following tree, from regular expression 2113: (<reg_exp>): 2114: SUBEXP 2115: | 2116: <reg_exp> 2117: */ 2118: 2119: static bin_tree_t * 2120: parse_sub_exp (regexp, preg, token, syntax, nest, err) 2121: re_string_t *regexp; 2122: regex_t *preg; 2123: re_token_t *token; 2124: reg_syntax_t syntax; 2125: int nest; 2126: reg_errcode_t *err; 2127: { 2128: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2129: bin_tree_t *tree, *left_par, *right_par; 2130: size_t cur_nsub; 2131: int new_idx; 2132: cur_nsub = preg->re_nsub++; 2133: if (dfa->subexps_alloc < preg->re_nsub) 2134: { 2135: re_subexp_t *new_array; 2136: dfa->subexps_alloc *= 2; 2137: new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc); 2138: if (BE (new_array == NULL, 0)) 2139: { 2140: dfa->subexps_alloc /= 2; 2141: *err = REG_ESPACE; 2142: return NULL; 2143: } 2144: dfa->subexps = new_array; 2145: } 2146: dfa->subexps[cur_nsub].start = dfa->nodes_len; 2147: dfa->subexps[cur_nsub].end = -1; 2148: 2149: new_idx = re_dfa_add_node (dfa, *token, 0); 2150: left_par = create_tree (NULL, NULL, 0, new_idx); 2151: if (BE (new_idx == -1 || left_par == NULL, 0)) 2152: { 2153: *err = REG_ESPACE; 2154: return NULL; 2155: } 2156: dfa->nodes[new_idx].opr.idx = cur_nsub; 2157: *token = fetch_token (regexp, syntax); 2158: 2159: /* The subexpression may be a null string. */ 2160: if (token->type == OP_CLOSE_SUBEXP) 2161: tree = NULL; 2162: else 2163: { 2164: tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2165: if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2166: return NULL; 2167: } 2168: if (BE (token->type != OP_CLOSE_SUBEXP, 0)) 2169: { 2170: free_bin_tree (tree); 2171: *err = REG_BADPAT; 2172: return NULL; 2173: } 2174: new_idx = re_dfa_add_node (dfa, *token, 0); 2175: dfa->subexps[cur_nsub].end = dfa->nodes_len; 2176: right_par = create_tree (NULL, NULL, 0, new_idx); 2177: tree = ((tree == NULL) ? right_par 2178: : create_tree (tree, right_par, CONCAT, 0)); 2179: tree = create_tree (left_par, tree, CONCAT, 0); 2180: if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0)) 2181: { 2182: *err = REG_ESPACE; 2183: return NULL; 2184: } 2185: dfa->nodes[new_idx].opr.idx = cur_nsub; 2186: 2187: return tree; 2188: } 2189: 2190: /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2191: 2192: static bin_tree_t * 2193: parse_dup_op (dup_elem, regexp, dfa, token, syntax, err) 2194: bin_tree_t *dup_elem; 2195: re_string_t *regexp; 2196: re_dfa_t *dfa; 2197: re_token_t *token; 2198: reg_syntax_t syntax; 2199: reg_errcode_t *err; 2200: { 2201: re_token_t dup_token; 2202: bin_tree_t *tree = dup_elem, *work_tree; 2203: int new_idx, start_idx = re_string_cur_idx (regexp); 2204: re_token_t start_token = *token; 2205: if (token->type == OP_OPEN_DUP_NUM) 2206: { 2207: int i; 2208: int end = 0; 2209: int start = fetch_number (regexp, token, syntax); 2210: bin_tree_t *elem; 2211: if (start == -1) 2212: { 2213: if (token->type == CHARACTER && token->opr.c == ',') 2214: start = 0; /* We treat "{,m}" as "{0,m}". */ 2215: else 2216: { 2217: *err = REG_BADBR; /* <re>{} is invalid. */ 2218: return NULL; 2219: } 2220: } 2221: if (BE (start != -2, 1)) 2222: { 2223: /* We treat "{n}" as "{n,n}". */ 2224: end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2225: : ((token->type == CHARACTER && token->opr.c == ',') 2226: ? fetch_number (regexp, token, syntax) : -2)); 2227: } 2228: if (BE (start == -2 || end == -2, 0)) 2229: { 2230: /* Invalid sequence. */ 2231: if (token->type == OP_CLOSE_DUP_NUM) 2232: goto parse_dup_op_invalid_interval; 2233: else 2234: goto parse_dup_op_ebrace; 2235: } 2236: if (BE (start == 0 && end == 0, 0)) 2237: { 2238: /* We treat "<re>{0}" and "<re>{0,0}" as null string. */ 2239: *token = fetch_token (regexp, syntax); 2240: free_bin_tree (dup_elem); 2241: return NULL; 2242: } 2243: 2244: /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2245: elem = tree; 2246: for (i = 0; i < start; ++i) 2247: if (i != 0) 2248: { 2249: work_tree = duplicate_tree (elem, dfa); 2250: tree = create_tree (tree, work_tree, CONCAT, 0); 2251: if (BE (work_tree == NULL || tree == NULL, 0)) 2252: goto parse_dup_op_espace; 2253: } 2254: 2255: if (end == -1) 2256: { 2257: /* We treat "<re>{0,}" as "<re>*". */ 2258: dup_token.type = OP_DUP_ASTERISK; 2259: if (start > 0) 2260: { 2261: elem = duplicate_tree (elem, dfa); 2262: new_idx = re_dfa_add_node (dfa, dup_token, 0); 2263: work_tree = create_tree (elem, NULL, 0, new_idx); 2264: tree = create_tree (tree, work_tree, CONCAT, 0); 2265: if (BE (elem == NULL || new_idx == -1 || work_tree == NULL 2266: || tree == NULL, 0)) 2267: goto parse_dup_op_espace; 2268: } 2269: else 2270: { 2271: new_idx = re_dfa_add_node (dfa, dup_token, 0); 2272: tree = create_tree (elem, NULL, 0, new_idx); 2273: if (BE (new_idx == -1 || tree == NULL, 0)) 2274: goto parse_dup_op_espace; 2275: } 2276: } 2277: else if (end - start > 0) 2278: { 2279: /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */ 2280: dup_token.type = OP_DUP_QUESTION; 2281: if (start > 0) 2282: { 2283: elem = duplicate_tree (elem, dfa); 2284: new_idx = re_dfa_add_node (dfa, dup_token, 0); 2285: elem = create_tree (elem, NULL, 0, new_idx); 2286: tree = create_tree (tree, elem, CONCAT, 0); 2287: if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0)) 2288: goto parse_dup_op_espace; 2289: } 2290: else 2291: { 2292: new_idx = re_dfa_add_node (dfa, dup_token, 0); 2293: tree = elem = create_tree (elem, NULL, 0, new_idx); 2294: if (BE (new_idx == -1 || tree == NULL, 0)) 2295: goto parse_dup_op_espace; 2296: } 2297: for (i = 1; i < end - start; ++i) 2298: { 2299: work_tree = duplicate_tree (elem, dfa); 2300: tree = create_tree (tree, work_tree, CONCAT, 0); 2301: if (BE (work_tree == NULL || tree == NULL, 0)) 2302: { 2303: *err = REG_ESPACE; 2304: return NULL; 2305: } 2306: } 2307: } 2308: } 2309: else 2310: { 2311: new_idx = re_dfa_add_node (dfa, *token, 0); 2312: tree = create_tree (tree, NULL, 0, new_idx); 2313: if (BE (new_idx == -1 || tree == NULL, 0)) 2314: { 2315: *err = REG_ESPACE; 2316: return NULL; 2317: } 2318: } 2319: *token = fetch_token (regexp, syntax); 2320: return tree; 2321: 2322: parse_dup_op_espace: 2323: free_bin_tree (tree); 2324: *err = REG_ESPACE; 2325: return NULL; 2326: 2327: parse_dup_op_ebrace: 2328: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2329: { 2330: *err = REG_EBRACE; 2331: return NULL; 2332: } 2333: goto parse_dup_op_rollback; 2334: parse_dup_op_invalid_interval: 2335: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2336: { 2337: *err = REG_BADBR; 2338: return NULL; 2339: } 2340: parse_dup_op_rollback: 2341: re_string_set_index (regexp, start_idx); 2342: *token = start_token; 2343: token->type = CHARACTER; 2344: return dup_elem; 2345: } 2346: 2347: /* Size of the names for collating symbol/equivalence_class/character_class. 2348: I'm not sure, but maybe enough. */ 2349: #define BRACKET_NAME_BUF_SIZE 32 2350: 2351: #ifndef _LIBC 2352: /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2353: Build the range expression which starts from START_ELEM, and ends 2354: at END_ELEM. The result are written to MBCSET and SBCSET. 2355: RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2356: mbcset->range_ends, is a pointer argument sinse we may 2357: update it. */ 2358: 2359: static reg_errcode_t 2360: # ifdef RE_ENABLE_I18N 2361: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2362: re_charset_t *mbcset; 2363: int *range_alloc; 2364: # else /* not RE_ENABLE_I18N */ 2365: build_range_exp (sbcset, start_elem, end_elem) 2366: # endif /* not RE_ENABLE_I18N */ 2367: re_bitset_ptr_t sbcset; 2368: bracket_elem_t *start_elem, *end_elem; 2369: { 2370: unsigned int start_ch, end_ch; 2371: /* Equivalence Classes and Character Classes can't be a range start/end. */ 2372: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2373: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2374: 0)) 2375: return REG_ERANGE; 2376: 2377: /* We can handle no multi character collating elements without libc 2378: support. */ 2379: if (BE ((start_elem->type == COLL_SYM 2380: && strlen ((char *) start_elem->opr.name) > 1) 2381: || (end_elem->type == COLL_SYM 2382: && strlen ((char *) end_elem->opr.name) > 1), 0)) 2383: return REG_ECOLLATE; 2384: 2385: # ifdef RE_ENABLE_I18N 2386: { 2387: wchar_t wc, start_wc, end_wc; 2388: wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2389: 2390: start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2391: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2392: : 0)); 2393: end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2394: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2395: : 0)); 2396: start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2397: ? __btowc (start_ch) : start_elem->opr.wch); 2398: end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2399: ? __btowc (end_ch) : end_elem->opr.wch); 2400: cmp_buf[0] = start_wc; 2401: cmp_buf[4] = end_wc; 2402: if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2403: return REG_ERANGE; 2404: 2405: /* Check the space of the arrays. */ 2406: if (*range_alloc == mbcset->nranges) 2407: { 2408: /* There are not enough space, need realloc. */ 2409: wchar_t *new_array_start, *new_array_end; 2410: int new_nranges; 2411: 2412: /* +1 in case of mbcset->nranges is 0. */ 2413: new_nranges = 2 * mbcset->nranges + 1; 2414: /* Use realloc since mbcset->range_starts and mbcset->range_ends 2415: are NULL if *range_alloc == 0. */ 2416: new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2417: new_nranges); 2418: new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2419: new_nranges); 2420: 2421: if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2422: return REG_ESPACE; 2423: 2424: mbcset->range_starts = new_array_start; 2425: mbcset->range_ends = new_array_end; 2426: *range_alloc = new_nranges; 2427: } 2428: 2429: mbcset->range_starts[mbcset->nranges] = start_wc; 2430: mbcset->range_ends[mbcset->nranges++] = end_wc; 2431: 2432: /* Build the table for single byte characters. */ 2433: for (wc = 0; wc <= SBC_MAX; ++wc) 2434: { 2435: cmp_buf[2] = wc; 2436: if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2437: && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2438: bitset_set (sbcset, wc); 2439: } 2440: } 2441: # else /* not RE_ENABLE_I18N */ 2442: { 2443: unsigned int ch; 2444: start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2445: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2446: : 0)); 2447: end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2448: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2449: : 0)); 2450: if (start_ch > end_ch) 2451: return REG_ERANGE; 2452: /* Build the table for single byte characters. */ 2453: for (ch = 0; ch <= SBC_MAX; ++ch) 2454: if (start_ch <= ch && ch <= end_ch) 2455: bitset_set (sbcset, ch); 2456: } 2457: # endif /* not RE_ENABLE_I18N */ 2458: return REG_NOERROR; 2459: } 2460: #endif /* not _LIBC */ 2461: 2462: #ifndef _LIBC 2463: /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2464: Build the collating element which is represented by NAME. 2465: The result are written to MBCSET and SBCSET. 2466: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2467: pointer argument since we may update it. */ 2468: 2469: static reg_errcode_t 2470: # ifdef RE_ENABLE_I18N 2471: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2472: re_charset_t *mbcset; 2473: int *coll_sym_alloc; 2474: # else /* not RE_ENABLE_I18N */ 2475: build_collating_symbol (sbcset, name) 2476: # endif /* not RE_ENABLE_I18N */ 2477: re_bitset_ptr_t sbcset; 2478: const unsigned char *name; 2479: { 2480: size_t name_len = strlen ((const char *) name); 2481: if (BE (name_len != 1, 0)) 2482: return REG_ECOLLATE; 2483: else 2484: { 2485: bitset_set (sbcset, name[0]); 2486: return REG_NOERROR; 2487: } 2488: } 2489: #endif /* not _LIBC */ 2490: 2491: /* This function parse bracket expression like "[abc]", "[a-c]", 2492: "[[.a-a.]]" etc. */ 2493: 2494: static bin_tree_t * 2495: parse_bracket_exp (regexp, dfa, token, syntax, err) 2496: re_string_t *regexp; 2497: re_dfa_t *dfa; 2498: re_token_t *token; 2499: reg_syntax_t syntax; 2500: reg_errcode_t *err; 2501: { 2502: #ifdef _LIBC 2503: const unsigned char *collseqmb; 2504: const char *collseqwc; 2505: uint32_t nrules; 2506: int32_t table_size; 2507: const int32_t *symb_table; 2508: const unsigned char *extra; 2509: 2510: /* Local function for parse_bracket_exp used in _LIBC environement. 2511: Seek the collating symbol entry correspondings to NAME. 2512: Return the index of the symbol in the SYMB_TABLE. */ 2513: 2514: static inline int32_t 2515: seek_collating_symbol_entry (name, name_len) 2516: const unsigned char *name; 2517: size_t name_len; 2518: { 2519: int32_t hash = elem_hash ((const char *) name, name_len); 2520: int32_t elem = hash % table_size; 2521: int32_t second = hash % (table_size - 2); 2522: while (symb_table[2 * elem] != 0) 2523: { 2524: /* First compare the hashing value. */ 2525: if (symb_table[2 * elem] == hash 2526: /* Compare the length of the name. */ 2527: && name_len == extra[symb_table[2 * elem + 1]] 2528: /* Compare the name. */ 2529: && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2530: name_len) == 0) 2531: { 2532: /* Yep, this is the entry. */ 2533: break; 2534: } 2535: 2536: /* Next entry. */ 2537: elem += second; 2538: } 2539: return elem; 2540: } 2541: 2542: /* Local function for parse_bracket_exp used in _LIBC environement. 2543: Look up the collation sequence value of BR_ELEM. 2544: Return the value if succeeded, UINT_MAX otherwise. */ 2545: 2546: static inline unsigned int 2547: lookup_collation_sequence_value (br_elem) 2548: bracket_elem_t *br_elem; 2549: { 2550: if (br_elem->type == SB_CHAR) 2551: { 2552: /* 2553: if (MB_CUR_MAX == 1) 2554: */ 2555: if (nrules == 0) 2556: return collseqmb[br_elem->opr.ch]; 2557: else 2558: { 2559: wint_t wc = __btowc (br_elem->opr.ch); 2560: return collseq_table_lookup (collseqwc, wc); 2561: } 2562: } 2563: else if (br_elem->type == MB_CHAR) 2564: { 2565: return collseq_table_lookup (collseqwc, br_elem->opr.wch); 2566: } 2567: else if (br_elem->type == COLL_SYM) 2568: { 2569: size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2570: if (nrules != 0) 2571: { 2572: int32_t elem, idx; 2573: elem = seek_collating_symbol_entry (br_elem->opr.name, 2574: sym_name_len); 2575: if (symb_table[2 * elem] != 0) 2576: { 2577: /* We found the entry. */ 2578: idx = symb_table[2 * elem + 1]; 2579: /* Skip the name of collating element name. */ 2580: idx += 1 + extra[idx]; 2581: /* Skip the byte sequence of the collating element. */ 2582: idx += 1 + extra[idx]; 2583: /* Adjust for the alignment. */ 2584: idx = (idx + 3) & ~3; 2585: /* Skip the multibyte collation sequence value. */ 2586: idx += sizeof (unsigned int); 2587: /* Skip the wide char sequence of the collating element. */ 2588: idx += sizeof (unsigned int) * 2589: (1 + *(unsigned int *) (extra + idx)); 2590: /* Return the collation sequence value. */ 2591: return *(unsigned int *) (extra + idx); 2592: } 2593: else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2594: { 2595: /* No valid character. Match it as a single byte 2596: character. */ 2597: return collseqmb[br_elem->opr.name[0]]; 2598: } 2599: } 2600: else if (sym_name_len == 1) 2601: return collseqmb[br_elem->opr.name[0]]; 2602: } 2603: return UINT_MAX; 2604: } 2605: 2606: /* Local function for parse_bracket_exp used in _LIBC environement. 2607: Build the range expression which starts from START_ELEM, and ends 2608: at END_ELEM. The result are written to MBCSET and SBCSET. 2609: RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2610: mbcset->range_ends, is a pointer argument sinse we may 2611: update it. */ 2612: 2613: static inline reg_errcode_t 2614: # ifdef RE_ENABLE_I18N 2615: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2616: re_charset_t *mbcset; 2617: int *range_alloc; 2618: # else /* not RE_ENABLE_I18N */ 2619: build_range_exp (sbcset, start_elem, end_elem) 2620: # endif /* not RE_ENABLE_I18N */ 2621: re_bitset_ptr_t sbcset; 2622: bracket_elem_t *start_elem, *end_elem; 2623: { 2624: unsigned int ch; 2625: uint32_t start_collseq; 2626: uint32_t end_collseq; 2627: 2628: # ifdef RE_ENABLE_I18N 2629: /* Check the space of the arrays. */ 2630: if (*range_alloc == mbcset->nranges) 2631: { 2632: /* There are not enough space, need realloc. */ 2633: uint32_t *new_array_start; 2634: uint32_t *new_array_end; 2635: int new_nranges; 2636: 2637: /* +1 in case of mbcset->nranges is 0. */ 2638: new_nranges = 2 * mbcset->nranges + 1; 2639: /* Use realloc since mbcset->range_starts and mbcset->range_ends 2640: are NULL if *range_alloc == 0. */ 2641: new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2642: new_nranges); 2643: new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2644: new_nranges); 2645: 2646: if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2647: return REG_ESPACE; 2648: 2649: mbcset->range_starts = new_array_start; 2650: mbcset->range_ends = new_array_end; 2651: *range_alloc = new_nranges; 2652: } 2653: # endif /* RE_ENABLE_I18N */ 2654: 2655: /* Equivalence Classes and Character Classes can't be a range 2656: start/end. */ 2657: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2658: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2659: 0)) 2660: return REG_ERANGE; 2661: 2662: start_collseq = lookup_collation_sequence_value (start_elem); 2663: end_collseq = lookup_collation_sequence_value (end_elem); 2664: /* Check start/end collation sequence values. */ 2665: if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2666: return REG_ECOLLATE; 2667: if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2668: return REG_ERANGE; 2669: 2670: # ifdef RE_ENABLE_I18N 2671: /* Got valid collation sequence values, add them as a new entry. */ 2672: mbcset->range_starts[mbcset->nranges] = start_collseq; 2673: mbcset->range_ends[mbcset->nranges++] = end_collseq; 2674: # endif /* RE_ENABLE_I18N */ 2675: 2676: /* Build the table for single byte characters. */ 2677: for (ch = 0; ch <= SBC_MAX; ch++) 2678: { 2679: uint32_t ch_collseq; 2680: /* 2681: if (MB_CUR_MAX == 1) 2682: */ 2683: if (nrules == 0) 2684: ch_collseq = collseqmb[ch]; 2685: else 2686: ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch)); 2687: if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2688: bitset_set (sbcset, ch); 2689: } 2690: return REG_NOERROR; 2691: } 2692: 2693: /* Local function for parse_bracket_exp used in _LIBC environement. 2694: Build the collating element which is represented by NAME. 2695: The result are written to MBCSET and SBCSET. 2696: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2697: pointer argument sinse we may update it. */ 2698: 2699: static inline reg_errcode_t 2700: # ifdef RE_ENABLE_I18N 2701: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2702: re_charset_t *mbcset; 2703: int *coll_sym_alloc; 2704: # else /* not RE_ENABLE_I18N */ 2705: build_collating_symbol (sbcset, name) 2706: # endif /* not RE_ENABLE_I18N */ 2707: re_bitset_ptr_t sbcset; 2708: const unsigned char *name; 2709: { 2710: int32_t elem, idx; 2711: size_t name_len = strlen ((const char *) name); 2712: if (nrules != 0) 2713: { 2714: elem = seek_collating_symbol_entry (name, name_len); 2715: if (symb_table[2 * elem] != 0) 2716: { 2717: /* We found the entry. */ 2718: idx = symb_table[2 * elem + 1]; 2719: /* Skip the name of collating element name. */ 2720: idx += 1 + extra[idx]; 2721: } 2722: else if (symb_table[2 * elem] == 0 && name_len == 1) 2723: { 2724: /* No valid character, treat it as a normal 2725: character. */ 2726: bitset_set (sbcset, name[0]); 2727: return REG_NOERROR; 2728: } 2729: else 2730: return REG_ECOLLATE; 2731: 2732: # ifdef RE_ENABLE_I18N 2733: /* Got valid collation sequence, add it as a new entry. */ 2734: /* Check the space of the arrays. */ 2735: if (*coll_sym_alloc == mbcset->ncoll_syms) 2736: { 2737: /* Not enough, realloc it. */ 2738: /* +1 in case of mbcset->ncoll_syms is 0. */ 2739: *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 2740: /* Use realloc since mbcset->coll_syms is NULL 2741: if *alloc == 0. */ 2742: mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t, 2743: *coll_sym_alloc); 2744: if (BE (mbcset->coll_syms == NULL, 0)) 2745: return REG_ESPACE; 2746: } 2747: mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 2748: # endif /* RE_ENABLE_I18N */ 2749: return REG_NOERROR; 2750: } 2751: else 2752: { 2753: if (BE (name_len != 1, 0)) 2754: return REG_ECOLLATE; 2755: else 2756: { 2757: bitset_set (sbcset, name[0]); 2758: return REG_NOERROR; 2759: } 2760: } 2761: } 2762: #endif 2763: 2764: re_token_t br_token; 2765: re_bitset_ptr_t sbcset; 2766: #ifdef RE_ENABLE_I18N 2767: re_charset_t *mbcset; 2768: int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 2769: int equiv_class_alloc = 0, char_class_alloc = 0; 2770: #else /* not RE_ENABLE_I18N */ 2771: int non_match = 0; 2772: #endif /* not RE_ENABLE_I18N */ 2773: bin_tree_t *work_tree; 2774: int token_len, new_idx; 2775: #ifdef _LIBC 2776: collseqmb = (const unsigned char *) 2777: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 2778: nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2779: if (nrules) 2780: { 2781: /* 2782: if (MB_CUR_MAX > 1) 2783: */ 2784: collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 2785: table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 2786: symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 2787: _NL_COLLATE_SYMB_TABLEMB); 2788: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 2789: _NL_COLLATE_SYMB_EXTRAMB); 2790: } 2791: #endif 2792: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 2793: #ifdef RE_ENABLE_I18N 2794: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 2795: #endif /* RE_ENABLE_I18N */ 2796: #ifdef RE_ENABLE_I18N 2797: if (BE (sbcset == NULL || mbcset == NULL, 0)) 2798: #else 2799: if (BE (sbcset == NULL, 0)) 2800: #endif /* RE_ENABLE_I18N */ 2801: { 2802: *err = REG_ESPACE; 2803: return NULL; 2804: } 2805: 2806: token_len = peek_token_bracket (token, regexp, syntax); 2807: if (BE (token->type == END_OF_RE, 0)) 2808: { 2809: *err = REG_BADPAT; 2810: goto parse_bracket_exp_free_return; 2811: } 2812: if (token->type == OP_NON_MATCH_LIST) 2813: { 2814: #ifdef RE_ENABLE_I18N 2815: int i; 2816: mbcset->non_match = 1; 2817: #else /* not RE_ENABLE_I18N */ 2818: non_match = 1; 2819: #endif /* not RE_ENABLE_I18N */ 2820: if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 2821: bitset_set (sbcset, '\0'); 2822: re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 2823: token_len = peek_token_bracket (token, regexp, syntax); 2824: if (BE (token->type == END_OF_RE, 0)) 2825: { 2826: *err = REG_BADPAT; 2827: goto parse_bracket_exp_free_return; 2828: } 2829: #ifdef RE_ENABLE_I18N 2830: if (MB_CUR_MAX > 1) 2831: for (i = 0; i < SBC_MAX; ++i) 2832: if (__btowc (i) == WEOF) 2833: bitset_set (sbcset, i); 2834: #endif /* RE_ENABLE_I18N */ 2835: } 2836: 2837: /* We treat the first ']' as a normal character. */ 2838: if (token->type == OP_CLOSE_BRACKET) 2839: token->type = CHARACTER; 2840: 2841: while (1) 2842: { 2843: bracket_elem_t start_elem, end_elem; 2844: unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 2845: unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 2846: reg_errcode_t ret; 2847: int token_len2 = 0, is_range_exp = 0; 2848: re_token_t token2; 2849: 2850: start_elem.opr.name = start_name_buf; 2851: ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 2852: syntax); 2853: if (BE (ret != REG_NOERROR, 0)) 2854: { 2855: *err = ret; 2856: goto parse_bracket_exp_free_return; 2857: } 2858: 2859: token_len = peek_token_bracket (token, regexp, syntax); 2860: if (BE (token->type == END_OF_RE, 0)) 2861: { 2862: *err = REG_BADPAT; 2863: goto parse_bracket_exp_free_return; 2864: } 2865: if (token->type == OP_CHARSET_RANGE) 2866: { 2867: re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 2868: token_len2 = peek_token_bracket (&token2, regexp, syntax); 2869: if (BE (token->type == END_OF_RE, 0)) 2870: { 2871: *err = REG_BADPAT; 2872: goto parse_bracket_exp_free_return; 2873: } 2874: if (token2.type == OP_CLOSE_BRACKET) 2875: { 2876: /* We treat the last '-' as a normal character. */ 2877: re_string_skip_bytes (regexp, -token_len); 2878: token->type = CHARACTER; 2879: } 2880: else 2881: is_range_exp = 1; 2882: } 2883: 2884: if (is_range_exp == 1) 2885: { 2886: end_elem.opr.name = end_name_buf; 2887: ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 2888: dfa, syntax); 2889: if (BE (ret != REG_NOERROR, 0)) 2890: { 2891: *err = ret; 2892: goto parse_bracket_exp_free_return; 2893: } 2894: 2895: token_len = peek_token_bracket (token, regexp, syntax); 2896: if (BE (token->type == END_OF_RE, 0)) 2897: { 2898: *err = REG_BADPAT; 2899: goto parse_bracket_exp_free_return; 2900: } 2901: *err = build_range_exp (sbcset, 2902: #ifdef RE_ENABLE_I18N 2903: mbcset, &range_alloc, 2904: #endif /* RE_ENABLE_I18N */ 2905: &start_elem, &end_elem); 2906: if (BE (*err != REG_NOERROR, 0)) 2907: goto parse_bracket_exp_free_return; 2908: } 2909: else 2910: { 2911: switch (start_elem.type) 2912: { 2913: case SB_CHAR: 2914: bitset_set (sbcset, start_elem.opr.ch); 2915: break; 2916: #ifdef RE_ENABLE_I18N 2917: case MB_CHAR: 2918: /* Check whether the array has enough space. */ 2919: if (mbchar_alloc == mbcset->nmbchars) 2920: { 2921: /* Not enough, realloc it. */ 2922: /* +1 in case of mbcset->nmbchars is 0. */ 2923: mbchar_alloc = 2 * mbcset->nmbchars + 1; 2924: /* Use realloc since array is NULL if *alloc == 0. */ 2925: mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t, 2926: mbchar_alloc); 2927: if (BE (mbcset->mbchars == NULL, 0)) 2928: goto parse_bracket_exp_espace; 2929: } 2930: mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 2931: break; 2932: #endif /* RE_ENABLE_I18N */ 2933: case EQUIV_CLASS: 2934: *err = build_equiv_class (sbcset, 2935: #ifdef RE_ENABLE_I18N 2936: mbcset, &equiv_class_alloc, 2937: #endif /* RE_ENABLE_I18N */ 2938: start_elem.opr.name); 2939: if (BE (*err != REG_NOERROR, 0)) 2940: goto parse_bracket_exp_free_return; 2941: break; 2942: case COLL_SYM: 2943: *err = build_collating_symbol (sbcset, 2944: #ifdef RE_ENABLE_I18N 2945: mbcset, &coll_sym_alloc, 2946: #endif /* RE_ENABLE_I18N */ 2947: start_elem.opr.name); 2948: if (BE (*err != REG_NOERROR, 0)) 2949: goto parse_bracket_exp_free_return; 2950: break; 2951: case CHAR_CLASS: 2952: *err = build_charclass (sbcset, 2953: #ifdef RE_ENABLE_I18N 2954: mbcset, &char_class_alloc, 2955: #endif /* RE_ENABLE_I18N */ 2956: start_elem.opr.name, syntax); 2957: if (BE (*err != REG_NOERROR, 0)) 2958: goto parse_bracket_exp_free_return; 2959: break; 2960: default: 2961: assert (0); 2962: break; 2963: } 2964: } 2965: if (token->type == OP_CLOSE_BRACKET) 2966: break; 2967: } 2968: 2969: re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 2970: 2971: /* If it is non-matching list. */ 2972: #ifdef RE_ENABLE_I18N 2973: if (mbcset->non_match) 2974: #else /* not RE_ENABLE_I18N */ 2975: if (non_match) 2976: #endif /* not RE_ENABLE_I18N */ 2977: bitset_not (sbcset); 2978: 2979: /* Build a tree for simple bracket. */ 2980: br_token.type = SIMPLE_BRACKET; 2981: br_token.opr.sbcset = sbcset; 2982: new_idx = re_dfa_add_node (dfa, br_token, 0); 2983: work_tree = create_tree (NULL, NULL, 0, new_idx); 2984: if (BE (new_idx == -1 || work_tree == NULL, 0)) 2985: goto parse_bracket_exp_espace; 2986: 2987: #ifdef RE_ENABLE_I18N 2988: if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 2989: || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes 2990: || mbcset->non_match))) 2991: { 2992: re_token_t alt_token; 2993: bin_tree_t *mbc_tree; 2994: /* Build a tree for complex bracket. */ 2995: br_token.type = COMPLEX_BRACKET; 2996: br_token.opr.mbcset = mbcset; 2997: dfa->has_mb_node = 1; 2998: new_idx = re_dfa_add_node (dfa, br_token, 0); 2999: mbc_tree = create_tree (NULL, NULL, 0, new_idx); 3000: if (BE (new_idx == -1 || mbc_tree == NULL, 0)) 3001: goto parse_bracket_exp_espace; 3002: /* Then join them by ALT node. */ 3003: dfa->has_plural_match = 1; 3004: alt_token.type = OP_ALT; 3005: new_idx = re_dfa_add_node (dfa, alt_token, 0); 3006: work_tree = create_tree (work_tree, mbc_tree, 0, new_idx); 3007: if (BE (new_idx != -1 && mbc_tree != NULL, 1)) 3008: return work_tree; 3009: } 3010: else 3011: { 3012: free_charset (mbcset); 3013: return work_tree; 3014: } 3015: #else /* not RE_ENABLE_I18N */ 3016: return work_tree; 3017: #endif /* not RE_ENABLE_I18N */ 3018: 3019: parse_bracket_exp_espace: 3020: *err = REG_ESPACE; 3021: parse_bracket_exp_free_return: 3022: re_free (sbcset); 3023: #ifdef RE_ENABLE_I18N 3024: free_charset (mbcset); 3025: #endif /* RE_ENABLE_I18N */ 3026: return NULL; 3027: } 3028: 3029: /* Parse an element in the bracket expression. */ 3030: 3031: static reg_errcode_t 3032: parse_bracket_element (elem, regexp, token, token_len, dfa, syntax) 3033: bracket_elem_t *elem; 3034: re_string_t *regexp; 3035: re_token_t *token; 3036: int token_len; 3037: re_dfa_t *dfa; 3038: reg_syntax_t syntax; 3039: { 3040: #ifdef RE_ENABLE_I18N 3041: int cur_char_size; 3042: cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3043: if (cur_char_size > 1) 3044: { 3045: elem->type = MB_CHAR; 3046: elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3047: re_string_skip_bytes (regexp, cur_char_size); 3048: return REG_NOERROR; 3049: } 3050: #endif /* RE_ENABLE_I18N */ 3051: re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3052: if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3053: || token->type == OP_OPEN_EQUIV_CLASS) 3054: return parse_bracket_symbol (elem, regexp, token); 3055: elem->type = SB_CHAR; 3056: elem->opr.ch = token->opr.c; 3057: return REG_NOERROR; 3058: } 3059: 3060: /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3061: such as [:<character_class>:], [.<collating_element>.], and 3062: [=<equivalent_class>=]. */ 3063: 3064: static reg_errcode_t 3065: parse_bracket_symbol (elem, regexp, token) 3066: bracket_elem_t *elem; 3067: re_string_t *regexp; 3068: re_token_t *token; 3069: { 3070: unsigned char ch, delim = token->opr.c; 3071: int i = 0; 3072: for (;; ++i) 3073: { 3074: if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE) 3075: return REG_EBRACK; 3076: if (token->type == OP_OPEN_CHAR_CLASS) 3077: ch = re_string_fetch_byte_case (regexp); 3078: else 3079: ch = re_string_fetch_byte (regexp); 3080: if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3081: break; 3082: elem->opr.name[i] = ch; 3083: } 3084: re_string_skip_bytes (regexp, 1); 3085: elem->opr.name[i] = '\0'; 3086: switch (token->type) 3087: { 3088: case OP_OPEN_COLL_ELEM: 3089: elem->type = COLL_SYM; 3090: break; 3091: case OP_OPEN_EQUIV_CLASS: 3092: elem->type = EQUIV_CLASS; 3093: break; 3094: case OP_OPEN_CHAR_CLASS: 3095: elem->type = CHAR_CLASS; 3096: break; 3097: default: 3098: break; 3099: } 3100: return REG_NOERROR; 3101: } 3102: 3103: /* Helper function for parse_bracket_exp. 3104: Build the equivalence class which is represented by NAME. 3105: The result are written to MBCSET and SBCSET. 3106: EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3107: is a pointer argument sinse we may update it. */ 3108: 3109: static reg_errcode_t 3110: #ifdef RE_ENABLE_I18N 3111: build_equiv_class (sbcset, mbcset, equiv_class_alloc, name) 3112: re_charset_t *mbcset; 3113: int *equiv_class_alloc; 3114: #else /* not RE_ENABLE_I18N */ 3115: build_equiv_class (sbcset, name) 3116: #endif /* not RE_ENABLE_I18N */ 3117: re_bitset_ptr_t sbcset; 3118: const unsigned char *name; 3119: { 3120: #if defined _LIBC && defined RE_ENABLE_I18N 3121: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3122: if (nrules != 0) 3123: { 3124: const int32_t *table, *indirect; 3125: const unsigned char *weights, *extra, *cp; 3126: unsigned char char_buf[2]; 3127: int32_t idx1, idx2; 3128: unsigned int ch; 3129: size_t len; 3130: /* This #include defines a local function! */ 3131: # include <locale/weight.h> 3132: /* Calculate the index for equivalence class. */ 3133: cp = name; 3134: table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3135: weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3136: _NL_COLLATE_WEIGHTMB); 3137: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3138: _NL_COLLATE_EXTRAMB); 3139: indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3140: _NL_COLLATE_INDIRECTMB); 3141: idx1 = findidx (&cp); 3142: if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3143: /* This isn't a valid character. */ 3144: return REG_ECOLLATE; 3145: 3146: /* Build single byte matcing table for this equivalence class. */ 3147: char_buf[1] = (unsigned char) '\0'; 3148: len = weights[idx1]; 3149: for (ch = 0; ch < SBC_MAX; ++ch) 3150: { 3151: char_buf[0] = ch; 3152: cp = char_buf; 3153: idx2 = findidx (&cp); 3154: /* 3155: idx2 = table[ch]; 3156: */ 3157: if (idx2 == 0) 3158: /* This isn't a valid character. */ 3159: continue; 3160: if (len == weights[idx2]) 3161: { 3162: int cnt = 0; 3163: while (cnt <= len && 3164: weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3165: ++cnt; 3166: 3167: if (cnt > len) 3168: bitset_set (sbcset, ch); 3169: } 3170: } 3171: /* Check whether the array has enough space. */ 3172: if (*equiv_class_alloc == mbcset->nequiv_classes) 3173: { 3174: /* Not enough, realloc it. */ 3175: /* +1 in case of mbcset->nequiv_classes is 0. */ 3176: *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3177: /* Use realloc since the array is NULL if *alloc == 0. */ 3178: mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t, 3179: *equiv_class_alloc); 3180: if (BE (mbcset->equiv_classes == NULL, 0)) 3181: return REG_ESPACE; 3182: } 3183: mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3184: } 3185: else 3186: #endif /* _LIBC && RE_ENABLE_I18N */ 3187: { 3188: if (BE (strlen ((const char *) name) != 1, 0)) 3189: return REG_ECOLLATE; 3190: bitset_set (sbcset, *name); 3191: } 3192: return REG_NOERROR; 3193: } 3194: 3195: /* Helper function for parse_bracket_exp. 3196: Build the character class which is represented by NAME. 3197: The result are written to MBCSET and SBCSET. 3198: CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3199: is a pointer argument sinse we may update it. */ 3200: 3201: static reg_errcode_t 3202: #ifdef RE_ENABLE_I18N 3203: build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax) 3204: re_charset_t *mbcset; 3205: int *char_class_alloc; 3206: #else /* not RE_ENABLE_I18N */ 3207: build_charclass (sbcset, class_name, syntax) 3208: #endif /* not RE_ENABLE_I18N */ 3209: re_bitset_ptr_t sbcset; 3210: const unsigned char *class_name; 3211: reg_syntax_t syntax; 3212: { 3213: int i; 3214: const char *name = (const char *) class_name; 3215: 3216: /* In case of REG_ICASE "upper" and "lower" match the both of 3217: upper and lower cases. */ 3218: if ((syntax & RE_ICASE) 3219: && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3220: name = "alpha"; 3221: 3222: #ifdef RE_ENABLE_I18N 3223: /* Check the space of the arrays. */ 3224: if (*char_class_alloc == mbcset->nchar_classes) 3225: { 3226: /* Not enough, realloc it. */ 3227: /* +1 in case of mbcset->nchar_classes is 0. */ 3228: *char_class_alloc = 2 * mbcset->nchar_classes + 1; 3229: /* Use realloc since array is NULL if *alloc == 0. */ 3230: mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t, 3231: *char_class_alloc); 3232: if (BE (mbcset->char_classes == NULL, 0)) 3233: return REG_ESPACE; 3234: } 3235: mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3236: #endif /* RE_ENABLE_I18N */ 3237: 3238: #define BUILD_CHARCLASS_LOOP(ctype_func)\ 3239: for (i = 0; i < SBC_MAX; ++i) \ 3240: { \ 3241: if (ctype_func (i)) \ 3242: bitset_set (sbcset, i); \ 3243: } 3244: 3245: if (strcmp (name, "alnum") == 0) 3246: BUILD_CHARCLASS_LOOP (isalnum) 3247: else if (strcmp (name, "cntrl") == 0) 3248: BUILD_CHARCLASS_LOOP (iscntrl) 3249: else if (strcmp (name, "lower") == 0) 3250: BUILD_CHARCLASS_LOOP (islower) 3251: else if (strcmp (name, "space") == 0) 3252: BUILD_CHARCLASS_LOOP (isspace) 3253: else if (strcmp (name, "alpha") == 0) 3254: BUILD_CHARCLASS_LOOP (isalpha) 3255: else if (strcmp (name, "digit") == 0) 3256: BUILD_CHARCLASS_LOOP (isdigit) 3257: else if (strcmp (name, "print") == 0) 3258: BUILD_CHARCLASS_LOOP (isprint) 3259: else if (strcmp (name, "upper") == 0) 3260: BUILD_CHARCLASS_LOOP (isupper) 3261: else if (strcmp (name, "blank") == 0) 3262: BUILD_CHARCLASS_LOOP (isblank) 3263: else if (strcmp (name, "graph") == 0) 3264: BUILD_CHARCLASS_LOOP (isgraph) 3265: else if (strcmp (name, "punct") == 0) 3266: BUILD_CHARCLASS_LOOP (ispunct) 3267: else if (strcmp (name, "xdigit") == 0) 3268: BUILD_CHARCLASS_LOOP (isxdigit) 3269: else 3270: return REG_ECTYPE; 3271: 3272: return REG_NOERROR; 3273: } 3274: 3275: static bin_tree_t * 3276: build_word_op (dfa, not, err) 3277: re_dfa_t *dfa; 3278: int not; 3279: reg_errcode_t *err; 3280: { 3281: re_bitset_ptr_t sbcset; 3282: #ifdef RE_ENABLE_I18N 3283: re_charset_t *mbcset; 3284: int alloc = 0; 3285: #else /* not RE_ENABLE_I18N */ 3286: int non_match = 0; 3287: #endif /* not RE_ENABLE_I18N */ 3288: reg_errcode_t ret; 3289: re_token_t br_token; 3290: bin_tree_t *tree; 3291: int new_idx; 3292: 3293: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS); 3294: #ifdef RE_ENABLE_I18N 3295: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3296: #endif /* RE_ENABLE_I18N */ 3297: 3298: #ifdef RE_ENABLE_I18N 3299: if (BE (sbcset == NULL || mbcset == NULL, 0)) 3300: #else /* not RE_ENABLE_I18N */ 3301: if (BE (sbcset == NULL, 0)) 3302: #endif /* not RE_ENABLE_I18N */ 3303: { 3304: *err = REG_ESPACE; 3305: return NULL; 3306: } 3307: 3308: if (not) 3309: { 3310: #ifdef RE_ENABLE_I18N 3311: int i; 3312: /* 3313: if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3314: bitset_set(cset->sbcset, '\0'); 3315: */ 3316: mbcset->non_match = 1; 3317: if (MB_CUR_MAX > 1) 3318: for (i = 0; i < SBC_MAX; ++i) 3319: if (__btowc (i) == WEOF) 3320: bitset_set (sbcset, i); 3321: #else /* not RE_ENABLE_I18N */ 3322: non_match = 1; 3323: #endif /* not RE_ENABLE_I18N */ 3324: } 3325: 3326: /* We don't care the syntax in this case. */ 3327: ret = build_charclass (sbcset, 3328: #ifdef RE_ENABLE_I18N 3329: mbcset, &alloc, 3330: #endif /* RE_ENABLE_I18N */ 3331: (const unsigned char *) "alpha", 0); 3332: 3333: if (BE (ret != REG_NOERROR, 0)) 3334: { 3335: re_free (sbcset); 3336: #ifdef RE_ENABLE_I18N 3337: free_charset (mbcset); 3338: #endif /* RE_ENABLE_I18N */ 3339: *err = ret; 3340: return NULL; 3341: } 3342: /* \w match '_' also. */ 3343: bitset_set (sbcset, '_'); 3344: 3345: /* If it is non-matching list. */ 3346: #ifdef RE_ENABLE_I18N 3347: if (mbcset->non_match) 3348: #else /* not RE_ENABLE_I18N */ 3349: if (non_match) 3350: #endif /* not RE_ENABLE_I18N */ 3351: bitset_not (sbcset); 3352: 3353: /* Build a tree for simple bracket. */ 3354: br_token.type = SIMPLE_BRACKET; 3355: br_token.opr.sbcset = sbcset; 3356: new_idx = re_dfa_add_node (dfa, br_token, 0); 3357: tree = create_tree (NULL, NULL, 0, new_idx); 3358: if (BE (new_idx == -1 || tree == NULL, 0)) 3359: goto build_word_op_espace; 3360: 3361: #ifdef RE_ENABLE_I18N 3362: if (MB_CUR_MAX > 1) 3363: { 3364: re_token_t alt_token; 3365: bin_tree_t *mbc_tree; 3366: /* Build a tree for complex bracket. */ 3367: br_token.type = COMPLEX_BRACKET; 3368: br_token.opr.mbcset = mbcset; 3369: dfa->has_mb_node = 1; 3370: new_idx = re_dfa_add_node (dfa, br_token, 0); 3371: mbc_tree = create_tree (NULL, NULL, 0, new_idx); 3372: if (BE (new_idx == -1 || mbc_tree == NULL, 0)) 3373: goto build_word_op_espace; 3374: /* Then join them by ALT node. */ 3375: alt_token.type = OP_ALT; 3376: new_idx = re_dfa_add_node (dfa, alt_token, 0); 3377: tree = create_tree (tree, mbc_tree, 0, new_idx); 3378: if (BE (new_idx != -1 && mbc_tree != NULL, 1)) 3379: return tree; 3380: } 3381: else 3382: { 3383: free_charset (mbcset); 3384: return tree; 3385: } 3386: #else /* not RE_ENABLE_I18N */ 3387: return tree; 3388: #endif /* not RE_ENABLE_I18N */ 3389: 3390: build_word_op_espace: 3391: re_free (sbcset); 3392: #ifdef RE_ENABLE_I18N 3393: free_charset (mbcset); 3394: #endif /* RE_ENABLE_I18N */ 3395: *err = REG_ESPACE; 3396: return NULL; 3397: } 3398: 3399: /* This is intended for the expressions like "a{1,3}". 3400: Fetch a number from `input', and return the number. 3401: Return -1, if the number field is empty like "{,1}". 3402: Return -2, If an error is occured. */ 3403: 3404: static int 3405: fetch_number (input, token, syntax) 3406: re_string_t *input; 3407: re_token_t *token; 3408: reg_syntax_t syntax; 3409: { 3410: int num = -1; 3411: unsigned char c; 3412: while (1) 3413: { 3414: *token = fetch_token (input, syntax); 3415: c = token->opr.c; 3416: if (BE (token->type == END_OF_RE, 0)) 3417: return -2; 3418: if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3419: break; 3420: num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2) 3421: ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0')); 3422: num = (num > RE_DUP_MAX) ? -2 : num; 3423: } 3424: return num; 3425: } 3426: 3427: #ifdef RE_ENABLE_I18N 3428: static void 3429: free_charset (re_charset_t *cset) 3430: { 3431: re_free (cset->mbchars); 3432: # ifdef _LIBC 3433: re_free (cset->coll_syms); 3434: re_free (cset->equiv_classes); 3435: re_free (cset->range_starts); 3436: re_free (cset->range_ends); 3437: # endif 3438: re_free (cset->char_classes); 3439: re_free (cset); 3440: } 3441: #endif /* RE_ENABLE_I18N */ 3442: 3443: /* Functions for binary tree operation. */ 3444: 3445: /* Create a node of tree. 3446: Note: This function automatically free left and right if malloc fails. */ 3447: 3448: static bin_tree_t * 3449: create_tree (left, right, type, index) 3450: bin_tree_t *left; 3451: bin_tree_t *right; 3452: re_token_type_t type; 3453: int index; 3454: { 3455: bin_tree_t *tree; 3456: tree = re_malloc (bin_tree_t, 1); 3457: if (BE (tree == NULL, 0)) 3458: { 3459: free_bin_tree (left); 3460: free_bin_tree (right); 3461: return NULL; 3462: } 3463: tree->parent = NULL; 3464: tree->left = left; 3465: tree->right = right; 3466: tree->type = type; 3467: tree->node_idx = index; 3468: tree->first = -1; 3469: tree->next = -1; 3470: re_node_set_init_empty (&tree->eclosure); 3471: 3472: if (left != NULL) 3473: left->parent = tree; 3474: if (right != NULL) 3475: right->parent = tree; 3476: return tree; 3477: } 3478: 3479: /* Free the sub tree pointed by TREE. */ 3480: 3481: static void 3482: free_bin_tree (tree) 3483: bin_tree_t *tree; 3484: { 3485: if (tree == NULL) 3486: return; 3487: /*re_node_set_free (&tree->eclosure);*/ 3488: free_bin_tree (tree->left); 3489: free_bin_tree (tree->right); 3490: re_free (tree); 3491: } 3492: 3493: /* Duplicate the node SRC, and return new node. */ 3494: 3495: static bin_tree_t * 3496: duplicate_tree (src, dfa) 3497: const bin_tree_t *src; 3498: re_dfa_t *dfa; 3499: { 3500: bin_tree_t *left = NULL, *right = NULL, *new_tree; 3501: int new_node_idx; 3502: /* Since node indies must be according to Post-order of the tree, 3503: we must duplicate the left at first. */ 3504: if (src->left != NULL) 3505: { 3506: left = duplicate_tree (src->left, dfa); 3507: if (left == NULL) 3508: return NULL; 3509: } 3510: 3511: /* Secondaly, duplicate the right. */ 3512: if (src->right != NULL) 3513: { 3514: right = duplicate_tree (src->right, dfa); 3515: if (right == NULL) 3516: { 3517: free_bin_tree (left); 3518: return NULL; 3519: } 3520: } 3521: 3522: /* At last, duplicate itself. */ 3523: if (src->type == NON_TYPE) 3524: { 3525: new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0); 3526: dfa->nodes[new_node_idx].duplicated = 1; 3527: if (BE (new_node_idx == -1, 0)) 3528: { 3529: free_bin_tree (left); 3530: free_bin_tree (right); 3531: return NULL; 3532: } 3533: } 3534: else 3535: new_node_idx = src->type; 3536: 3537: new_tree = create_tree (left, right, src->type, new_node_idx); 3538: if (BE (new_tree == NULL, 0)) 3539: { 3540: free_bin_tree (left); 3541: free_bin_tree (right); 3542: } 3543: return new_tree; 3544: }