embedaddon/smartmontools/regex/regexec.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / smartmontools / regex / regexec.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 16:32:16 2012 UTC (12 years, 4 months ago) by misho
Branches: smartmontools, elwix, MAIN
CVS tags: v5_43, v5_42, HEAD

smartmontools

1: /* Extended regular expression matching and search library. 2: Copyright (C) 2002, 2003 Free Software Foundation, Inc. 3: This file is part of the GNU C Library. 4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5: 6: The GNU C Library is free software; you can redistribute it and/or 7: modify it under the terms of the GNU Lesser General Public 8: License as published by the Free Software Foundation; either 9: version 2.1 of the License, or (at your option) any later version. 10: 11: The GNU C Library is distributed in the hope that it will be useful, 12: but WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: Lesser General Public License for more details. 15: 16: You should have received a copy of the GNU Lesser General Public 17: License along with the GNU C Library; if not, write to the Free 18: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19: 02111-1307 USA. */ 20: 21: static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags, 22: re_string_t *input, int n); 23: static void match_ctx_clean (re_match_context_t *mctx); 24: static void match_ctx_free (re_match_context_t *cache); 25: static void match_ctx_free_subtops (re_match_context_t *mctx); 26: static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node, 27: int str_idx, int from, int to); 28: static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx); 29: static void match_ctx_clear_flag (re_match_context_t *mctx); 30: static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node, 31: int str_idx); 32: static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop, 33: int node, int str_idx); 34: static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts, 35: re_dfastate_t **limited_sts, int last_node, 36: int last_str_idx, int check_subexp); 37: static reg_errcode_t re_search_internal (const regex_t *preg, 38: const char *string, int length, 39: int start, int range, int stop, 40: size_t nmatch, regmatch_t pmatch[], 41: int eflags); 42: static int re_search_2_stub (struct re_pattern_buffer *bufp, 43: const char *string1, int length1, 44: const char *string2, int length2, 45: int start, int range, struct re_registers *regs, 46: int stop, int ret_len); 47: static int re_search_stub (struct re_pattern_buffer *bufp, 48: const char *string, int length, int start, 49: int range, int stop, struct re_registers *regs, 50: int ret_len); 51: static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, 52: int nregs, int regs_allocated); 53: static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err, 54: const regex_t *preg, 55: const re_match_context_t *mctx, 56: int idx); 57: static reg_errcode_t prune_impossible_nodes (const regex_t *preg, 58: re_match_context_t *mctx); 59: static int check_matching (const regex_t *preg, re_match_context_t *mctx, 60: int fl_search, int fl_longest_match); 61: static int check_halt_node_context (const re_dfa_t *dfa, int node, 62: unsigned int context); 63: static int check_halt_state_context (const regex_t *preg, 64: const re_dfastate_t *state, 65: const re_match_context_t *mctx, int idx); 66: static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node, 67: int cur_idx, int nmatch); 68: static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs, 69: const re_match_context_t *mctx, 70: int *pidx, int node, re_node_set *eps_via_nodes, 71: struct re_fail_stack_t *fs); 72: static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs, 73: int str_idx, int *dests, int nregs, 74: regmatch_t *regs, 75: re_node_set *eps_via_nodes); 76: static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs, 77: regmatch_t *regs, re_node_set *eps_via_nodes); 78: static reg_errcode_t set_regs (const regex_t *preg, 79: const re_match_context_t *mctx, 80: size_t nmatch, regmatch_t *pmatch, 81: int fl_backtrack); 82: static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs); 83: 84: #ifdef RE_ENABLE_I18N 85: static int sift_states_iter_mb (const regex_t *preg, 86: const re_match_context_t *mctx, 87: re_sift_context_t *sctx, 88: int node_idx, int str_idx, int max_str_idx); 89: #endif /* RE_ENABLE_I18N */ 90: static reg_errcode_t sift_states_backward (const regex_t *preg, 91: re_match_context_t *mctx, 92: re_sift_context_t *sctx); 93: static reg_errcode_t update_cur_sifted_state (const regex_t *preg, 94: re_match_context_t *mctx, 95: re_sift_context_t *sctx, 96: int str_idx, 97: re_node_set *dest_nodes); 98: static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa, 99: re_node_set *dest_nodes, 100: const re_node_set *candidates); 101: static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node, 102: re_node_set *dest_nodes, 103: const re_node_set *and_nodes); 104: static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits, 105: re_match_context_t *mctx, int dst_node, 106: int dst_idx, int src_node, int src_idx); 107: static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx, 108: int limit, re_node_set *eclosures, 109: int subexp_idx, int node, int str_idx); 110: static reg_errcode_t check_subexp_limits (re_dfa_t *dfa, 111: re_node_set *dest_nodes, 112: const re_node_set *candidates, 113: re_node_set *limits, 114: struct re_backref_cache_entry *bkref_ents, 115: int str_idx); 116: static reg_errcode_t sift_states_bkref (const regex_t *preg, 117: re_match_context_t *mctx, 118: re_sift_context_t *sctx, 119: int str_idx, re_node_set *dest_nodes); 120: static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx, 121: int next_state_log_idx); 122: static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst, 123: re_dfastate_t **src, int num); 124: static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg, 125: re_match_context_t *mctx, 126: re_dfastate_t *state, int fl_search); 127: static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa, 128: re_match_context_t *mctx, 129: re_node_set *cur_nodes, 130: int str_idx); 131: static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg, 132: re_dfastate_t *pstate, 133: int fl_search, 134: re_match_context_t *mctx); 135: #ifdef RE_ENABLE_I18N 136: static reg_errcode_t transit_state_mb (const regex_t *preg, 137: re_dfastate_t *pstate, 138: re_match_context_t *mctx); 139: #endif /* RE_ENABLE_I18N */ 140: static reg_errcode_t transit_state_bkref (const regex_t *preg, 141: re_node_set *nodes, 142: re_match_context_t *mctx); 143: static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx, 144: int bkref_node, int bkref_str_idx); 145: static reg_errcode_t get_subexp_sub (const regex_t *preg, 146: re_match_context_t *mctx, 147: re_sub_match_top_t *sub_top, 148: re_sub_match_last_t *sub_last, 149: int bkref_node, int bkref_str); 150: static int find_subexp_node (re_dfa_t *dfa, re_node_set *nodes, 151: int subexp_idx, int fl_open); 152: static reg_errcode_t check_arrival (const regex_t *preg, 153: re_match_context_t *mctx, 154: state_array_t *path, int top_node, 155: int top_str, int last_node, int last_str, 156: int fl_open); 157: static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg, 158: re_dfa_t *dfa, 159: re_match_context_t *mctx, 160: int str_idx, 161: re_node_set *cur_nodes, 162: re_node_set *next_nodes); 163: static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa, 164: re_node_set *cur_nodes, 165: int ex_subexp, int fl_open); 166: static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, 167: re_node_set *dst_nodes, 168: int target, int ex_subexp, 169: int fl_open); 170: static reg_errcode_t expand_bkref_cache (const regex_t *preg, 171: re_match_context_t *mctx, 172: re_node_set *cur_nodes, int cur_str, 173: int last_str, int subexp_num, 174: int fl_open); 175: static re_dfastate_t **build_trtable (const regex_t *dfa, 176: const re_dfastate_t *state, 177: int fl_search); 178: #ifdef RE_ENABLE_I18N 179: static int check_node_accept_bytes (const regex_t *preg, int node_idx, 180: const re_string_t *input, int idx); 181: # ifdef _LIBC 182: static unsigned int find_collation_sequence_value (const unsigned char *mbs, 183: size_t name_len); 184: # endif /* _LIBC */ 185: #endif /* RE_ENABLE_I18N */ 186: static int group_nodes_into_DFAstates (const regex_t *dfa, 187: const re_dfastate_t *state, 188: re_node_set *states_node, 189: bitset *states_ch); 190: static int check_node_accept (const regex_t *preg, const re_token_t *node, 191: const re_match_context_t *mctx, int idx); 192: static reg_errcode_t extend_buffers (re_match_context_t *mctx); 193: 194: /* Entry point for POSIX code. */ 195: 196: /* regexec searches for a given pattern, specified by PREG, in the 197: string STRING. 198: 199: If NMATCH is zero or REG_NOSUB was set in the cflags argument to 200: `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 201: least NMATCH elements, and we set them to the offsets of the 202: corresponding matched substrings. 203: 204: EFLAGS specifies `execution flags' which affect matching: if 205: REG_NOTBOL is set, then ^ does not match at the beginning of the 206: string; if REG_NOTEOL is set, then $ does not match at the end. 207: 208: We return 0 if we find a match and REG_NOMATCH if not. */ 209: 210: int 211: regexec (preg, string, nmatch, pmatch, eflags) 212: const regex_t *__restrict preg; 213: const char *__restrict string; 214: size_t nmatch; 215: regmatch_t pmatch[]; 216: int eflags; 217: { 218: reg_errcode_t err; 219: int length = strlen (string); 220: if (preg->no_sub) 221: err = re_search_internal (preg, string, length, 0, length, length, 0, 222: NULL, eflags); 223: else 224: err = re_search_internal (preg, string, length, 0, length, length, nmatch, 225: pmatch, eflags); 226: return err != REG_NOERROR; 227: } 228: #ifdef _LIBC 229: weak_alias (__regexec, regexec) 230: #endif 231: 232: /* Entry points for GNU code. */ 233: 234: /* re_match, re_search, re_match_2, re_search_2 235: 236: The former two functions operate on STRING with length LENGTH, 237: while the later two operate on concatenation of STRING1 and STRING2 238: with lengths LENGTH1 and LENGTH2, respectively. 239: 240: re_match() matches the compiled pattern in BUFP against the string, 241: starting at index START. 242: 243: re_search() first tries matching at index START, then it tries to match 244: starting from index START + 1, and so on. The last start position tried 245: is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same 246: way as re_match().) 247: 248: The parameter STOP of re_{match,search}_2 specifies that no match exceeding 249: the first STOP characters of the concatenation of the strings should be 250: concerned. 251: 252: If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match 253: and all groups is stroed in REGS. (For the "_2" variants, the offsets are 254: computed relative to the concatenation, not relative to the individual 255: strings.) 256: 257: On success, re_match* functions return the length of the match, re_search* 258: return the position of the start of the match. Return value -1 means no 259: match was found and -2 indicates an internal error. */ 260: 261: int 262: re_match (bufp, string, length, start, regs) 263: struct re_pattern_buffer *bufp; 264: const char *string; 265: int length, start; 266: struct re_registers *regs; 267: { 268: return re_search_stub (bufp, string, length, start, 0, length, regs, 1); 269: } 270: #ifdef _LIBC 271: weak_alias (__re_match, re_match) 272: #endif 273: 274: int 275: re_search (bufp, string, length, start, range, regs) 276: struct re_pattern_buffer *bufp; 277: const char *string; 278: int length, start, range; 279: struct re_registers *regs; 280: { 281: return re_search_stub (bufp, string, length, start, range, length, regs, 0); 282: } 283: #ifdef _LIBC 284: weak_alias (__re_search, re_search) 285: #endif 286: 287: int 288: re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop) 289: struct re_pattern_buffer *bufp; 290: const char *string1, *string2; 291: int length1, length2, start, stop; 292: struct re_registers *regs; 293: { 294: return re_search_2_stub (bufp, string1, length1, string2, length2, 295: start, 0, regs, stop, 1); 296: } 297: #ifdef _LIBC 298: weak_alias (__re_match_2, re_match_2) 299: #endif 300: 301: int 302: re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop) 303: struct re_pattern_buffer *bufp; 304: const char *string1, *string2; 305: int length1, length2, start, range, stop; 306: struct re_registers *regs; 307: { 308: return re_search_2_stub (bufp, string1, length1, string2, length2, 309: start, range, regs, stop, 0); 310: } 311: #ifdef _LIBC 312: weak_alias (__re_search_2, re_search_2) 313: #endif 314: 315: static int 316: re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs, 317: stop, ret_len) 318: struct re_pattern_buffer *bufp; 319: const char *string1, *string2; 320: int length1, length2, start, range, stop, ret_len; 321: struct re_registers *regs; 322: { 323: const char *str; 324: int rval; 325: int len = length1 + length2; 326: int free_str = 0; 327: 328: if (BE (length1 < 0 || length2 < 0 || stop < 0, 0)) 329: return -2; 330: 331: /* Concatenate the strings. */ 332: if (length2 > 0) 333: if (length1 > 0) 334: { 335: char *s = re_malloc (char, len); 336: 337: if (BE (s == NULL, 0)) 338: return -2; 339: memcpy (s, string1, length1); 340: memcpy (s + length1, string2, length2); 341: str = s; 342: free_str = 1; 343: } 344: else 345: str = string2; 346: else 347: str = string1; 348: 349: rval = re_search_stub (bufp, str, len, start, range, stop, regs, 350: ret_len); 351: if (free_str) 352: re_free ((char *) str); 353: return rval; 354: } 355: 356: /* The parameters have the same meaning as those of re_search. 357: Additional parameters: 358: If RET_LEN is nonzero the length of the match is returned (re_match style); 359: otherwise the position of the match is returned. */ 360: 361: static int 362: re_search_stub (bufp, string, length, start, range, stop, regs, ret_len) 363: struct re_pattern_buffer *bufp; 364: const char *string; 365: int length, start, range, stop, ret_len; 366: struct re_registers *regs; 367: { 368: reg_errcode_t result; 369: regmatch_t *pmatch; 370: int nregs, rval; 371: int eflags = 0; 372: 373: /* Check for out-of-range. */ 374: if (BE (start < 0 || start > length, 0)) 375: return -1; 376: if (BE (start + range > length, 0)) 377: range = length - start; 378: else if (BE (start + range < 0, 0)) 379: range = -start; 380: 381: eflags |= (bufp->not_bol) ? REG_NOTBOL : 0; 382: eflags |= (bufp->not_eol) ? REG_NOTEOL : 0; 383: 384: /* Compile fastmap if we haven't yet. */ 385: if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate) 386: re_compile_fastmap (bufp); 387: 388: if (BE (bufp->no_sub, 0)) 389: regs = NULL; 390: 391: /* We need at least 1 register. */ 392: if (regs == NULL) 393: nregs = 1; 394: else if (BE (bufp->regs_allocated == REGS_FIXED && 395: regs->num_regs < bufp->re_nsub + 1, 0)) 396: { 397: nregs = regs->num_regs; 398: if (BE (nregs < 1, 0)) 399: { 400: /* Nothing can be copied to regs. */ 401: regs = NULL; 402: nregs = 1; 403: } 404: } 405: else 406: nregs = bufp->re_nsub + 1; 407: pmatch = re_malloc (regmatch_t, nregs); 408: if (BE (pmatch == NULL, 0)) 409: return -2; 410: 411: result = re_search_internal (bufp, string, length, start, range, stop, 412: nregs, pmatch, eflags); 413: 414: rval = 0; 415: 416: /* I hope we needn't fill ther regs with -1's when no match was found. */ 417: if (result != REG_NOERROR) 418: rval = -1; 419: else if (regs != NULL) 420: { 421: /* If caller wants register contents data back, copy them. */ 422: bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs, 423: bufp->regs_allocated); 424: if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0)) 425: rval = -2; 426: } 427: 428: if (BE (rval == 0, 1)) 429: { 430: if (ret_len) 431: { 432: assert (pmatch[0].rm_so == start); 433: rval = pmatch[0].rm_eo - start; 434: } 435: else 436: rval = pmatch[0].rm_so; 437: } 438: re_free (pmatch); 439: return rval; 440: } 441: 442: static unsigned 443: re_copy_regs (regs, pmatch, nregs, regs_allocated) 444: struct re_registers *regs; 445: regmatch_t *pmatch; 446: int nregs, regs_allocated; 447: { 448: int rval = REGS_REALLOCATE; 449: int i; 450: int need_regs = nregs + 1; 451: /* We need one extra element beyond `num_regs' for the `-1' marker GNU code 452: uses. */ 453: 454: /* Have the register data arrays been allocated? */ 455: if (regs_allocated == REGS_UNALLOCATED) 456: { /* No. So allocate them with malloc. */ 457: regs->start = re_malloc (regoff_t, need_regs); 458: if (BE (regs->start == NULL, 0)) 459: return REGS_UNALLOCATED; 460: regs->end = re_malloc (regoff_t, need_regs); 461: if (BE (regs->end == NULL, 0)) 462: { 463: re_free (regs->start); 464: return REGS_UNALLOCATED; 465: } 466: regs->num_regs = need_regs; 467: } 468: else if (regs_allocated == REGS_REALLOCATE) 469: { /* Yes. If we need more elements than were already 470: allocated, reallocate them. If we need fewer, just 471: leave it alone. */ 472: if (need_regs > regs->num_regs) 473: { 474: regs->start = re_realloc (regs->start, regoff_t, need_regs); 475: if (BE (regs->start == NULL, 0)) 476: { 477: if (regs->end != NULL) 478: re_free (regs->end); 479: return REGS_UNALLOCATED; 480: } 481: regs->end = re_realloc (regs->end, regoff_t, need_regs); 482: if (BE (regs->end == NULL, 0)) 483: { 484: re_free (regs->start); 485: return REGS_UNALLOCATED; 486: } 487: regs->num_regs = need_regs; 488: } 489: } 490: else 491: { 492: assert (regs_allocated == REGS_FIXED); 493: /* This function may not be called with REGS_FIXED and nregs too big. */ 494: assert (regs->num_regs >= nregs); 495: rval = REGS_FIXED; 496: } 497: 498: /* Copy the regs. */ 499: for (i = 0; i < nregs; ++i) 500: { 501: regs->start[i] = pmatch[i].rm_so; 502: regs->end[i] = pmatch[i].rm_eo; 503: } 504: for ( ; i < regs->num_regs; ++i) 505: regs->start[i] = regs->end[i] = -1; 506: 507: return rval; 508: } 509: 510: /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 511: ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 512: this memory for recording register information. STARTS and ENDS 513: must be allocated using the malloc library routine, and must each 514: be at least NUM_REGS * sizeof (regoff_t) bytes long. 515: 516: If NUM_REGS == 0, then subsequent matches should allocate their own 517: register data. 518: 519: Unless this function is called, the first search or match using 520: PATTERN_BUFFER will allocate its own register data, without 521: freeing the old data. */ 522: 523: void 524: re_set_registers (bufp, regs, num_regs, starts, ends) 525: struct re_pattern_buffer *bufp; 526: struct re_registers *regs; 527: unsigned num_regs; 528: regoff_t *starts, *ends; 529: { 530: if (num_regs) 531: { 532: bufp->regs_allocated = REGS_REALLOCATE; 533: regs->num_regs = num_regs; 534: regs->start = starts; 535: regs->end = ends; 536: } 537: else 538: { 539: bufp->regs_allocated = REGS_UNALLOCATED; 540: regs->num_regs = 0; 541: regs->start = regs->end = (regoff_t *) 0; 542: } 543: } 544: #ifdef _LIBC 545: weak_alias (__re_set_registers, re_set_registers) 546: #endif 547: 548: /* Entry points compatible with 4.2 BSD regex library. We don't define 549: them unless specifically requested. */ 550: 551: #if defined _REGEX_RE_COMP || defined _LIBC 552: int 553: # ifdef _LIBC 554: weak_function 555: # endif 556: re_exec (s) 557: const char *s; 558: { 559: return 0 == regexec (&re_comp_buf, s, 0, NULL, 0); 560: } 561: #endif /* _REGEX_RE_COMP */ 562: 563: static re_node_set empty_set; 564: 565: /* Internal entry point. */ 566: 567: /* Searches for a compiled pattern PREG in the string STRING, whose 568: length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same 569: mingings with regexec. START, and RANGE have the same meanings 570: with re_search. 571: Return REG_NOERROR if we find a match, and REG_NOMATCH if not, 572: otherwise return the error code. 573: Note: We assume front end functions already check ranges. 574: (START + RANGE >= 0 && START + RANGE <= LENGTH) */ 575: 576: static reg_errcode_t 577: re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, 578: eflags) 579: const regex_t *preg; 580: const char *string; 581: int length, start, range, stop, eflags; 582: size_t nmatch; 583: regmatch_t pmatch[]; 584: { 585: reg_errcode_t err; 586: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 587: re_string_t input; 588: int left_lim, right_lim, incr; 589: int fl_longest_match, match_first, match_last = -1; 590: int fast_translate, sb; 591: re_match_context_t mctx; 592: char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate 593: && range && !preg->can_be_null) ? preg->fastmap : NULL); 594: 595: /* Check if the DFA haven't been compiled. */ 596: if (BE (preg->used == 0 || dfa->init_state == NULL 597: || dfa->init_state_word == NULL || dfa->init_state_nl == NULL 598: || dfa->init_state_begbuf == NULL, 0)) 599: return REG_NOMATCH; 600: 601: re_node_set_init_empty (&empty_set); 602: memset (&mctx, '\0', sizeof (re_match_context_t)); 603: 604: /* We must check the longest matching, if nmatch > 0. */ 605: fl_longest_match = (nmatch != 0 || dfa->nbackref); 606: 607: err = re_string_allocate (&input, string, length, dfa->nodes_len + 1, 608: preg->translate, preg->syntax & RE_ICASE); 609: if (BE (err != REG_NOERROR, 0)) 610: goto free_return; 611: input.stop = stop; 612: 613: err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2); 614: if (BE (err != REG_NOERROR, 0)) 615: goto free_return; 616: 617: /* We will log all the DFA states through which the dfa pass, 618: if nmatch > 1, or this dfa has "multibyte node", which is a 619: back-reference or a node which can accept multibyte character or 620: multi character collating element. */ 621: if (nmatch > 1 || dfa->has_mb_node) 622: { 623: mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1); 624: if (BE (mctx.state_log == NULL, 0)) 625: { 626: err = REG_ESPACE; 627: goto free_return; 628: } 629: } 630: else 631: mctx.state_log = NULL; 632: 633: #ifdef DEBUG 634: /* We assume front-end functions already check them. */ 635: assert (start + range >= 0 && start + range <= length); 636: #endif 637: 638: match_first = start; 639: input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 640: : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 641: 642: /* Check incrementally whether of not the input string match. */ 643: incr = (range < 0) ? -1 : 1; 644: left_lim = (range < 0) ? start + range : start; 645: right_lim = (range < 0) ? start : start + range; 646: sb = MB_CUR_MAX == 1; 647: fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate); 648: 649: for (;;) 650: { 651: /* At first get the current byte from input string. */ 652: if (fastmap) 653: { 654: if (BE (fast_translate, 1)) 655: { 656: unsigned RE_TRANSLATE_TYPE t 657: = (unsigned RE_TRANSLATE_TYPE) preg->translate; 658: if (BE (range >= 0, 1)) 659: { 660: if (BE (t != NULL, 0)) 661: { 662: while (BE (match_first < right_lim, 1) 663: && !fastmap[t[(unsigned char) string[match_first]]]) 664: ++match_first; 665: } 666: else 667: { 668: while (BE (match_first < right_lim, 1) 669: && !fastmap[(unsigned char) string[match_first]]) 670: ++match_first; 671: } 672: if (BE (match_first == right_lim, 0)) 673: { 674: int ch = match_first >= length 675: ? 0 : (unsigned char) string[match_first]; 676: if (!fastmap[t ? t[ch] : ch]) 677: break; 678: } 679: } 680: else 681: { 682: while (match_first >= left_lim) 683: { 684: int ch = match_first >= length 685: ? 0 : (unsigned char) string[match_first]; 686: if (fastmap[t ? t[ch] : ch]) 687: break; 688: --match_first; 689: } 690: if (match_first < left_lim) 691: break; 692: } 693: } 694: else 695: { 696: int ch; 697: 698: do 699: { 700: /* In this case, we can't determine easily the current byte, 701: since it might be a component byte of a multibyte 702: character. Then we use the constructed buffer 703: instead. */ 704: /* If MATCH_FIRST is out of the valid range, reconstruct the 705: buffers. */ 706: if (input.raw_mbs_idx + input.valid_len <= match_first 707: || match_first < input.raw_mbs_idx) 708: { 709: err = re_string_reconstruct (&input, match_first, eflags, 710: preg->newline_anchor); 711: if (BE (err != REG_NOERROR, 0)) 712: goto free_return; 713: } 714: /* If MATCH_FIRST is out of the buffer, leave it as '\0'. 715: Note that MATCH_FIRST must not be smaller than 0. */ 716: ch = ((match_first >= length) ? 0 717: : re_string_byte_at (&input, 718: match_first - input.raw_mbs_idx)); 719: if (fastmap[ch]) 720: break; 721: match_first += incr; 722: } 723: while (match_first >= left_lim && match_first <= right_lim); 724: if (! fastmap[ch]) 725: break; 726: } 727: } 728: 729: /* Reconstruct the buffers so that the matcher can assume that 730: the matching starts from the begining of the buffer. */ 731: err = re_string_reconstruct (&input, match_first, eflags, 732: preg->newline_anchor); 733: if (BE (err != REG_NOERROR, 0)) 734: goto free_return; 735: #ifdef RE_ENABLE_I18N 736: /* Eliminate it when it is a component of a multibyte character 737: and isn't the head of a multibyte character. */ 738: if (sb || re_string_first_byte (&input, 0)) 739: #endif 740: { 741: /* It seems to be appropriate one, then use the matcher. */ 742: /* We assume that the matching starts from 0. */ 743: mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0; 744: match_last = check_matching (preg, &mctx, 0, fl_longest_match); 745: if (match_last != -1) 746: { 747: if (BE (match_last == -2, 0)) 748: { 749: err = REG_ESPACE; 750: goto free_return; 751: } 752: else 753: { 754: mctx.match_last = match_last; 755: if ((!preg->no_sub && nmatch > 1) || dfa->nbackref) 756: { 757: re_dfastate_t *pstate = mctx.state_log[match_last]; 758: mctx.last_node = check_halt_state_context (preg, pstate, 759: &mctx, match_last); 760: } 761: if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) 762: || dfa->nbackref) 763: { 764: err = prune_impossible_nodes (preg, &mctx); 765: if (err == REG_NOERROR) 766: break; 767: if (BE (err != REG_NOMATCH, 0)) 768: goto free_return; 769: } 770: else 771: break; /* We found a matching. */ 772: } 773: } 774: match_ctx_clean (&mctx); 775: } 776: /* Update counter. */ 777: match_first += incr; 778: if (match_first < left_lim || right_lim < match_first) 779: break; 780: } 781: 782: /* Set pmatch[] if we need. */ 783: if (match_last != -1 && nmatch > 0) 784: { 785: int reg_idx; 786: 787: /* Initialize registers. */ 788: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 789: pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; 790: 791: /* Set the points where matching start/end. */ 792: pmatch[0].rm_so = 0; 793: pmatch[0].rm_eo = mctx.match_last; 794: 795: if (!preg->no_sub && nmatch > 1) 796: { 797: err = set_regs (preg, &mctx, nmatch, pmatch, 798: dfa->has_plural_match && dfa->nbackref > 0); 799: if (BE (err != REG_NOERROR, 0)) 800: goto free_return; 801: } 802: 803: /* At last, add the offset to the each registers, since we slided 804: the buffers so that We can assume that the matching starts from 0. */ 805: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 806: if (pmatch[reg_idx].rm_so != -1) 807: { 808: pmatch[reg_idx].rm_so += match_first; 809: pmatch[reg_idx].rm_eo += match_first; 810: } 811: } 812: err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR; 813: free_return: 814: re_free (mctx.state_log); 815: if (dfa->nbackref) 816: match_ctx_free (&mctx); 817: re_string_destruct (&input); 818: return err; 819: } 820: 821: static reg_errcode_t 822: prune_impossible_nodes (preg, mctx) 823: const regex_t *preg; 824: re_match_context_t *mctx; 825: { 826: int halt_node, match_last; 827: reg_errcode_t ret; 828: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 829: re_dfastate_t **sifted_states; 830: re_dfastate_t **lim_states = NULL; 831: re_sift_context_t sctx; 832: #ifdef DEBUG 833: assert (mctx->state_log != NULL); 834: #endif 835: match_last = mctx->match_last; 836: halt_node = mctx->last_node; 837: sifted_states = re_malloc (re_dfastate_t *, match_last + 1); 838: if (BE (sifted_states == NULL, 0)) 839: { 840: ret = REG_ESPACE; 841: goto free_return; 842: } 843: if (dfa->nbackref) 844: { 845: lim_states = re_malloc (re_dfastate_t *, match_last + 1); 846: if (BE (lim_states == NULL, 0)) 847: { 848: ret = REG_ESPACE; 849: goto free_return; 850: } 851: while (1) 852: { 853: memset (lim_states, '\0', 854: sizeof (re_dfastate_t *) * (match_last + 1)); 855: match_ctx_clear_flag (mctx); 856: sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 857: match_last, 0); 858: ret = sift_states_backward (preg, mctx, &sctx); 859: re_node_set_free (&sctx.limits); 860: if (BE (ret != REG_NOERROR, 0)) 861: goto free_return; 862: if (sifted_states[0] != NULL || lim_states[0] != NULL) 863: break; 864: do 865: { 866: --match_last; 867: if (match_last < 0) 868: { 869: ret = REG_NOMATCH; 870: goto free_return; 871: } 872: } while (!mctx->state_log[match_last]->halt); 873: halt_node = check_halt_state_context (preg, 874: mctx->state_log[match_last], 875: mctx, match_last); 876: } 877: ret = merge_state_array (dfa, sifted_states, lim_states, 878: match_last + 1); 879: re_free (lim_states); 880: lim_states = NULL; 881: if (BE (ret != REG_NOERROR, 0)) 882: goto free_return; 883: } 884: else 885: { 886: sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, 887: match_last, 0); 888: ret = sift_states_backward (preg, mctx, &sctx); 889: re_node_set_free (&sctx.limits); 890: if (BE (ret != REG_NOERROR, 0)) 891: goto free_return; 892: } 893: re_free (mctx->state_log); 894: mctx->state_log = sifted_states; 895: sifted_states = NULL; 896: mctx->last_node = halt_node; 897: mctx->match_last = match_last; 898: ret = REG_NOERROR; 899: free_return: 900: re_free (sifted_states); 901: re_free (lim_states); 902: return ret; 903: } 904: 905: /* Acquire an initial state and return it. 906: We must select appropriate initial state depending on the context, 907: since initial states may have constraints like "\<", "^", etc.. */ 908: 909: static inline re_dfastate_t * 910: acquire_init_state_context (err, preg, mctx, idx) 911: reg_errcode_t *err; 912: const regex_t *preg; 913: const re_match_context_t *mctx; 914: int idx; 915: { 916: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 917: 918: *err = REG_NOERROR; 919: if (dfa->init_state->has_constraint) 920: { 921: unsigned int context; 922: context = re_string_context_at (mctx->input, idx - 1, mctx->eflags, 923: preg->newline_anchor); 924: if (IS_WORD_CONTEXT (context)) 925: return dfa->init_state_word; 926: else if (IS_ORDINARY_CONTEXT (context)) 927: return dfa->init_state; 928: else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context)) 929: return dfa->init_state_begbuf; 930: else if (IS_NEWLINE_CONTEXT (context)) 931: return dfa->init_state_nl; 932: else if (IS_BEGBUF_CONTEXT (context)) 933: { 934: /* It is relatively rare case, then calculate on demand. */ 935: return re_acquire_state_context (err, dfa, 936: dfa->init_state->entrance_nodes, 937: context); 938: } 939: else 940: /* Must not happen? */ 941: return dfa->init_state; 942: } 943: else 944: return dfa->init_state; 945: } 946: 947: /* Check whether the regular expression match input string INPUT or not, 948: and return the index where the matching end, return -1 if not match, 949: or return -2 in case of an error. 950: FL_SEARCH means we must search where the matching starts, 951: FL_LONGEST_MATCH means we want the POSIX longest matching. 952: Note that the matcher assume that the maching starts from the current 953: index of the buffer. */ 954: 955: static int 956: check_matching (preg, mctx, fl_search, fl_longest_match) 957: const regex_t *preg; 958: re_match_context_t *mctx; 959: int fl_search, fl_longest_match; 960: { 961: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 962: reg_errcode_t err; 963: int match = 0; 964: int match_last = -1; 965: int cur_str_idx = re_string_cur_idx (mctx->input); 966: re_dfastate_t *cur_state; 967: 968: cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx); 969: /* An initial state must not be NULL(invalid state). */ 970: if (BE (cur_state == NULL, 0)) 971: return -2; 972: if (mctx->state_log != NULL) 973: mctx->state_log[cur_str_idx] = cur_state; 974: 975: /* Check OP_OPEN_SUBEXP in the initial state in case that we use them 976: later. E.g. Processing back references. */ 977: if (dfa->nbackref) 978: { 979: err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0); 980: if (BE (err != REG_NOERROR, 0)) 981: return err; 982: } 983: 984: if (cur_state->has_backref) 985: { 986: err = transit_state_bkref (preg, &cur_state->nodes, mctx); 987: if (BE (err != REG_NOERROR, 0)) 988: return err; 989: } 990: 991: /* If the RE accepts NULL string. */ 992: if (cur_state->halt) 993: { 994: if (!cur_state->has_constraint 995: || check_halt_state_context (preg, cur_state, mctx, cur_str_idx)) 996: { 997: if (!fl_longest_match) 998: return cur_str_idx; 999: else 1000: { 1001: match_last = cur_str_idx; 1002: match = 1; 1003: } 1004: } 1005: } 1006: 1007: while (!re_string_eoi (mctx->input)) 1008: { 1009: cur_state = transit_state (&err, preg, mctx, cur_state, 1010: fl_search && !match); 1011: if (cur_state == NULL) /* Reached at the invalid state or an error. */ 1012: { 1013: cur_str_idx = re_string_cur_idx (mctx->input); 1014: if (BE (err != REG_NOERROR, 0)) 1015: return -2; 1016: if (fl_search && !match) 1017: { 1018: /* Restart from initial state, since we are searching 1019: the point from where matching start. */ 1020: #ifdef RE_ENABLE_I18N 1021: if (MB_CUR_MAX == 1 1022: || re_string_first_byte (mctx->input, cur_str_idx)) 1023: #endif /* RE_ENABLE_I18N */ 1024: cur_state = acquire_init_state_context (&err, preg, mctx, 1025: cur_str_idx); 1026: if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 1027: return -2; 1028: if (mctx->state_log != NULL) 1029: mctx->state_log[cur_str_idx] = cur_state; 1030: } 1031: else if (!fl_longest_match && match) 1032: break; 1033: else /* (fl_longest_match && match) || (!fl_search && !match) */ 1034: { 1035: if (mctx->state_log == NULL) 1036: break; 1037: else 1038: { 1039: int max = mctx->state_log_top; 1040: for (; cur_str_idx <= max; ++cur_str_idx) 1041: if (mctx->state_log[cur_str_idx] != NULL) 1042: break; 1043: if (cur_str_idx > max) 1044: break; 1045: } 1046: } 1047: } 1048: 1049: if (cur_state != NULL && cur_state->halt) 1050: { 1051: /* Reached at a halt state. 1052: Check the halt state can satisfy the current context. */ 1053: if (!cur_state->has_constraint 1054: || check_halt_state_context (preg, cur_state, mctx, 1055: re_string_cur_idx (mctx->input))) 1056: { 1057: /* We found an appropriate halt state. */ 1058: match_last = re_string_cur_idx (mctx->input); 1059: match = 1; 1060: if (!fl_longest_match) 1061: break; 1062: } 1063: } 1064: } 1065: return match_last; 1066: } 1067: 1068: /* Check NODE match the current context. */ 1069: 1070: static int check_halt_node_context (dfa, node, context) 1071: const re_dfa_t *dfa; 1072: int node; 1073: unsigned int context; 1074: { 1075: re_token_type_t type = dfa->nodes[node].type; 1076: unsigned int constraint = dfa->nodes[node].constraint; 1077: if (type != END_OF_RE) 1078: return 0; 1079: if (!constraint) 1080: return 1; 1081: if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context)) 1082: return 0; 1083: return 1; 1084: } 1085: 1086: /* Check the halt state STATE match the current context. 1087: Return 0 if not match, if the node, STATE has, is a halt node and 1088: match the context, return the node. */ 1089: 1090: static int 1091: check_halt_state_context (preg, state, mctx, idx) 1092: const regex_t *preg; 1093: const re_dfastate_t *state; 1094: const re_match_context_t *mctx; 1095: int idx; 1096: { 1097: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1098: int i; 1099: unsigned int context; 1100: #ifdef DEBUG 1101: assert (state->halt); 1102: #endif 1103: context = re_string_context_at (mctx->input, idx, mctx->eflags, 1104: preg->newline_anchor); 1105: for (i = 0; i < state->nodes.nelem; ++i) 1106: if (check_halt_node_context (dfa, state->nodes.elems[i], context)) 1107: return state->nodes.elems[i]; 1108: return 0; 1109: } 1110: 1111: /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA 1112: corresponding to the DFA). 1113: Return the destination node, and update EPS_VIA_NODES, return -1 in case 1114: of errors. */ 1115: 1116: static int 1117: proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs) 1118: const regex_t *preg; 1119: regmatch_t *regs; 1120: const re_match_context_t *mctx; 1121: int nregs, *pidx, node; 1122: re_node_set *eps_via_nodes; 1123: struct re_fail_stack_t *fs; 1124: { 1125: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1126: int i, err, dest_node; 1127: dest_node = -1; 1128: if (IS_EPSILON_NODE (dfa->nodes[node].type)) 1129: { 1130: re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes; 1131: int ndest, dest_nodes[2]; 1132: err = re_node_set_insert (eps_via_nodes, node); 1133: if (BE (err < 0, 0)) 1134: return -1; 1135: /* Pick up valid destinations. */ 1136: for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i) 1137: { 1138: int candidate = dfa->edests[node].elems[i]; 1139: if (!re_node_set_contains (cur_nodes, candidate)) 1140: continue; 1141: dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0]; 1142: dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1]; 1143: ++ndest; 1144: } 1145: if (ndest <= 1) 1146: return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0); 1147: /* In order to avoid infinite loop like "(a*)*". */ 1148: if (re_node_set_contains (eps_via_nodes, dest_nodes[0])) 1149: return dest_nodes[1]; 1150: if (fs != NULL) 1151: push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes); 1152: return dest_nodes[0]; 1153: } 1154: else 1155: { 1156: int naccepted = 0; 1157: re_token_type_t type = dfa->nodes[node].type; 1158: 1159: #ifdef RE_ENABLE_I18N 1160: if (ACCEPT_MB_NODE (type)) 1161: naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx); 1162: else 1163: #endif /* RE_ENABLE_I18N */ 1164: if (type == OP_BACK_REF) 1165: { 1166: int subexp_idx = dfa->nodes[node].opr.idx; 1167: naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so; 1168: if (fs != NULL) 1169: { 1170: if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1) 1171: return -1; 1172: else if (naccepted) 1173: { 1174: char *buf = (char *) re_string_get_buffer (mctx->input); 1175: if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx, 1176: naccepted) != 0) 1177: return -1; 1178: } 1179: } 1180: 1181: if (naccepted == 0) 1182: { 1183: err = re_node_set_insert (eps_via_nodes, node); 1184: if (BE (err < 0, 0)) 1185: return -2; 1186: dest_node = dfa->edests[node].elems[0]; 1187: if (re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1188: dest_node)) 1189: return dest_node; 1190: } 1191: } 1192: 1193: if (naccepted != 0 1194: || check_node_accept (preg, dfa->nodes + node, mctx, *pidx)) 1195: { 1196: dest_node = dfa->nexts[node]; 1197: *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted; 1198: if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL 1199: || !re_node_set_contains (&mctx->state_log[*pidx]->nodes, 1200: dest_node))) 1201: return -1; 1202: re_node_set_empty (eps_via_nodes); 1203: return dest_node; 1204: } 1205: } 1206: return -1; 1207: } 1208: 1209: static reg_errcode_t 1210: push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes) 1211: struct re_fail_stack_t *fs; 1212: int str_idx, *dests, nregs; 1213: regmatch_t *regs; 1214: re_node_set *eps_via_nodes; 1215: { 1216: reg_errcode_t err; 1217: int num = fs->num++; 1218: if (fs->num == fs->alloc) 1219: { 1220: struct re_fail_stack_ent_t *new_array; 1221: fs->alloc *= 2; 1222: new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t) 1223: * fs->alloc)); 1224: if (new_array == NULL) 1225: return REG_ESPACE; 1226: fs->stack = new_array; 1227: } 1228: fs->stack[num].idx = str_idx; 1229: fs->stack[num].node = dests[1]; 1230: fs->stack[num].regs = re_malloc (regmatch_t, nregs); 1231: memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs); 1232: err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes); 1233: return err; 1234: } 1235: 1236: static int 1237: pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes) 1238: struct re_fail_stack_t *fs; 1239: int *pidx, nregs; 1240: regmatch_t *regs; 1241: re_node_set *eps_via_nodes; 1242: { 1243: int num = --fs->num; 1244: assert (num >= 0); 1245: *pidx = fs->stack[num].idx; 1246: memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs); 1247: re_node_set_free (eps_via_nodes); 1248: re_free (fs->stack[num].regs); 1249: *eps_via_nodes = fs->stack[num].eps_via_nodes; 1250: return fs->stack[num].node; 1251: } 1252: 1253: /* Set the positions where the subexpressions are starts/ends to registers 1254: PMATCH. 1255: Note: We assume that pmatch[0] is already set, and 1256: pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */ 1257: 1258: static reg_errcode_t 1259: set_regs (preg, mctx, nmatch, pmatch, fl_backtrack) 1260: const regex_t *preg; 1261: const re_match_context_t *mctx; 1262: size_t nmatch; 1263: regmatch_t *pmatch; 1264: int fl_backtrack; 1265: { 1266: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1267: int idx, cur_node, real_nmatch; 1268: re_node_set eps_via_nodes; 1269: struct re_fail_stack_t *fs; 1270: struct re_fail_stack_t fs_body = {0, 2, NULL}; 1271: #ifdef DEBUG 1272: assert (nmatch > 1); 1273: assert (mctx->state_log != NULL); 1274: #endif 1275: if (fl_backtrack) 1276: { 1277: fs = &fs_body; 1278: fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc); 1279: } 1280: else 1281: fs = NULL; 1282: cur_node = dfa->init_node; 1283: real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1; 1284: re_node_set_init_empty (&eps_via_nodes); 1285: for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;) 1286: { 1287: update_regs (dfa, pmatch, cur_node, idx, real_nmatch); 1288: if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node) 1289: { 1290: int reg_idx; 1291: if (fs) 1292: { 1293: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx) 1294: if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1) 1295: break; 1296: if (reg_idx == nmatch) 1297: { 1298: re_node_set_free (&eps_via_nodes); 1299: return free_fail_stack_return (fs); 1300: } 1301: cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1302: &eps_via_nodes); 1303: } 1304: else 1305: { 1306: re_node_set_free (&eps_via_nodes); 1307: return REG_NOERROR; 1308: } 1309: } 1310: 1311: /* Proceed to next node. */ 1312: cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node, 1313: &eps_via_nodes, fs); 1314: 1315: if (BE (cur_node < 0, 0)) 1316: { 1317: if (cur_node == -2) 1318: return REG_ESPACE; 1319: if (fs) 1320: cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, 1321: &eps_via_nodes); 1322: else 1323: { 1324: re_node_set_free (&eps_via_nodes); 1325: return REG_NOMATCH; 1326: } 1327: } 1328: } 1329: re_node_set_free (&eps_via_nodes); 1330: return free_fail_stack_return (fs); 1331: } 1332: 1333: static reg_errcode_t 1334: free_fail_stack_return (fs) 1335: struct re_fail_stack_t *fs; 1336: { 1337: if (fs) 1338: { 1339: int fs_idx; 1340: for (fs_idx = 0; fs_idx < fs->num; ++fs_idx) 1341: { 1342: re_node_set_free (&fs->stack[fs_idx].eps_via_nodes); 1343: re_free (fs->stack[fs_idx].regs); 1344: } 1345: re_free (fs->stack); 1346: } 1347: return REG_NOERROR; 1348: } 1349: 1350: static void 1351: update_regs (dfa, pmatch, cur_node, cur_idx, nmatch) 1352: re_dfa_t *dfa; 1353: regmatch_t *pmatch; 1354: int cur_node, cur_idx, nmatch; 1355: { 1356: int type = dfa->nodes[cur_node].type; 1357: int reg_num; 1358: if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP) 1359: return; 1360: reg_num = dfa->nodes[cur_node].opr.idx + 1; 1361: if (reg_num >= nmatch) 1362: return; 1363: if (type == OP_OPEN_SUBEXP) 1364: { 1365: /* We are at the first node of this sub expression. */ 1366: pmatch[reg_num].rm_so = cur_idx; 1367: pmatch[reg_num].rm_eo = -1; 1368: } 1369: else if (type == OP_CLOSE_SUBEXP) 1370: /* We are at the first node of this sub expression. */ 1371: pmatch[reg_num].rm_eo = cur_idx; 1372: } 1373: 1374: #define NUMBER_OF_STATE 1 1375: 1376: /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0 1377: and sift the nodes in each states according to the following rules. 1378: Updated state_log will be wrote to STATE_LOG. 1379: 1380: Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if... 1381: 1. When STR_IDX == MATCH_LAST(the last index in the state_log): 1382: If `a' isn't the LAST_NODE and `a' can't epsilon transit to 1383: the LAST_NODE, we throw away the node `a'. 1384: 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts 1385: string `s' and transit to `b': 1386: i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw 1387: away the node `a'. 1388: ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is 1389: throwed away, we throw away the node `a'. 1390: 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b': 1391: i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the 1392: node `a'. 1393: ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away, 1394: we throw away the node `a'. */ 1395: 1396: #define STATE_NODE_CONTAINS(state,node) \ 1397: ((state) != NULL && re_node_set_contains (&(state)->nodes, node)) 1398: 1399: static reg_errcode_t 1400: sift_states_backward (preg, mctx, sctx) 1401: const regex_t *preg; 1402: re_match_context_t *mctx; 1403: re_sift_context_t *sctx; 1404: { 1405: reg_errcode_t err; 1406: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1407: int null_cnt = 0; 1408: int str_idx = sctx->last_str_idx; 1409: re_node_set cur_dest; 1410: re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */ 1411: 1412: #ifdef DEBUG 1413: assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL); 1414: #endif 1415: cur_src = &mctx->state_log[str_idx]->nodes; 1416: 1417: /* Build sifted state_log[str_idx]. It has the nodes which can epsilon 1418: transit to the last_node and the last_node itself. */ 1419: err = re_node_set_init_1 (&cur_dest, sctx->last_node); 1420: if (BE (err != REG_NOERROR, 0)) 1421: return err; 1422: err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest); 1423: if (BE (err != REG_NOERROR, 0)) 1424: goto free_return; 1425: 1426: /* Then check each states in the state_log. */ 1427: while (str_idx > 0) 1428: { 1429: int i, ret; 1430: /* Update counters. */ 1431: null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0; 1432: if (null_cnt > mctx->max_mb_elem_len) 1433: { 1434: memset (sctx->sifted_states, '\0', 1435: sizeof (re_dfastate_t *) * str_idx); 1436: re_node_set_free (&cur_dest); 1437: return REG_NOERROR; 1438: } 1439: re_node_set_empty (&cur_dest); 1440: --str_idx; 1441: cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1442: : &mctx->state_log[str_idx]->nodes); 1443: 1444: /* Then build the next sifted state. 1445: We build the next sifted state on `cur_dest', and update 1446: `sifted_states[str_idx]' with `cur_dest'. 1447: Note: 1448: `cur_dest' is the sifted state from `state_log[str_idx + 1]'. 1449: `cur_src' points the node_set of the old `state_log[str_idx]'. */ 1450: for (i = 0; i < cur_src->nelem; i++) 1451: { 1452: int prev_node = cur_src->elems[i]; 1453: int naccepted = 0; 1454: re_token_type_t type = dfa->nodes[prev_node].type; 1455: 1456: if (IS_EPSILON_NODE(type)) 1457: continue; 1458: #ifdef RE_ENABLE_I18N 1459: /* If the node may accept `multi byte'. */ 1460: if (ACCEPT_MB_NODE (type)) 1461: naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node, 1462: str_idx, sctx->last_str_idx); 1463: 1464: #endif /* RE_ENABLE_I18N */ 1465: /* We don't check backreferences here. 1466: See update_cur_sifted_state(). */ 1467: 1468: if (!naccepted 1469: && check_node_accept (preg, dfa->nodes + prev_node, mctx, 1470: str_idx) 1471: && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1], 1472: dfa->nexts[prev_node])) 1473: naccepted = 1; 1474: 1475: if (naccepted == 0) 1476: continue; 1477: 1478: if (sctx->limits.nelem) 1479: { 1480: int to_idx = str_idx + naccepted; 1481: if (check_dst_limits (dfa, &sctx->limits, mctx, 1482: dfa->nexts[prev_node], to_idx, 1483: prev_node, str_idx)) 1484: continue; 1485: } 1486: ret = re_node_set_insert (&cur_dest, prev_node); 1487: if (BE (ret == -1, 0)) 1488: { 1489: err = REG_ESPACE; 1490: goto free_return; 1491: } 1492: } 1493: 1494: /* Add all the nodes which satisfy the following conditions: 1495: - It can epsilon transit to a node in CUR_DEST. 1496: - It is in CUR_SRC. 1497: And update state_log. */ 1498: err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest); 1499: if (BE (err != REG_NOERROR, 0)) 1500: goto free_return; 1501: } 1502: err = REG_NOERROR; 1503: free_return: 1504: re_node_set_free (&cur_dest); 1505: return err; 1506: } 1507: 1508: /* Helper functions. */ 1509: 1510: static inline reg_errcode_t 1511: clean_state_log_if_need (mctx, next_state_log_idx) 1512: re_match_context_t *mctx; 1513: int next_state_log_idx; 1514: { 1515: int top = mctx->state_log_top; 1516: 1517: if (next_state_log_idx >= mctx->input->bufs_len 1518: || (next_state_log_idx >= mctx->input->valid_len 1519: && mctx->input->valid_len < mctx->input->len)) 1520: { 1521: reg_errcode_t err; 1522: err = extend_buffers (mctx); 1523: if (BE (err != REG_NOERROR, 0)) 1524: return err; 1525: } 1526: 1527: if (top < next_state_log_idx) 1528: { 1529: memset (mctx->state_log + top + 1, '\0', 1530: sizeof (re_dfastate_t *) * (next_state_log_idx - top)); 1531: mctx->state_log_top = next_state_log_idx; 1532: } 1533: return REG_NOERROR; 1534: } 1535: 1536: static reg_errcode_t 1537: merge_state_array (dfa, dst, src, num) 1538: re_dfa_t *dfa; 1539: re_dfastate_t **dst; 1540: re_dfastate_t **src; 1541: int num; 1542: { 1543: int st_idx; 1544: reg_errcode_t err; 1545: for (st_idx = 0; st_idx < num; ++st_idx) 1546: { 1547: if (dst[st_idx] == NULL) 1548: dst[st_idx] = src[st_idx]; 1549: else if (src[st_idx] != NULL) 1550: { 1551: re_node_set merged_set; 1552: err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes, 1553: &src[st_idx]->nodes); 1554: if (BE (err != REG_NOERROR, 0)) 1555: return err; 1556: dst[st_idx] = re_acquire_state (&err, dfa, &merged_set); 1557: re_node_set_free (&merged_set); 1558: if (BE (err != REG_NOERROR, 0)) 1559: return err; 1560: } 1561: } 1562: return REG_NOERROR; 1563: } 1564: 1565: static reg_errcode_t 1566: update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes) 1567: const regex_t *preg; 1568: re_match_context_t *mctx; 1569: re_sift_context_t *sctx; 1570: int str_idx; 1571: re_node_set *dest_nodes; 1572: { 1573: reg_errcode_t err; 1574: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1575: const re_node_set *candidates; 1576: candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1577: : &mctx->state_log[str_idx]->nodes); 1578: 1579: /* At first, add the nodes which can epsilon transit to a node in 1580: DEST_NODE. */ 1581: if (dest_nodes->nelem) 1582: { 1583: err = add_epsilon_src_nodes (dfa, dest_nodes, candidates); 1584: if (BE (err != REG_NOERROR, 0)) 1585: return err; 1586: } 1587: 1588: /* Then, check the limitations in the current sift_context. */ 1589: if (dest_nodes->nelem && sctx->limits.nelem) 1590: { 1591: err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits, 1592: mctx->bkref_ents, str_idx); 1593: if (BE (err != REG_NOERROR, 0)) 1594: return err; 1595: } 1596: 1597: /* Update state_log. */ 1598: sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes); 1599: if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0)) 1600: return err; 1601: 1602: if ((mctx->state_log[str_idx] != NULL 1603: && mctx->state_log[str_idx]->has_backref)) 1604: { 1605: err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes); 1606: if (BE (err != REG_NOERROR, 0)) 1607: return err; 1608: } 1609: return REG_NOERROR; 1610: } 1611: 1612: static reg_errcode_t 1613: add_epsilon_src_nodes (dfa, dest_nodes, candidates) 1614: re_dfa_t *dfa; 1615: re_node_set *dest_nodes; 1616: const re_node_set *candidates; 1617: { 1618: reg_errcode_t err; 1619: int src_idx; 1620: re_node_set src_copy; 1621: 1622: err = re_node_set_init_copy (&src_copy, dest_nodes); 1623: if (BE (err != REG_NOERROR, 0)) 1624: return err; 1625: for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx) 1626: { 1627: err = re_node_set_add_intersect (dest_nodes, candidates, 1628: dfa->inveclosures 1629: + src_copy.elems[src_idx]); 1630: if (BE (err != REG_NOERROR, 0)) 1631: { 1632: re_node_set_free (&src_copy); 1633: return err; 1634: } 1635: } 1636: re_node_set_free (&src_copy); 1637: return REG_NOERROR; 1638: } 1639: 1640: static reg_errcode_t 1641: sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates) 1642: re_dfa_t *dfa; 1643: int node; 1644: re_node_set *dest_nodes; 1645: const re_node_set *candidates; 1646: { 1647: int ecl_idx; 1648: reg_errcode_t err; 1649: re_node_set *inv_eclosure = dfa->inveclosures + node; 1650: re_node_set except_nodes; 1651: re_node_set_init_empty (&except_nodes); 1652: for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1653: { 1654: int cur_node = inv_eclosure->elems[ecl_idx]; 1655: if (cur_node == node) 1656: continue; 1657: if (IS_EPSILON_NODE (dfa->nodes[cur_node].type)) 1658: { 1659: int edst1 = dfa->edests[cur_node].elems[0]; 1660: int edst2 = ((dfa->edests[cur_node].nelem > 1) 1661: ? dfa->edests[cur_node].elems[1] : -1); 1662: if ((!re_node_set_contains (inv_eclosure, edst1) 1663: && re_node_set_contains (dest_nodes, edst1)) 1664: || (edst2 > 0 1665: && !re_node_set_contains (inv_eclosure, edst2) 1666: && re_node_set_contains (dest_nodes, edst2))) 1667: { 1668: err = re_node_set_add_intersect (&except_nodes, candidates, 1669: dfa->inveclosures + cur_node); 1670: if (BE (err != REG_NOERROR, 0)) 1671: { 1672: re_node_set_free (&except_nodes); 1673: return err; 1674: } 1675: } 1676: } 1677: } 1678: for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx) 1679: { 1680: int cur_node = inv_eclosure->elems[ecl_idx]; 1681: if (!re_node_set_contains (&except_nodes, cur_node)) 1682: { 1683: int idx = re_node_set_contains (dest_nodes, cur_node) - 1; 1684: re_node_set_remove_at (dest_nodes, idx); 1685: } 1686: } 1687: re_node_set_free (&except_nodes); 1688: return REG_NOERROR; 1689: } 1690: 1691: static int 1692: check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx) 1693: re_dfa_t *dfa; 1694: re_node_set *limits; 1695: re_match_context_t *mctx; 1696: int dst_node, dst_idx, src_node, src_idx; 1697: { 1698: int lim_idx, src_pos, dst_pos; 1699: 1700: for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1701: { 1702: int subexp_idx; 1703: struct re_backref_cache_entry *ent; 1704: ent = mctx->bkref_ents + limits->elems[lim_idx]; 1705: subexp_idx = dfa->nodes[ent->node].opr.idx - 1; 1706: 1707: dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx], 1708: dfa->eclosures + dst_node, 1709: subexp_idx, dst_node, dst_idx); 1710: src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx], 1711: dfa->eclosures + src_node, 1712: subexp_idx, src_node, src_idx); 1713: 1714: /* In case of: 1715: <src> <dst> ( <subexp> ) 1716: ( <subexp> ) <src> <dst> 1717: ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */ 1718: if (src_pos == dst_pos) 1719: continue; /* This is unrelated limitation. */ 1720: else 1721: return 1; 1722: } 1723: return 0; 1724: } 1725: 1726: static int 1727: check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node, 1728: str_idx) 1729: re_dfa_t *dfa; 1730: re_match_context_t *mctx; 1731: re_node_set *eclosures; 1732: int limit, subexp_idx, node, str_idx; 1733: { 1734: struct re_backref_cache_entry *lim = mctx->bkref_ents + limit; 1735: int pos = (str_idx < lim->subexp_from ? -1 1736: : (lim->subexp_to < str_idx ? 1 : 0)); 1737: if (pos == 0 1738: && (str_idx == lim->subexp_from || str_idx == lim->subexp_to)) 1739: { 1740: int node_idx; 1741: for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx) 1742: { 1743: int node = eclosures->elems[node_idx]; 1744: re_token_type_t type= dfa->nodes[node].type; 1745: if (type == OP_BACK_REF) 1746: { 1747: int bi = search_cur_bkref_entry (mctx, str_idx); 1748: for (; bi < mctx->nbkref_ents; ++bi) 1749: { 1750: struct re_backref_cache_entry *ent = mctx->bkref_ents + bi; 1751: if (ent->str_idx > str_idx) 1752: break; 1753: if (ent->node == node && ent->subexp_from == ent->subexp_to) 1754: { 1755: int cpos, dst; 1756: dst = dfa->edests[node].elems[0]; 1757: cpos = check_dst_limits_calc_pos (dfa, mctx, limit, 1758: dfa->eclosures + dst, 1759: subexp_idx, dst, 1760: str_idx); 1761: if ((str_idx == lim->subexp_from && cpos == -1) 1762: || (str_idx == lim->subexp_to && cpos == 0)) 1763: return cpos; 1764: } 1765: } 1766: } 1767: if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx 1768: && str_idx == lim->subexp_from) 1769: { 1770: pos = -1; 1771: break; 1772: } 1773: if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx 1774: && str_idx == lim->subexp_to) 1775: break; 1776: } 1777: if (node_idx == eclosures->nelem && str_idx == lim->subexp_to) 1778: pos = 1; 1779: } 1780: return pos; 1781: } 1782: 1783: /* Check the limitations of sub expressions LIMITS, and remove the nodes 1784: which are against limitations from DEST_NODES. */ 1785: 1786: static reg_errcode_t 1787: check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx) 1788: re_dfa_t *dfa; 1789: re_node_set *dest_nodes; 1790: const re_node_set *candidates; 1791: re_node_set *limits; 1792: struct re_backref_cache_entry *bkref_ents; 1793: int str_idx; 1794: { 1795: reg_errcode_t err; 1796: int node_idx, lim_idx; 1797: 1798: for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx) 1799: { 1800: int subexp_idx; 1801: struct re_backref_cache_entry *ent; 1802: ent = bkref_ents + limits->elems[lim_idx]; 1803: 1804: if (str_idx <= ent->subexp_from || ent->str_idx < str_idx) 1805: continue; /* This is unrelated limitation. */ 1806: 1807: subexp_idx = dfa->nodes[ent->node].opr.idx - 1; 1808: if (ent->subexp_to == str_idx) 1809: { 1810: int ops_node = -1; 1811: int cls_node = -1; 1812: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1813: { 1814: int node = dest_nodes->elems[node_idx]; 1815: re_token_type_t type= dfa->nodes[node].type; 1816: if (type == OP_OPEN_SUBEXP 1817: && subexp_idx == dfa->nodes[node].opr.idx) 1818: ops_node = node; 1819: else if (type == OP_CLOSE_SUBEXP 1820: && subexp_idx == dfa->nodes[node].opr.idx) 1821: cls_node = node; 1822: } 1823: 1824: /* Check the limitation of the open subexpression. */ 1825: /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */ 1826: if (ops_node >= 0) 1827: { 1828: err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes, 1829: candidates); 1830: if (BE (err != REG_NOERROR, 0)) 1831: return err; 1832: } 1833: /* Check the limitation of the close subexpression. */ 1834: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1835: { 1836: int node = dest_nodes->elems[node_idx]; 1837: if (!re_node_set_contains (dfa->inveclosures + node, cls_node) 1838: && !re_node_set_contains (dfa->eclosures + node, cls_node)) 1839: { 1840: /* It is against this limitation. 1841: Remove it form the current sifted state. */ 1842: err = sub_epsilon_src_nodes(dfa, node, dest_nodes, 1843: candidates); 1844: if (BE (err != REG_NOERROR, 0)) 1845: return err; 1846: --node_idx; 1847: } 1848: } 1849: } 1850: else /* (ent->subexp_to != str_idx) */ 1851: { 1852: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx) 1853: { 1854: int node = dest_nodes->elems[node_idx]; 1855: re_token_type_t type= dfa->nodes[node].type; 1856: if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP) 1857: { 1858: if (subexp_idx != dfa->nodes[node].opr.idx) 1859: continue; 1860: if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx) 1861: || (type == OP_OPEN_SUBEXP)) 1862: { 1863: /* It is against this limitation. 1864: Remove it form the current sifted state. */ 1865: err = sub_epsilon_src_nodes(dfa, node, dest_nodes, 1866: candidates); 1867: if (BE (err != REG_NOERROR, 0)) 1868: return err; 1869: } 1870: } 1871: } 1872: } 1873: } 1874: return REG_NOERROR; 1875: } 1876: 1877: static reg_errcode_t 1878: sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes) 1879: const regex_t *preg; 1880: re_match_context_t *mctx; 1881: re_sift_context_t *sctx; 1882: int str_idx; 1883: re_node_set *dest_nodes; 1884: { 1885: reg_errcode_t err; 1886: re_dfa_t *dfa = (re_dfa_t *)preg->buffer; 1887: int node_idx, node; 1888: re_sift_context_t local_sctx; 1889: const re_node_set *candidates; 1890: candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set 1891: : &mctx->state_log[str_idx]->nodes); 1892: local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */ 1893: 1894: for (node_idx = 0; node_idx < candidates->nelem; ++node_idx) 1895: { 1896: int cur_bkref_idx = re_string_cur_idx (mctx->input); 1897: re_token_type_t type; 1898: node = candidates->elems[node_idx]; 1899: type = dfa->nodes[node].type; 1900: if (node == sctx->cur_bkref && str_idx == cur_bkref_idx) 1901: continue; 1902: /* Avoid infinite loop for the REs like "()\1+". */ 1903: if (node == sctx->last_node && str_idx == sctx->last_str_idx) 1904: continue; 1905: if (type == OP_BACK_REF) 1906: { 1907: int enabled_idx = search_cur_bkref_entry (mctx, str_idx); 1908: for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx) 1909: { 1910: int disabled_idx, subexp_len, to_idx, dst_node; 1911: struct re_backref_cache_entry *entry; 1912: entry = mctx->bkref_ents + enabled_idx; 1913: if (entry->str_idx > str_idx) 1914: break; 1915: if (entry->node != node) 1916: continue; 1917: subexp_len = entry->subexp_to - entry->subexp_from; 1918: to_idx = str_idx + subexp_len; 1919: dst_node = (subexp_len ? dfa->nexts[node] 1920: : dfa->edests[node].elems[0]); 1921: 1922: if (to_idx > sctx->last_str_idx 1923: || sctx->sifted_states[to_idx] == NULL 1924: || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], 1925: dst_node) 1926: || check_dst_limits (dfa, &sctx->limits, mctx, node, 1927: str_idx, dst_node, to_idx)) 1928: continue; 1929: { 1930: re_dfastate_t *cur_state; 1931: entry->flag = 0; 1932: for (disabled_idx = enabled_idx + 1; 1933: disabled_idx < mctx->nbkref_ents; ++disabled_idx) 1934: { 1935: struct re_backref_cache_entry *entry2; 1936: entry2 = mctx->bkref_ents + disabled_idx; 1937: if (entry2->str_idx > str_idx) 1938: break; 1939: entry2->flag = (entry2->node == node) ? 1 : entry2->flag; 1940: } 1941: 1942: if (local_sctx.sifted_states == NULL) 1943: { 1944: local_sctx = *sctx; 1945: err = re_node_set_init_copy (&local_sctx.limits, 1946: &sctx->limits); 1947: if (BE (err != REG_NOERROR, 0)) 1948: goto free_return; 1949: } 1950: local_sctx.last_node = node; 1951: local_sctx.last_str_idx = str_idx; 1952: err = re_node_set_insert (&local_sctx.limits, enabled_idx); 1953: if (BE (err < 0, 0)) 1954: { 1955: err = REG_ESPACE; 1956: goto free_return; 1957: } 1958: cur_state = local_sctx.sifted_states[str_idx]; 1959: err = sift_states_backward (preg, mctx, &local_sctx); 1960: if (BE (err != REG_NOERROR, 0)) 1961: goto free_return; 1962: if (sctx->limited_states != NULL) 1963: { 1964: err = merge_state_array (dfa, sctx->limited_states, 1965: local_sctx.sifted_states, 1966: str_idx + 1); 1967: if (BE (err != REG_NOERROR, 0)) 1968: goto free_return; 1969: } 1970: local_sctx.sifted_states[str_idx] = cur_state; 1971: re_node_set_remove (&local_sctx.limits, enabled_idx); 1972: /* We must not use the variable entry here, since 1973: mctx->bkref_ents might be realloced. */ 1974: mctx->bkref_ents[enabled_idx].flag = 1; 1975: } 1976: } 1977: enabled_idx = search_cur_bkref_entry (mctx, str_idx); 1978: for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx) 1979: { 1980: struct re_backref_cache_entry *entry; 1981: entry = mctx->bkref_ents + enabled_idx; 1982: if (entry->str_idx > str_idx) 1983: break; 1984: if (entry->node == node) 1985: entry->flag = 0; 1986: } 1987: } 1988: } 1989: err = REG_NOERROR; 1990: free_return: 1991: if (local_sctx.sifted_states != NULL) 1992: { 1993: re_node_set_free (&local_sctx.limits); 1994: } 1995: 1996: return err; 1997: } 1998: 1999: 2000: #ifdef RE_ENABLE_I18N 2001: static int 2002: sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx) 2003: const regex_t *preg; 2004: const re_match_context_t *mctx; 2005: re_sift_context_t *sctx; 2006: int node_idx, str_idx, max_str_idx; 2007: { 2008: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2009: int naccepted; 2010: /* Check the node can accept `multi byte'. */ 2011: naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx); 2012: if (naccepted > 0 && str_idx + naccepted <= max_str_idx && 2013: !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted], 2014: dfa->nexts[node_idx])) 2015: /* The node can't accept the `multi byte', or the 2016: destination was already throwed away, then the node 2017: could't accept the current input `multi byte'. */ 2018: naccepted = 0; 2019: /* Otherwise, it is sure that the node could accept 2020: `naccepted' bytes input. */ 2021: return naccepted; 2022: } 2023: #endif /* RE_ENABLE_I18N */ 2024: 2025: 2026: /* Functions for state transition. */ 2027: 2028: /* Return the next state to which the current state STATE will transit by 2029: accepting the current input byte, and update STATE_LOG if necessary. 2030: If STATE can accept a multibyte char/collating element/back reference 2031: update the destination of STATE_LOG. */ 2032: 2033: static re_dfastate_t * 2034: transit_state (err, preg, mctx, state, fl_search) 2035: reg_errcode_t *err; 2036: const regex_t *preg; 2037: re_match_context_t *mctx; 2038: re_dfastate_t *state; 2039: int fl_search; 2040: { 2041: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2042: re_dfastate_t **trtable, *next_state; 2043: unsigned char ch; 2044: int cur_idx; 2045: 2046: if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len 2047: || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len 2048: && mctx->input->valid_len < mctx->input->len)) 2049: { 2050: *err = extend_buffers (mctx); 2051: if (BE (*err != REG_NOERROR, 0)) 2052: return NULL; 2053: } 2054: 2055: *err = REG_NOERROR; 2056: if (state == NULL) 2057: { 2058: next_state = state; 2059: re_string_skip_bytes (mctx->input, 1); 2060: } 2061: else 2062: { 2063: #ifdef RE_ENABLE_I18N 2064: /* If the current state can accept multibyte. */ 2065: if (state->accept_mb) 2066: { 2067: *err = transit_state_mb (preg, state, mctx); 2068: if (BE (*err != REG_NOERROR, 0)) 2069: return NULL; 2070: } 2071: #endif /* RE_ENABLE_I18N */ 2072: 2073: /* Then decide the next state with the single byte. */ 2074: if (1) 2075: { 2076: /* Use transition table */ 2077: ch = re_string_fetch_byte (mctx->input); 2078: trtable = fl_search ? state->trtable_search : state->trtable; 2079: if (trtable == NULL) 2080: { 2081: trtable = build_trtable (preg, state, fl_search); 2082: if (fl_search) 2083: state->trtable_search = trtable; 2084: else 2085: state->trtable = trtable; 2086: } 2087: next_state = trtable[ch]; 2088: } 2089: else 2090: { 2091: /* don't use transition table */ 2092: next_state = transit_state_sb (err, preg, state, fl_search, mctx); 2093: if (BE (next_state == NULL && err != REG_NOERROR, 0)) 2094: return NULL; 2095: } 2096: } 2097: 2098: cur_idx = re_string_cur_idx (mctx->input); 2099: /* Update the state_log if we need. */ 2100: if (mctx->state_log != NULL) 2101: { 2102: if (cur_idx > mctx->state_log_top) 2103: { 2104: mctx->state_log[cur_idx] = next_state; 2105: mctx->state_log_top = cur_idx; 2106: } 2107: else if (mctx->state_log[cur_idx] == 0) 2108: { 2109: mctx->state_log[cur_idx] = next_state; 2110: } 2111: else 2112: { 2113: re_dfastate_t *pstate; 2114: unsigned int context; 2115: re_node_set next_nodes, *log_nodes, *table_nodes = NULL; 2116: /* If (state_log[cur_idx] != 0), it implies that cur_idx is 2117: the destination of a multibyte char/collating element/ 2118: back reference. Then the next state is the union set of 2119: these destinations and the results of the transition table. */ 2120: pstate = mctx->state_log[cur_idx]; 2121: log_nodes = pstate->entrance_nodes; 2122: if (next_state != NULL) 2123: { 2124: table_nodes = next_state->entrance_nodes; 2125: *err = re_node_set_init_union (&next_nodes, table_nodes, 2126: log_nodes); 2127: if (BE (*err != REG_NOERROR, 0)) 2128: return NULL; 2129: } 2130: else 2131: next_nodes = *log_nodes; 2132: /* Note: We already add the nodes of the initial state, 2133: then we don't need to add them here. */ 2134: 2135: context = re_string_context_at (mctx->input, 2136: re_string_cur_idx (mctx->input) - 1, 2137: mctx->eflags, preg->newline_anchor); 2138: next_state = mctx->state_log[cur_idx] 2139: = re_acquire_state_context (err, dfa, &next_nodes, context); 2140: /* We don't need to check errors here, since the return value of 2141: this function is next_state and ERR is already set. */ 2142: 2143: if (table_nodes != NULL) 2144: re_node_set_free (&next_nodes); 2145: } 2146: } 2147: 2148: /* Check OP_OPEN_SUBEXP in the current state in case that we use them 2149: later. We must check them here, since the back references in the 2150: next state might use them. */ 2151: if (dfa->nbackref && next_state/* && fl_process_bkref */) 2152: { 2153: *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes, 2154: cur_idx); 2155: if (BE (*err != REG_NOERROR, 0)) 2156: return NULL; 2157: } 2158: 2159: /* If the next state has back references. */ 2160: if (next_state != NULL && next_state->has_backref) 2161: { 2162: *err = transit_state_bkref (preg, &next_state->nodes, mctx); 2163: if (BE (*err != REG_NOERROR, 0)) 2164: return NULL; 2165: next_state = mctx->state_log[cur_idx]; 2166: } 2167: return next_state; 2168: } 2169: 2170: /* Helper functions for transit_state. */ 2171: 2172: /* From the node set CUR_NODES, pick up the nodes whose types are 2173: OP_OPEN_SUBEXP and which have corresponding back references in the regular 2174: expression. And register them to use them later for evaluating the 2175: correspoding back references. */ 2176: 2177: static reg_errcode_t 2178: check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx) 2179: re_dfa_t *dfa; 2180: re_match_context_t *mctx; 2181: re_node_set *cur_nodes; 2182: int str_idx; 2183: { 2184: int node_idx; 2185: reg_errcode_t err; 2186: 2187: /* TODO: This isn't efficient. 2188: Because there might be more than one nodes whose types are 2189: OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2190: nodes. 2191: E.g. RE: (a){2} */ 2192: for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx) 2193: { 2194: int node = cur_nodes->elems[node_idx]; 2195: if (dfa->nodes[node].type == OP_OPEN_SUBEXP 2196: && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx)) 2197: { 2198: err = match_ctx_add_subtop (mctx, node, str_idx); 2199: if (BE (err != REG_NOERROR, 0)) 2200: return err; 2201: } 2202: } 2203: return REG_NOERROR; 2204: } 2205: 2206: /* Return the next state to which the current state STATE will transit by 2207: accepting the current input byte. */ 2208: 2209: static re_dfastate_t * 2210: transit_state_sb (err, preg, state, fl_search, mctx) 2211: reg_errcode_t *err; 2212: const regex_t *preg; 2213: re_dfastate_t *state; 2214: int fl_search; 2215: re_match_context_t *mctx; 2216: { 2217: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2218: re_node_set next_nodes; 2219: re_dfastate_t *next_state; 2220: int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input); 2221: unsigned int context; 2222: 2223: *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1); 2224: if (BE (*err != REG_NOERROR, 0)) 2225: return NULL; 2226: for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt) 2227: { 2228: int cur_node = state->nodes.elems[node_cnt]; 2229: if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx)) 2230: { 2231: *err = re_node_set_merge (&next_nodes, 2232: dfa->eclosures + dfa->nexts[cur_node]); 2233: if (BE (*err != REG_NOERROR, 0)) 2234: { 2235: re_node_set_free (&next_nodes); 2236: return NULL; 2237: } 2238: } 2239: } 2240: if (fl_search) 2241: { 2242: #ifdef RE_ENABLE_I18N 2243: int not_initial = 0; 2244: if (MB_CUR_MAX > 1) 2245: for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt) 2246: if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER) 2247: { 2248: not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial; 2249: break; 2250: } 2251: if (!not_initial) 2252: #endif 2253: { 2254: *err = re_node_set_merge (&next_nodes, 2255: dfa->init_state->entrance_nodes); 2256: if (BE (*err != REG_NOERROR, 0)) 2257: { 2258: re_node_set_free (&next_nodes); 2259: return NULL; 2260: } 2261: } 2262: } 2263: context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags, 2264: preg->newline_anchor); 2265: next_state = re_acquire_state_context (err, dfa, &next_nodes, context); 2266: /* We don't need to check errors here, since the return value of 2267: this function is next_state and ERR is already set. */ 2268: 2269: re_node_set_free (&next_nodes); 2270: re_string_skip_bytes (mctx->input, 1); 2271: return next_state; 2272: } 2273: 2274: #ifdef RE_ENABLE_I18N 2275: static reg_errcode_t 2276: transit_state_mb (preg, pstate, mctx) 2277: const regex_t *preg; 2278: re_dfastate_t *pstate; 2279: re_match_context_t *mctx; 2280: { 2281: reg_errcode_t err; 2282: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2283: int i; 2284: 2285: for (i = 0; i < pstate->nodes.nelem; ++i) 2286: { 2287: re_node_set dest_nodes, *new_nodes; 2288: int cur_node_idx = pstate->nodes.elems[i]; 2289: int naccepted = 0, dest_idx; 2290: unsigned int context; 2291: re_dfastate_t *dest_state; 2292: 2293: if (dfa->nodes[cur_node_idx].constraint) 2294: { 2295: context = re_string_context_at (mctx->input, 2296: re_string_cur_idx (mctx->input), 2297: mctx->eflags, preg->newline_anchor); 2298: if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint, 2299: context)) 2300: continue; 2301: } 2302: 2303: /* How many bytes the node can accepts? */ 2304: if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type)) 2305: naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input, 2306: re_string_cur_idx (mctx->input)); 2307: if (naccepted == 0) 2308: continue; 2309: 2310: /* The node can accepts `naccepted' bytes. */ 2311: dest_idx = re_string_cur_idx (mctx->input) + naccepted; 2312: mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted 2313: : mctx->max_mb_elem_len); 2314: err = clean_state_log_if_need (mctx, dest_idx); 2315: if (BE (err != REG_NOERROR, 0)) 2316: return err; 2317: #ifdef DEBUG 2318: assert (dfa->nexts[cur_node_idx] != -1); 2319: #endif 2320: /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE, 2321: then we use pstate->nodes.elems[i] instead. */ 2322: new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]]; 2323: 2324: dest_state = mctx->state_log[dest_idx]; 2325: if (dest_state == NULL) 2326: dest_nodes = *new_nodes; 2327: else 2328: { 2329: err = re_node_set_init_union (&dest_nodes, 2330: dest_state->entrance_nodes, new_nodes); 2331: if (BE (err != REG_NOERROR, 0)) 2332: return err; 2333: } 2334: context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags, 2335: preg->newline_anchor); 2336: mctx->state_log[dest_idx] 2337: = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2338: if (dest_state != NULL) 2339: re_node_set_free (&dest_nodes); 2340: if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0)) 2341: return err; 2342: } 2343: return REG_NOERROR; 2344: } 2345: #endif /* RE_ENABLE_I18N */ 2346: 2347: static reg_errcode_t 2348: transit_state_bkref (preg, nodes, mctx) 2349: const regex_t *preg; 2350: re_node_set *nodes; 2351: re_match_context_t *mctx; 2352: { 2353: reg_errcode_t err; 2354: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2355: int i; 2356: int cur_str_idx = re_string_cur_idx (mctx->input); 2357: 2358: for (i = 0; i < nodes->nelem; ++i) 2359: { 2360: int dest_str_idx, prev_nelem, bkc_idx; 2361: int node_idx = nodes->elems[i]; 2362: unsigned int context; 2363: re_token_t *node = dfa->nodes + node_idx; 2364: re_node_set *new_dest_nodes; 2365: 2366: /* Check whether `node' is a backreference or not. */ 2367: if (node->type != OP_BACK_REF) 2368: continue; 2369: 2370: if (node->constraint) 2371: { 2372: context = re_string_context_at (mctx->input, cur_str_idx, 2373: mctx->eflags, preg->newline_anchor); 2374: if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 2375: continue; 2376: } 2377: 2378: /* `node' is a backreference. 2379: Check the substring which the substring matched. */ 2380: bkc_idx = mctx->nbkref_ents; 2381: err = get_subexp (preg, mctx, node_idx, cur_str_idx); 2382: if (BE (err != REG_NOERROR, 0)) 2383: goto free_return; 2384: 2385: /* And add the epsilon closures (which is `new_dest_nodes') of 2386: the backreference to appropriate state_log. */ 2387: #ifdef DEBUG 2388: assert (dfa->nexts[node_idx] != -1); 2389: #endif 2390: for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx) 2391: { 2392: int subexp_len; 2393: re_dfastate_t *dest_state; 2394: struct re_backref_cache_entry *bkref_ent; 2395: bkref_ent = mctx->bkref_ents + bkc_idx; 2396: if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx) 2397: continue; 2398: subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from; 2399: new_dest_nodes = (subexp_len == 0 2400: ? dfa->eclosures + dfa->edests[node_idx].elems[0] 2401: : dfa->eclosures + dfa->nexts[node_idx]); 2402: dest_str_idx = (cur_str_idx + bkref_ent->subexp_to 2403: - bkref_ent->subexp_from); 2404: context = re_string_context_at (mctx->input, dest_str_idx - 1, 2405: mctx->eflags, preg->newline_anchor); 2406: dest_state = mctx->state_log[dest_str_idx]; 2407: prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0 2408: : mctx->state_log[cur_str_idx]->nodes.nelem); 2409: /* Add `new_dest_node' to state_log. */ 2410: if (dest_state == NULL) 2411: { 2412: mctx->state_log[dest_str_idx] 2413: = re_acquire_state_context (&err, dfa, new_dest_nodes, 2414: context); 2415: if (BE (mctx->state_log[dest_str_idx] == NULL 2416: && err != REG_NOERROR, 0)) 2417: goto free_return; 2418: } 2419: else 2420: { 2421: re_node_set dest_nodes; 2422: err = re_node_set_init_union (&dest_nodes, 2423: dest_state->entrance_nodes, 2424: new_dest_nodes); 2425: if (BE (err != REG_NOERROR, 0)) 2426: { 2427: re_node_set_free (&dest_nodes); 2428: goto free_return; 2429: } 2430: mctx->state_log[dest_str_idx] 2431: = re_acquire_state_context (&err, dfa, &dest_nodes, context); 2432: re_node_set_free (&dest_nodes); 2433: if (BE (mctx->state_log[dest_str_idx] == NULL 2434: && err != REG_NOERROR, 0)) 2435: goto free_return; 2436: } 2437: /* We need to check recursively if the backreference can epsilon 2438: transit. */ 2439: if (subexp_len == 0 2440: && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem) 2441: { 2442: err = check_subexp_matching_top (dfa, mctx, new_dest_nodes, 2443: cur_str_idx); 2444: if (BE (err != REG_NOERROR, 0)) 2445: goto free_return; 2446: err = transit_state_bkref (preg, new_dest_nodes, mctx); 2447: if (BE (err != REG_NOERROR, 0)) 2448: goto free_return; 2449: } 2450: } 2451: } 2452: err = REG_NOERROR; 2453: free_return: 2454: return err; 2455: } 2456: 2457: /* Enumerate all the candidates which the backreference BKREF_NODE can match 2458: at BKREF_STR_IDX, and register them by match_ctx_add_entry(). 2459: Note that we might collect inappropriate candidates here. 2460: However, the cost of checking them strictly here is too high, then we 2461: delay these checking for prune_impossible_nodes(). */ 2462: 2463: static reg_errcode_t 2464: get_subexp (preg, mctx, bkref_node, bkref_str_idx) 2465: const regex_t *preg; 2466: re_match_context_t *mctx; 2467: int bkref_node, bkref_str_idx; 2468: { 2469: int subexp_num, sub_top_idx; 2470: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2471: char *buf = (char *) re_string_get_buffer (mctx->input); 2472: /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */ 2473: int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx); 2474: for (; cache_idx < mctx->nbkref_ents; ++cache_idx) 2475: { 2476: struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx; 2477: if (entry->str_idx > bkref_str_idx) 2478: break; 2479: if (entry->node == bkref_node) 2480: return REG_NOERROR; /* We already checked it. */ 2481: } 2482: subexp_num = dfa->nodes[bkref_node].opr.idx - 1; 2483: 2484: /* For each sub expression */ 2485: for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx) 2486: { 2487: reg_errcode_t err; 2488: re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx]; 2489: re_sub_match_last_t *sub_last; 2490: int sub_last_idx, sl_str; 2491: char *bkref_str; 2492: 2493: if (dfa->nodes[sub_top->node].opr.idx != subexp_num) 2494: continue; /* It isn't related. */ 2495: 2496: sl_str = sub_top->str_idx; 2497: bkref_str = buf + bkref_str_idx; 2498: /* At first, check the last node of sub expressions we already 2499: evaluated. */ 2500: for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx) 2501: { 2502: int sl_str_diff; 2503: sub_last = sub_top->lasts[sub_last_idx]; 2504: sl_str_diff = sub_last->str_idx - sl_str; 2505: /* The matched string by the sub expression match with the substring 2506: at the back reference? */ 2507: if (sl_str_diff > 0 2508: && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0) 2509: break; /* We don't need to search this sub expression any more. */ 2510: bkref_str += sl_str_diff; 2511: sl_str += sl_str_diff; 2512: err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, 2513: bkref_str_idx); 2514: if (err == REG_NOMATCH) 2515: continue; 2516: if (BE (err != REG_NOERROR, 0)) 2517: return err; 2518: } 2519: if (sub_last_idx < sub_top->nlasts) 2520: continue; 2521: if (sub_last_idx > 0) 2522: ++sl_str; 2523: /* Then, search for the other last nodes of the sub expression. */ 2524: for (; sl_str <= bkref_str_idx; ++sl_str) 2525: { 2526: int cls_node, sl_str_off; 2527: re_node_set *nodes; 2528: sl_str_off = sl_str - sub_top->str_idx; 2529: /* The matched string by the sub expression match with the substring 2530: at the back reference? */ 2531: if (sl_str_off > 0 2532: && memcmp (bkref_str++, buf + sl_str - 1, 1) != 0) 2533: break; /* We don't need to search this sub expression any more. */ 2534: if (mctx->state_log[sl_str] == NULL) 2535: continue; 2536: /* Does this state have a ')' of the sub expression? */ 2537: nodes = &mctx->state_log[sl_str]->nodes; 2538: cls_node = find_subexp_node (dfa, nodes, subexp_num, 0); 2539: if (cls_node == -1) 2540: continue; /* No. */ 2541: if (sub_top->path == NULL) 2542: { 2543: sub_top->path = calloc (sizeof (state_array_t), 2544: sl_str - sub_top->str_idx + 1); 2545: if (sub_top->path == NULL) 2546: return REG_ESPACE; 2547: } 2548: /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node 2549: in the current context? */ 2550: err = check_arrival (preg, mctx, sub_top->path, sub_top->node, 2551: sub_top->str_idx, cls_node, sl_str, 0); 2552: if (err == REG_NOMATCH) 2553: continue; 2554: if (BE (err != REG_NOERROR, 0)) 2555: return err; 2556: sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str); 2557: if (BE (sub_last == NULL, 0)) 2558: return REG_ESPACE; 2559: err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, 2560: bkref_str_idx); 2561: if (err == REG_NOMATCH) 2562: continue; 2563: } 2564: } 2565: return REG_NOERROR; 2566: } 2567: 2568: /* Helper functions for get_subexp(). */ 2569: 2570: /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR. 2571: If it can arrive, register the sub expression expressed with SUB_TOP 2572: and SUB_LAST. */ 2573: 2574: static reg_errcode_t 2575: get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str) 2576: const regex_t *preg; 2577: re_match_context_t *mctx; 2578: re_sub_match_top_t *sub_top; 2579: re_sub_match_last_t *sub_last; 2580: int bkref_node, bkref_str; 2581: { 2582: reg_errcode_t err; 2583: int to_idx; 2584: /* Can the subexpression arrive the back reference? */ 2585: err = check_arrival (preg, mctx, &sub_last->path, sub_last->node, 2586: sub_last->str_idx, bkref_node, bkref_str, 1); 2587: if (err != REG_NOERROR) 2588: return err; 2589: err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx, 2590: sub_last->str_idx); 2591: if (BE (err != REG_NOERROR, 0)) 2592: return err; 2593: to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx; 2594: clean_state_log_if_need (mctx, to_idx); 2595: return REG_NOERROR; 2596: } 2597: 2598: /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX. 2599: Search '(' if FL_OPEN, or search ')' otherwise. 2600: TODO: This function isn't efficient... 2601: Because there might be more than one nodes whose types are 2602: OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all 2603: nodes. 2604: E.g. RE: (a){2} */ 2605: 2606: static int 2607: find_subexp_node (dfa, nodes, subexp_idx, fl_open) 2608: re_dfa_t *dfa; 2609: re_node_set *nodes; 2610: int subexp_idx, fl_open; 2611: { 2612: int cls_idx; 2613: for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx) 2614: { 2615: int cls_node = nodes->elems[cls_idx]; 2616: re_token_t *node = dfa->nodes + cls_node; 2617: if (((fl_open && node->type == OP_OPEN_SUBEXP) 2618: || (!fl_open && node->type == OP_CLOSE_SUBEXP)) 2619: && node->opr.idx == subexp_idx) 2620: return cls_node; 2621: } 2622: return -1; 2623: } 2624: 2625: /* Check whether the node TOP_NODE at TOP_STR can arrive to the node 2626: LAST_NODE at LAST_STR. We record the path onto PATH since it will be 2627: heavily reused. 2628: Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */ 2629: 2630: static reg_errcode_t 2631: check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str, 2632: fl_open) 2633: const regex_t *preg; 2634: re_match_context_t *mctx; 2635: state_array_t *path; 2636: int top_node, top_str, last_node, last_str, fl_open; 2637: { 2638: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2639: reg_errcode_t err; 2640: int subexp_num, backup_cur_idx, str_idx, null_cnt; 2641: re_dfastate_t *cur_state = NULL; 2642: re_node_set *cur_nodes, next_nodes; 2643: re_dfastate_t **backup_state_log; 2644: unsigned int context; 2645: 2646: subexp_num = dfa->nodes[top_node].opr.idx; 2647: /* Extend the buffer if we need. */ 2648: if (path->alloc < last_str + mctx->max_mb_elem_len + 1) 2649: { 2650: re_dfastate_t **new_array; 2651: int old_alloc = path->alloc; 2652: path->alloc += last_str + mctx->max_mb_elem_len + 1; 2653: new_array = re_realloc (path->array, re_dfastate_t *, path->alloc); 2654: if (new_array == NULL) 2655: return REG_ESPACE; 2656: path->array = new_array; 2657: memset (new_array + old_alloc, '\0', 2658: sizeof (re_dfastate_t *) * (path->alloc - old_alloc)); 2659: } 2660: 2661: str_idx = path->next_idx == 0 ? top_str : path->next_idx; 2662: 2663: /* Temporary modify MCTX. */ 2664: backup_state_log = mctx->state_log; 2665: backup_cur_idx = mctx->input->cur_idx; 2666: mctx->state_log = path->array; 2667: mctx->input->cur_idx = str_idx; 2668: 2669: /* Setup initial node set. */ 2670: context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags, 2671: preg->newline_anchor); 2672: if (str_idx == top_str) 2673: { 2674: err = re_node_set_init_1 (&next_nodes, top_node); 2675: if (BE (err != REG_NOERROR, 0)) 2676: return err; 2677: err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, fl_open); 2678: if (BE (err != REG_NOERROR, 0)) 2679: { 2680: re_node_set_free (&next_nodes); 2681: return err; 2682: } 2683: } 2684: else 2685: { 2686: cur_state = mctx->state_log[str_idx]; 2687: if (cur_state && cur_state->has_backref) 2688: { 2689: err = re_node_set_init_copy (&next_nodes, &cur_state->nodes); 2690: if (BE ( err != REG_NOERROR, 0)) 2691: return err; 2692: } 2693: else 2694: re_node_set_init_empty (&next_nodes); 2695: } 2696: if (str_idx == top_str || (cur_state && cur_state->has_backref)) 2697: { 2698: if (next_nodes.nelem) 2699: { 2700: err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str, 2701: subexp_num, fl_open); 2702: if (BE ( err != REG_NOERROR, 0)) 2703: { 2704: re_node_set_free (&next_nodes); 2705: return err; 2706: } 2707: } 2708: cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2709: if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2710: { 2711: re_node_set_free (&next_nodes); 2712: return err; 2713: } 2714: mctx->state_log[str_idx] = cur_state; 2715: } 2716: 2717: for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;) 2718: { 2719: re_node_set_empty (&next_nodes); 2720: if (mctx->state_log[str_idx + 1]) 2721: { 2722: err = re_node_set_merge (&next_nodes, 2723: &mctx->state_log[str_idx + 1]->nodes); 2724: if (BE (err != REG_NOERROR, 0)) 2725: { 2726: re_node_set_free (&next_nodes); 2727: return err; 2728: } 2729: } 2730: if (cur_state) 2731: { 2732: err = check_arrival_add_next_nodes(preg, dfa, mctx, str_idx, 2733: &cur_state->nodes, &next_nodes); 2734: if (BE (err != REG_NOERROR, 0)) 2735: { 2736: re_node_set_free (&next_nodes); 2737: return err; 2738: } 2739: } 2740: ++str_idx; 2741: if (next_nodes.nelem) 2742: { 2743: err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, 2744: fl_open); 2745: if (BE (err != REG_NOERROR, 0)) 2746: { 2747: re_node_set_free (&next_nodes); 2748: return err; 2749: } 2750: err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str, 2751: subexp_num, fl_open); 2752: if (BE ( err != REG_NOERROR, 0)) 2753: { 2754: re_node_set_free (&next_nodes); 2755: return err; 2756: } 2757: } 2758: context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags, 2759: preg->newline_anchor); 2760: cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context); 2761: if (BE (cur_state == NULL && err != REG_NOERROR, 0)) 2762: { 2763: re_node_set_free (&next_nodes); 2764: return err; 2765: } 2766: mctx->state_log[str_idx] = cur_state; 2767: null_cnt = cur_state == NULL ? null_cnt + 1 : 0; 2768: } 2769: re_node_set_free (&next_nodes); 2770: cur_nodes = (mctx->state_log[last_str] == NULL ? NULL 2771: : &mctx->state_log[last_str]->nodes); 2772: path->next_idx = str_idx; 2773: 2774: /* Fix MCTX. */ 2775: mctx->state_log = backup_state_log; 2776: mctx->input->cur_idx = backup_cur_idx; 2777: 2778: if (cur_nodes == NULL) 2779: return REG_NOMATCH; 2780: /* Then check the current node set has the node LAST_NODE. */ 2781: return (re_node_set_contains (cur_nodes, last_node) 2782: || re_node_set_contains (cur_nodes, last_node) ? REG_NOERROR 2783: : REG_NOMATCH); 2784: } 2785: 2786: /* Helper functions for check_arrival. */ 2787: 2788: /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them 2789: to NEXT_NODES. 2790: TODO: This function is similar to the functions transit_state*(), 2791: however this function has many additional works. 2792: Can't we unify them? */ 2793: 2794: static reg_errcode_t 2795: check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes) 2796: const regex_t *preg; 2797: re_dfa_t *dfa; 2798: re_match_context_t *mctx; 2799: int str_idx; 2800: re_node_set *cur_nodes, *next_nodes; 2801: { 2802: int cur_idx; 2803: reg_errcode_t err; 2804: re_node_set union_set; 2805: re_node_set_init_empty (&union_set); 2806: for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx) 2807: { 2808: int naccepted = 0; 2809: int cur_node = cur_nodes->elems[cur_idx]; 2810: re_token_type_t type = dfa->nodes[cur_node].type; 2811: if (IS_EPSILON_NODE(type)) 2812: continue; 2813: #ifdef RE_ENABLE_I18N 2814: /* If the node may accept `multi byte'. */ 2815: if (ACCEPT_MB_NODE (type)) 2816: { 2817: naccepted = check_node_accept_bytes (preg, cur_node, mctx->input, 2818: str_idx); 2819: if (naccepted > 1) 2820: { 2821: re_dfastate_t *dest_state; 2822: int next_node = dfa->nexts[cur_node]; 2823: int next_idx = str_idx + naccepted; 2824: dest_state = mctx->state_log[next_idx]; 2825: re_node_set_empty (&union_set); 2826: if (dest_state) 2827: { 2828: err = re_node_set_merge (&union_set, &dest_state->nodes); 2829: if (BE (err != REG_NOERROR, 0)) 2830: { 2831: re_node_set_free (&union_set); 2832: return err; 2833: } 2834: err = re_node_set_insert (&union_set, next_node); 2835: if (BE (err < 0, 0)) 2836: { 2837: re_node_set_free (&union_set); 2838: return REG_ESPACE; 2839: } 2840: } 2841: else 2842: { 2843: err = re_node_set_insert (&union_set, next_node); 2844: if (BE (err < 0, 0)) 2845: { 2846: re_node_set_free (&union_set); 2847: return REG_ESPACE; 2848: } 2849: } 2850: mctx->state_log[next_idx] = re_acquire_state (&err, dfa, 2851: &union_set); 2852: if (BE (mctx->state_log[next_idx] == NULL 2853: && err != REG_NOERROR, 0)) 2854: { 2855: re_node_set_free (&union_set); 2856: return err; 2857: } 2858: } 2859: } 2860: #endif /* RE_ENABLE_I18N */ 2861: if (naccepted 2862: || check_node_accept (preg, dfa->nodes + cur_node, mctx, 2863: str_idx)) 2864: { 2865: err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]); 2866: if (BE (err < 0, 0)) 2867: { 2868: re_node_set_free (&union_set); 2869: return REG_ESPACE; 2870: } 2871: } 2872: } 2873: re_node_set_free (&union_set); 2874: return REG_NOERROR; 2875: } 2876: 2877: /* For all the nodes in CUR_NODES, add the epsilon closures of them to 2878: CUR_NODES, however exclude the nodes which are: 2879: - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN. 2880: - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN. 2881: */ 2882: 2883: static reg_errcode_t 2884: check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, fl_open) 2885: re_dfa_t *dfa; 2886: re_node_set *cur_nodes; 2887: int ex_subexp, fl_open; 2888: { 2889: reg_errcode_t err; 2890: int idx, outside_node; 2891: re_node_set new_nodes; 2892: #ifdef DEBUG 2893: assert (cur_nodes->nelem); 2894: #endif 2895: err = re_node_set_alloc (&new_nodes, cur_nodes->nelem); 2896: if (BE (err != REG_NOERROR, 0)) 2897: return err; 2898: /* Create a new node set NEW_NODES with the nodes which are epsilon 2899: closures of the node in CUR_NODES. */ 2900: 2901: for (idx = 0; idx < cur_nodes->nelem; ++idx) 2902: { 2903: int cur_node = cur_nodes->elems[idx]; 2904: re_node_set *eclosure = dfa->eclosures + cur_node; 2905: outside_node = find_subexp_node (dfa, eclosure, ex_subexp, fl_open); 2906: if (outside_node == -1) 2907: { 2908: /* There are no problematic nodes, just merge them. */ 2909: err = re_node_set_merge (&new_nodes, eclosure); 2910: if (BE (err != REG_NOERROR, 0)) 2911: { 2912: re_node_set_free (&new_nodes); 2913: return err; 2914: } 2915: } 2916: else 2917: { 2918: /* There are problematic nodes, re-calculate incrementally. */ 2919: err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node, 2920: ex_subexp, fl_open); 2921: if (BE (err != REG_NOERROR, 0)) 2922: { 2923: re_node_set_free (&new_nodes); 2924: return err; 2925: } 2926: } 2927: } 2928: re_node_set_free (cur_nodes); 2929: *cur_nodes = new_nodes; 2930: return REG_NOERROR; 2931: } 2932: 2933: /* Helper function for check_arrival_expand_ecl. 2934: Check incrementally the epsilon closure of TARGET, and if it isn't 2935: problematic append it to DST_NODES. */ 2936: 2937: static reg_errcode_t 2938: check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, fl_open) 2939: re_dfa_t *dfa; 2940: int target, ex_subexp, fl_open; 2941: re_node_set *dst_nodes; 2942: { 2943: int cur_node, type; 2944: for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);) 2945: { 2946: int err; 2947: type = dfa->nodes[cur_node].type; 2948: 2949: if (((type == OP_OPEN_SUBEXP && fl_open) 2950: || (type == OP_CLOSE_SUBEXP && !fl_open)) 2951: && dfa->nodes[cur_node].opr.idx == ex_subexp) 2952: { 2953: if (!fl_open) 2954: { 2955: err = re_node_set_insert (dst_nodes, cur_node); 2956: if (BE (err == -1, 0)) 2957: return REG_ESPACE; 2958: } 2959: break; 2960: } 2961: err = re_node_set_insert (dst_nodes, cur_node); 2962: if (BE (err == -1, 0)) 2963: return REG_ESPACE; 2964: if (dfa->edests[cur_node].nelem == 0) 2965: break; 2966: if (dfa->edests[cur_node].nelem == 2) 2967: { 2968: err = check_arrival_expand_ecl_sub (dfa, dst_nodes, 2969: dfa->edests[cur_node].elems[1], 2970: ex_subexp, fl_open); 2971: if (BE (err != REG_NOERROR, 0)) 2972: return err; 2973: } 2974: cur_node = dfa->edests[cur_node].elems[0]; 2975: } 2976: return REG_NOERROR; 2977: } 2978: 2979: 2980: /* For all the back references in the current state, calculate the 2981: destination of the back references by the appropriate entry 2982: in MCTX->BKREF_ENTS. */ 2983: 2984: static reg_errcode_t 2985: expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num, 2986: fl_open) 2987: const regex_t *preg; 2988: re_match_context_t *mctx; 2989: int cur_str, last_str, subexp_num, fl_open; 2990: re_node_set *cur_nodes; 2991: { 2992: reg_errcode_t err; 2993: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2994: int cache_idx, cache_idx_start; 2995: /* The current state. */ 2996: 2997: cache_idx_start = search_cur_bkref_entry (mctx, cur_str); 2998: for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx) 2999: { 3000: int to_idx, next_node; 3001: struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx; 3002: if (ent->str_idx > cur_str) 3003: break; 3004: /* Is this entry ENT is appropriate? */ 3005: if (!re_node_set_contains (cur_nodes, ent->node)) 3006: continue; /* No. */ 3007: 3008: to_idx = cur_str + ent->subexp_to - ent->subexp_from; 3009: /* Calculate the destination of the back reference, and append it 3010: to MCTX->STATE_LOG. */ 3011: if (to_idx == cur_str) 3012: { 3013: /* The backreference did epsilon transit, we must re-check all the 3014: node in the current state. */ 3015: re_node_set new_dests; 3016: reg_errcode_t err2, err3; 3017: next_node = dfa->edests[ent->node].elems[0]; 3018: if (re_node_set_contains (cur_nodes, next_node)) 3019: continue; 3020: err = re_node_set_init_1 (&new_dests, next_node); 3021: err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, 3022: fl_open); 3023: err3 = re_node_set_merge (cur_nodes, &new_dests); 3024: re_node_set_free (&new_dests); 3025: if (BE (err != REG_NOERROR || err2 != REG_NOERROR 3026: || err3 != REG_NOERROR, 0)) 3027: { 3028: err = (err != REG_NOERROR ? err 3029: : (err2 != REG_NOERROR ? err2 : err3)); 3030: return err; 3031: } 3032: /* TODO: It is still inefficient... */ 3033: cache_idx = cache_idx_start - 1; 3034: continue; 3035: } 3036: else 3037: { 3038: re_node_set union_set; 3039: next_node = dfa->nexts[ent->node]; 3040: if (mctx->state_log[to_idx]) 3041: { 3042: int ret; 3043: if (re_node_set_contains (&mctx->state_log[to_idx]->nodes, 3044: next_node)) 3045: continue; 3046: err = re_node_set_init_copy (&union_set, 3047: &mctx->state_log[to_idx]->nodes); 3048: ret = re_node_set_insert (&union_set, next_node); 3049: if (BE (err != REG_NOERROR || ret < 0, 0)) 3050: { 3051: re_node_set_free (&union_set); 3052: err = err != REG_NOERROR ? err : REG_ESPACE; 3053: return err; 3054: } 3055: } 3056: else 3057: { 3058: err = re_node_set_init_1 (&union_set, next_node); 3059: if (BE (err != REG_NOERROR, 0)) 3060: return err; 3061: } 3062: mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set); 3063: re_node_set_free (&union_set); 3064: if (BE (mctx->state_log[to_idx] == NULL 3065: && err != REG_NOERROR, 0)) 3066: return err; 3067: } 3068: } 3069: return REG_NOERROR; 3070: } 3071: 3072: /* Build transition table for the state. 3073: Return the new table if succeeded, otherwise return NULL. */ 3074: 3075: static re_dfastate_t ** 3076: build_trtable (preg, state, fl_search) 3077: const regex_t *preg; 3078: const re_dfastate_t *state; 3079: int fl_search; 3080: { 3081: reg_errcode_t err; 3082: re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3083: int i, j, k, ch; 3084: int dests_node_malloced = 0, dest_states_malloced = 0; 3085: int ndests; /* Number of the destination states from `state'. */ 3086: re_dfastate_t **trtable; 3087: re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl; 3088: re_node_set follows, *dests_node; 3089: bitset *dests_ch; 3090: bitset acceptable; 3091: 3092: /* We build DFA states which corresponds to the destination nodes 3093: from `state'. `dests_node[i]' represents the nodes which i-th 3094: destination state contains, and `dests_ch[i]' represents the 3095: characters which i-th destination state accepts. */ 3096: #ifdef _LIBC 3097: if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX)) 3098: dests_node = (re_node_set *) 3099: alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3100: else 3101: #endif 3102: { 3103: dests_node = (re_node_set *) 3104: malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); 3105: if (BE (dests_node == NULL, 0)) 3106: return NULL; 3107: dests_node_malloced = 1; 3108: } 3109: dests_ch = (bitset *) (dests_node + SBC_MAX); 3110: 3111: /* Initialize transiton table. */ 3112: trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); 3113: if (BE (trtable == NULL, 0)) 3114: { 3115: if (dests_node_malloced) 3116: free (dests_node); 3117: return NULL; 3118: } 3119: 3120: /* At first, group all nodes belonging to `state' into several 3121: destinations. */ 3122: ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch); 3123: if (BE (ndests <= 0, 0)) 3124: { 3125: if (dests_node_malloced) 3126: free (dests_node); 3127: /* Return NULL in case of an error, trtable otherwise. */ 3128: if (ndests == 0) 3129: return trtable; 3130: free (trtable); 3131: return NULL; 3132: } 3133: 3134: err = re_node_set_alloc (&follows, ndests + 1); 3135: if (BE (err != REG_NOERROR, 0)) 3136: goto out_free; 3137: 3138: #ifdef _LIBC 3139: if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX 3140: + ndests * 3 * sizeof (re_dfastate_t *))) 3141: dest_states = (re_dfastate_t **) 3142: alloca (ndests * 3 * sizeof (re_dfastate_t *)); 3143: else 3144: #endif 3145: { 3146: dest_states = (re_dfastate_t **) 3147: malloc (ndests * 3 * sizeof (re_dfastate_t *)); 3148: if (BE (dest_states == NULL, 0)) 3149: { 3150: out_free: 3151: if (dest_states_malloced) 3152: free (dest_states); 3153: re_node_set_free (&follows); 3154: for (i = 0; i < ndests; ++i) 3155: re_node_set_free (dests_node + i); 3156: free (trtable); 3157: if (dests_node_malloced) 3158: free (dests_node); 3159: return NULL; 3160: } 3161: dest_states_malloced = 1; 3162: } 3163: dest_states_word = dest_states + ndests; 3164: dest_states_nl = dest_states_word + ndests; 3165: bitset_empty (acceptable); 3166: 3167: /* Then build the states for all destinations. */ 3168: for (i = 0; i < ndests; ++i) 3169: { 3170: int next_node; 3171: re_node_set_empty (&follows); 3172: /* Merge the follows of this destination states. */ 3173: for (j = 0; j < dests_node[i].nelem; ++j) 3174: { 3175: next_node = dfa->nexts[dests_node[i].elems[j]]; 3176: if (next_node != -1) 3177: { 3178: err = re_node_set_merge (&follows, dfa->eclosures + next_node); 3179: if (BE (err != REG_NOERROR, 0)) 3180: goto out_free; 3181: } 3182: } 3183: /* If search flag is set, merge the initial state. */ 3184: if (fl_search) 3185: { 3186: #ifdef RE_ENABLE_I18N 3187: int not_initial = 0; 3188: for (j = 0; j < follows.nelem; ++j) 3189: if (dfa->nodes[follows.elems[j]].type == CHARACTER) 3190: { 3191: not_initial = dfa->nodes[follows.elems[j]].mb_partial; 3192: break; 3193: } 3194: if (!not_initial) 3195: #endif 3196: { 3197: err = re_node_set_merge (&follows, 3198: dfa->init_state->entrance_nodes); 3199: if (BE (err != REG_NOERROR, 0)) 3200: goto out_free; 3201: } 3202: } 3203: dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0); 3204: if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0)) 3205: goto out_free; 3206: /* If the new state has context constraint, 3207: build appropriate states for these contexts. */ 3208: if (dest_states[i]->has_constraint) 3209: { 3210: dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows, 3211: CONTEXT_WORD); 3212: if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0)) 3213: goto out_free; 3214: dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, 3215: CONTEXT_NEWLINE); 3216: if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0)) 3217: goto out_free; 3218: } 3219: else 3220: { 3221: dest_states_word[i] = dest_states[i]; 3222: dest_states_nl[i] = dest_states[i]; 3223: } 3224: bitset_merge (acceptable, dests_ch[i]); 3225: } 3226: 3227: /* Update the transition table. */ 3228: /* For all characters ch...: */ 3229: for (i = 0, ch = 0; i < BITSET_UINTS; ++i) 3230: for (j = 0; j < UINT_BITS; ++j, ++ch) 3231: if ((acceptable[i] >> j) & 1) 3232: { 3233: /* The current state accepts the character ch. */ 3234: if (IS_WORD_CHAR (ch)) 3235: { 3236: for (k = 0; k < ndests; ++k) 3237: if ((dests_ch[k][i] >> j) & 1) 3238: { 3239: /* k-th destination accepts the word character ch. */ 3240: trtable[ch] = dest_states_word[k]; 3241: /* There must be only one destination which accepts 3242: character ch. See group_nodes_into_DFAstates. */ 3243: break; 3244: } 3245: } 3246: else /* not WORD_CHAR */ 3247: { 3248: for (k = 0; k < ndests; ++k) 3249: if ((dests_ch[k][i] >> j) & 1) 3250: { 3251: /* k-th destination accepts the non-word character ch. */ 3252: trtable[ch] = dest_states[k]; 3253: /* There must be only one destination which accepts 3254: character ch. See group_nodes_into_DFAstates. */ 3255: break; 3256: } 3257: } 3258: } 3259: /* new line */ 3260: if (bitset_contain (acceptable, NEWLINE_CHAR)) 3261: { 3262: /* The current state accepts newline character. */ 3263: for (k = 0; k < ndests; ++k) 3264: if (bitset_contain (dests_ch[k], NEWLINE_CHAR)) 3265: { 3266: /* k-th destination accepts newline character. */ 3267: trtable[NEWLINE_CHAR] = dest_states_nl[k]; 3268: /* There must be only one destination which accepts 3269: newline. See group_nodes_into_DFAstates. */ 3270: break; 3271: } 3272: } 3273: 3274: if (dest_states_malloced) 3275: free (dest_states); 3276: 3277: re_node_set_free (&follows); 3278: for (i = 0; i < ndests; ++i) 3279: re_node_set_free (dests_node + i); 3280: 3281: if (dests_node_malloced) 3282: free (dests_node); 3283: 3284: return trtable; 3285: } 3286: 3287: /* Group all nodes belonging to STATE into several destinations. 3288: Then for all destinations, set the nodes belonging to the destination 3289: to DESTS_NODE[i] and set the characters accepted by the destination 3290: to DEST_CH[i]. This function return the number of destinations. */ 3291: 3292: static int 3293: group_nodes_into_DFAstates (preg, state, dests_node, dests_ch) 3294: const regex_t *preg; 3295: const re_dfastate_t *state; 3296: re_node_set *dests_node; 3297: bitset *dests_ch; 3298: { 3299: reg_errcode_t err; 3300: const re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3301: int i, j, k; 3302: int ndests; /* Number of the destinations from `state'. */ 3303: bitset accepts; /* Characters a node can accept. */ 3304: const re_node_set *cur_nodes = &state->nodes; 3305: bitset_empty (accepts); 3306: ndests = 0; 3307: 3308: /* For all the nodes belonging to `state', */ 3309: for (i = 0; i < cur_nodes->nelem; ++i) 3310: { 3311: re_token_t *node = &dfa->nodes[cur_nodes->elems[i]]; 3312: re_token_type_t type = node->type; 3313: unsigned int constraint = node->constraint; 3314: 3315: /* Enumerate all single byte character this node can accept. */ 3316: if (type == CHARACTER) 3317: bitset_set (accepts, node->opr.c); 3318: else if (type == SIMPLE_BRACKET) 3319: { 3320: bitset_merge (accepts, node->opr.sbcset); 3321: } 3322: else if (type == OP_PERIOD) 3323: { 3324: bitset_set_all (accepts); 3325: if (!(preg->syntax & RE_DOT_NEWLINE)) 3326: bitset_clear (accepts, '\n'); 3327: if (preg->syntax & RE_DOT_NOT_NULL) 3328: bitset_clear (accepts, '\0'); 3329: } 3330: else 3331: continue; 3332: 3333: /* Check the `accepts' and sift the characters which are not 3334: match it the context. */ 3335: if (constraint) 3336: { 3337: if (constraint & NEXT_WORD_CONSTRAINT) 3338: for (j = 0; j < BITSET_UINTS; ++j) 3339: accepts[j] &= dfa->word_char[j]; 3340: if (constraint & NEXT_NOTWORD_CONSTRAINT) 3341: for (j = 0; j < BITSET_UINTS; ++j) 3342: accepts[j] &= ~dfa->word_char[j]; 3343: if (constraint & NEXT_NEWLINE_CONSTRAINT) 3344: { 3345: int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR); 3346: bitset_empty (accepts); 3347: if (accepts_newline) 3348: bitset_set (accepts, NEWLINE_CHAR); 3349: else 3350: continue; 3351: } 3352: } 3353: 3354: /* Then divide `accepts' into DFA states, or create a new 3355: state. */ 3356: for (j = 0; j < ndests; ++j) 3357: { 3358: bitset intersec; /* Intersection sets, see below. */ 3359: bitset remains; 3360: /* Flags, see below. */ 3361: int has_intersec, not_subset, not_consumed; 3362: 3363: /* Optimization, skip if this state doesn't accept the character. */ 3364: if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c)) 3365: continue; 3366: 3367: /* Enumerate the intersection set of this state and `accepts'. */ 3368: has_intersec = 0; 3369: for (k = 0; k < BITSET_UINTS; ++k) 3370: has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k]; 3371: /* And skip if the intersection set is empty. */ 3372: if (!has_intersec) 3373: continue; 3374: 3375: /* Then check if this state is a subset of `accepts'. */ 3376: not_subset = not_consumed = 0; 3377: for (k = 0; k < BITSET_UINTS; ++k) 3378: { 3379: not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k]; 3380: not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k]; 3381: } 3382: 3383: /* If this state isn't a subset of `accepts', create a 3384: new group state, which has the `remains'. */ 3385: if (not_subset) 3386: { 3387: bitset_copy (dests_ch[ndests], remains); 3388: bitset_copy (dests_ch[j], intersec); 3389: err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]); 3390: if (BE (err != REG_NOERROR, 0)) 3391: goto error_return; 3392: ++ndests; 3393: } 3394: 3395: /* Put the position in the current group. */ 3396: err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]); 3397: if (BE (err < 0, 0)) 3398: goto error_return; 3399: 3400: /* If all characters are consumed, go to next node. */ 3401: if (!not_consumed) 3402: break; 3403: } 3404: /* Some characters remain, create a new group. */ 3405: if (j == ndests) 3406: { 3407: bitset_copy (dests_ch[ndests], accepts); 3408: err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]); 3409: if (BE (err != REG_NOERROR, 0)) 3410: goto error_return; 3411: ++ndests; 3412: bitset_empty (accepts); 3413: } 3414: } 3415: return ndests; 3416: error_return: 3417: for (j = 0; j < ndests; ++j) 3418: re_node_set_free (dests_node + j); 3419: return -1; 3420: } 3421: 3422: #ifdef RE_ENABLE_I18N 3423: /* Check how many bytes the node `dfa->nodes[node_idx]' accepts. 3424: Return the number of the bytes the node accepts. 3425: STR_IDX is the current index of the input string. 3426: 3427: This function handles the nodes which can accept one character, or 3428: one collating element like '.', '[a-z]', opposite to the other nodes 3429: can only accept one byte. */ 3430: 3431: static int 3432: check_node_accept_bytes (preg, node_idx, input, str_idx) 3433: const regex_t *preg; 3434: int node_idx, str_idx; 3435: const re_string_t *input; 3436: { 3437: const re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 3438: const re_token_t *node = dfa->nodes + node_idx; 3439: int elem_len = re_string_elem_size_at (input, str_idx); 3440: int char_len = re_string_char_size_at (input, str_idx); 3441: int i; 3442: # ifdef _LIBC 3443: int j; 3444: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3445: # endif /* _LIBC */ 3446: if (elem_len <= 1 && char_len <= 1) 3447: return 0; 3448: if (node->type == OP_PERIOD) 3449: { 3450: /* '.' accepts any one character except the following two cases. */ 3451: if ((!(preg->syntax & RE_DOT_NEWLINE) && 3452: re_string_byte_at (input, str_idx) == '\n') || 3453: ((preg->syntax & RE_DOT_NOT_NULL) && 3454: re_string_byte_at (input, str_idx) == '\0')) 3455: return 0; 3456: return char_len; 3457: } 3458: else if (node->type == COMPLEX_BRACKET) 3459: { 3460: const re_charset_t *cset = node->opr.mbcset; 3461: # ifdef _LIBC 3462: const unsigned char *pin = ((char *) re_string_get_buffer (input) 3463: + str_idx); 3464: # endif /* _LIBC */ 3465: int match_len = 0; 3466: wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) 3467: ? re_string_wchar_at (input, str_idx) : 0); 3468: 3469: /* match with multibyte character? */ 3470: for (i = 0; i < cset->nmbchars; ++i) 3471: if (wc == cset->mbchars[i]) 3472: { 3473: match_len = char_len; 3474: goto check_node_accept_bytes_match; 3475: } 3476: /* match with character_class? */ 3477: for (i = 0; i < cset->nchar_classes; ++i) 3478: { 3479: wctype_t wt = cset->char_classes[i]; 3480: if (__iswctype (wc, wt)) 3481: { 3482: match_len = char_len; 3483: goto check_node_accept_bytes_match; 3484: } 3485: } 3486: 3487: # ifdef _LIBC 3488: if (nrules != 0) 3489: { 3490: unsigned int in_collseq = 0; 3491: const int32_t *table, *indirect; 3492: const unsigned char *weights, *extra; 3493: const char *collseqwc; 3494: int32_t idx; 3495: /* This #include defines a local function! */ 3496: # include <locale/weight.h> 3497: 3498: /* match with collating_symbol? */ 3499: if (cset->ncoll_syms) 3500: extra = (const unsigned char *) 3501: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3502: for (i = 0; i < cset->ncoll_syms; ++i) 3503: { 3504: const unsigned char *coll_sym = extra + cset->coll_syms[i]; 3505: /* Compare the length of input collating element and 3506: the length of current collating element. */ 3507: if (*coll_sym != elem_len) 3508: continue; 3509: /* Compare each bytes. */ 3510: for (j = 0; j < *coll_sym; j++) 3511: if (pin[j] != coll_sym[1 + j]) 3512: break; 3513: if (j == *coll_sym) 3514: { 3515: /* Match if every bytes is equal. */ 3516: match_len = j; 3517: goto check_node_accept_bytes_match; 3518: } 3519: } 3520: 3521: if (cset->nranges) 3522: { 3523: if (elem_len <= char_len) 3524: { 3525: collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3526: in_collseq = collseq_table_lookup (collseqwc, wc); 3527: } 3528: else 3529: in_collseq = find_collation_sequence_value (pin, elem_len); 3530: } 3531: /* match with range expression? */ 3532: for (i = 0; i < cset->nranges; ++i) 3533: if (cset->range_starts[i] <= in_collseq 3534: && in_collseq <= cset->range_ends[i]) 3535: { 3536: match_len = elem_len; 3537: goto check_node_accept_bytes_match; 3538: } 3539: 3540: /* match with equivalence_class? */ 3541: if (cset->nequiv_classes) 3542: { 3543: const unsigned char *cp = pin; 3544: table = (const int32_t *) 3545: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3546: weights = (const unsigned char *) 3547: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3548: extra = (const unsigned char *) 3549: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3550: indirect = (const int32_t *) 3551: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3552: idx = findidx (&cp); 3553: if (idx > 0) 3554: for (i = 0; i < cset->nequiv_classes; ++i) 3555: { 3556: int32_t equiv_class_idx = cset->equiv_classes[i]; 3557: size_t weight_len = weights[idx]; 3558: if (weight_len == weights[equiv_class_idx]) 3559: { 3560: int cnt = 0; 3561: while (cnt <= weight_len 3562: && (weights[equiv_class_idx + 1 + cnt] 3563: == weights[idx + 1 + cnt])) 3564: ++cnt; 3565: if (cnt > weight_len) 3566: { 3567: match_len = elem_len; 3568: goto check_node_accept_bytes_match; 3569: } 3570: } 3571: } 3572: } 3573: } 3574: else 3575: # endif /* _LIBC */ 3576: { 3577: /* match with range expression? */ 3578: #if __GNUC__ >= 2 3579: wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'}; 3580: #else 3581: wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 3582: cmp_buf[2] = wc; 3583: #endif 3584: for (i = 0; i < cset->nranges; ++i) 3585: { 3586: cmp_buf[0] = cset->range_starts[i]; 3587: cmp_buf[4] = cset->range_ends[i]; 3588: if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 3589: && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 3590: { 3591: match_len = char_len; 3592: goto check_node_accept_bytes_match; 3593: } 3594: } 3595: } 3596: check_node_accept_bytes_match: 3597: if (!cset->non_match) 3598: return match_len; 3599: else 3600: { 3601: if (match_len > 0) 3602: return 0; 3603: else 3604: return (elem_len > char_len) ? elem_len : char_len; 3605: } 3606: } 3607: return 0; 3608: } 3609: 3610: # ifdef _LIBC 3611: static unsigned int 3612: find_collation_sequence_value (mbs, mbs_len) 3613: const unsigned char *mbs; 3614: size_t mbs_len; 3615: { 3616: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3617: if (nrules == 0) 3618: { 3619: if (mbs_len == 1) 3620: { 3621: /* No valid character. Match it as a single byte character. */ 3622: const unsigned char *collseq = (const unsigned char *) 3623: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3624: return collseq[mbs[0]]; 3625: } 3626: return UINT_MAX; 3627: } 3628: else 3629: { 3630: int32_t idx; 3631: const unsigned char *extra = (const unsigned char *) 3632: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 3633: 3634: for (idx = 0; ;) 3635: { 3636: int mbs_cnt, found = 0; 3637: int32_t elem_mbs_len; 3638: /* Skip the name of collating element name. */ 3639: idx = idx + extra[idx] + 1; 3640: elem_mbs_len = extra[idx++]; 3641: if (mbs_len == elem_mbs_len) 3642: { 3643: for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt) 3644: if (extra[idx + mbs_cnt] != mbs[mbs_cnt]) 3645: break; 3646: if (mbs_cnt == elem_mbs_len) 3647: /* Found the entry. */ 3648: found = 1; 3649: } 3650: /* Skip the byte sequence of the collating element. */ 3651: idx += elem_mbs_len; 3652: /* Adjust for the alignment. */ 3653: idx = (idx + 3) & ~3; 3654: /* Skip the collation sequence value. */ 3655: idx += sizeof (uint32_t); 3656: /* Skip the wide char sequence of the collating element. */ 3657: idx = idx + sizeof (uint32_t) * (extra[idx] + 1); 3658: /* If we found the entry, return the sequence value. */ 3659: if (found) 3660: return *(uint32_t *) (extra + idx); 3661: /* Skip the collation sequence value. */ 3662: idx += sizeof (uint32_t); 3663: } 3664: } 3665: } 3666: # endif /* _LIBC */ 3667: #endif /* RE_ENABLE_I18N */ 3668: 3669: /* Check whether the node accepts the byte which is IDX-th 3670: byte of the INPUT. */ 3671: 3672: static int 3673: check_node_accept (preg, node, mctx, idx) 3674: const regex_t *preg; 3675: const re_token_t *node; 3676: const re_match_context_t *mctx; 3677: int idx; 3678: { 3679: unsigned char ch; 3680: if (node->constraint) 3681: { 3682: /* The node has constraints. Check whether the current context 3683: satisfies the constraints. */ 3684: unsigned int context = re_string_context_at (mctx->input, idx, 3685: mctx->eflags, 3686: preg->newline_anchor); 3687: if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context)) 3688: return 0; 3689: } 3690: ch = re_string_byte_at (mctx->input, idx); 3691: if (node->type == CHARACTER) 3692: return node->opr.c == ch; 3693: else if (node->type == SIMPLE_BRACKET) 3694: return bitset_contain (node->opr.sbcset, ch); 3695: else if (node->type == OP_PERIOD) 3696: return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE)) 3697: || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL))); 3698: else 3699: return 0; 3700: } 3701: 3702: /* Extend the buffers, if the buffers have run out. */ 3703: 3704: static reg_errcode_t 3705: extend_buffers (mctx) 3706: re_match_context_t *mctx; 3707: { 3708: reg_errcode_t ret; 3709: re_string_t *pstr = mctx->input; 3710: 3711: /* Double the lengthes of the buffers. */ 3712: ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 3713: if (BE (ret != REG_NOERROR, 0)) 3714: return ret; 3715: 3716: if (mctx->state_log != NULL) 3717: { 3718: /* And double the length of state_log. */ 3719: re_dfastate_t **new_array; 3720: new_array = re_realloc (mctx->state_log, re_dfastate_t *, 3721: pstr->bufs_len * 2); 3722: if (BE (new_array == NULL, 0)) 3723: return REG_ESPACE; 3724: mctx->state_log = new_array; 3725: } 3726: 3727: /* Then reconstruct the buffers. */ 3728: if (pstr->icase) 3729: { 3730: #ifdef RE_ENABLE_I18N 3731: if (MB_CUR_MAX > 1) 3732: build_wcs_upper_buffer (pstr); 3733: else 3734: #endif /* RE_ENABLE_I18N */ 3735: build_upper_buffer (pstr); 3736: } 3737: else 3738: { 3739: #ifdef RE_ENABLE_I18N 3740: if (MB_CUR_MAX > 1) 3741: build_wcs_buffer (pstr); 3742: else 3743: #endif /* RE_ENABLE_I18N */ 3744: { 3745: if (pstr->trans != NULL) 3746: re_string_translate_buffer (pstr); 3747: else 3748: pstr->valid_len = pstr->bufs_len; 3749: } 3750: } 3751: return REG_NOERROR; 3752: } 3753: 3754: 3755: /* Functions for matching context. */ 3756: 3757: /* Initialize MCTX. */ 3758: 3759: static reg_errcode_t 3760: match_ctx_init (mctx, eflags, input, n) 3761: re_match_context_t *mctx; 3762: int eflags, n; 3763: re_string_t *input; 3764: { 3765: mctx->eflags = eflags; 3766: mctx->input = input; 3767: mctx->match_last = -1; 3768: if (n > 0) 3769: { 3770: mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n); 3771: mctx->sub_tops = re_malloc (re_sub_match_top_t *, n); 3772: if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0)) 3773: return REG_ESPACE; 3774: } 3775: else 3776: mctx->bkref_ents = NULL; 3777: mctx->nbkref_ents = 0; 3778: mctx->abkref_ents = n; 3779: mctx->max_mb_elem_len = 1; 3780: mctx->nsub_tops = 0; 3781: mctx->asub_tops = n; 3782: return REG_NOERROR; 3783: } 3784: 3785: /* Clean the entries which depend on the current input in MCTX. 3786: This function must be invoked when the matcher changes the start index 3787: of the input, or changes the input string. */ 3788: 3789: static void 3790: match_ctx_clean (mctx) 3791: re_match_context_t *mctx; 3792: { 3793: match_ctx_free_subtops (mctx); 3794: mctx->nsub_tops = 0; 3795: mctx->nbkref_ents = 0; 3796: } 3797: 3798: /* Free all the memory associated with MCTX. */ 3799: 3800: static void 3801: match_ctx_free (mctx) 3802: re_match_context_t *mctx; 3803: { 3804: match_ctx_free_subtops (mctx); 3805: re_free (mctx->sub_tops); 3806: re_free (mctx->bkref_ents); 3807: } 3808: 3809: /* Free all the memory associated with MCTX->SUB_TOPS. */ 3810: 3811: static void 3812: match_ctx_free_subtops (mctx) 3813: re_match_context_t *mctx; 3814: { 3815: int st_idx; 3816: for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx) 3817: { 3818: int sl_idx; 3819: re_sub_match_top_t *top = mctx->sub_tops[st_idx]; 3820: for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx) 3821: { 3822: re_sub_match_last_t *last = top->lasts[sl_idx]; 3823: re_free (last->path.array); 3824: re_free (last); 3825: } 3826: re_free (top->lasts); 3827: if (top->path) 3828: { 3829: re_free (top->path->array); 3830: re_free (top->path); 3831: } 3832: free (top); 3833: } 3834: } 3835: 3836: /* Add a new backreference entry to MCTX. 3837: Note that we assume that caller never call this function with duplicate 3838: entry, and call with STR_IDX which isn't smaller than any existing entry. 3839: */ 3840: 3841: static reg_errcode_t 3842: match_ctx_add_entry (mctx, node, str_idx, from, to) 3843: re_match_context_t *mctx; 3844: int node, str_idx, from, to; 3845: { 3846: if (mctx->nbkref_ents >= mctx->abkref_ents) 3847: { 3848: struct re_backref_cache_entry* new_entry; 3849: new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry, 3850: mctx->abkref_ents * 2); 3851: if (BE (new_entry == NULL, 0)) 3852: { 3853: re_free (mctx->bkref_ents); 3854: return REG_ESPACE; 3855: } 3856: mctx->bkref_ents = new_entry; 3857: memset (mctx->bkref_ents + mctx->nbkref_ents, '\0', 3858: sizeof (struct re_backref_cache_entry) * mctx->abkref_ents); 3859: mctx->abkref_ents *= 2; 3860: } 3861: mctx->bkref_ents[mctx->nbkref_ents].node = node; 3862: mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx; 3863: mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from; 3864: mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to; 3865: mctx->bkref_ents[mctx->nbkref_ents++].flag = 0; 3866: if (mctx->max_mb_elem_len < to - from) 3867: mctx->max_mb_elem_len = to - from; 3868: return REG_NOERROR; 3869: } 3870: 3871: /* Search for the first entry which has the same str_idx. 3872: Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */ 3873: 3874: static int 3875: search_cur_bkref_entry (mctx, str_idx) 3876: re_match_context_t *mctx; 3877: int str_idx; 3878: { 3879: int left, right, mid; 3880: right = mctx->nbkref_ents; 3881: for (left = 0; left < right;) 3882: { 3883: mid = (left + right) / 2; 3884: if (mctx->bkref_ents[mid].str_idx < str_idx) 3885: left = mid + 1; 3886: else 3887: right = mid; 3888: } 3889: return left; 3890: } 3891: 3892: static void 3893: match_ctx_clear_flag (mctx) 3894: re_match_context_t *mctx; 3895: { 3896: int i; 3897: for (i = 0; i < mctx->nbkref_ents; ++i) 3898: { 3899: mctx->bkref_ents[i].flag = 0; 3900: } 3901: } 3902: 3903: /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches 3904: at STR_IDX. */ 3905: 3906: static reg_errcode_t 3907: match_ctx_add_subtop (mctx, node, str_idx) 3908: re_match_context_t *mctx; 3909: int node, str_idx; 3910: { 3911: #ifdef DEBUG 3912: assert (mctx->sub_tops != NULL); 3913: assert (mctx->asub_tops > 0); 3914: #endif 3915: if (mctx->nsub_tops == mctx->asub_tops) 3916: { 3917: re_sub_match_top_t **new_array; 3918: mctx->asub_tops *= 2; 3919: new_array = re_realloc (mctx->sub_tops, re_sub_match_top_t *, 3920: mctx->asub_tops); 3921: if (BE (new_array == NULL, 0)) 3922: return REG_ESPACE; 3923: mctx->sub_tops = new_array; 3924: } 3925: mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t)); 3926: if (mctx->sub_tops[mctx->nsub_tops] == NULL) 3927: return REG_ESPACE; 3928: mctx->sub_tops[mctx->nsub_tops]->node = node; 3929: mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx; 3930: return REG_NOERROR; 3931: } 3932: 3933: /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches 3934: at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */ 3935: 3936: static re_sub_match_last_t * 3937: match_ctx_add_sublast (subtop, node, str_idx) 3938: re_sub_match_top_t *subtop; 3939: int node, str_idx; 3940: { 3941: re_sub_match_last_t *new_entry; 3942: if (subtop->nlasts == subtop->alasts) 3943: { 3944: re_sub_match_last_t **new_array; 3945: subtop->alasts = 2 * subtop->alasts + 1; 3946: new_array = re_realloc (subtop->lasts, re_sub_match_last_t *, 3947: subtop->alasts); 3948: if (BE (new_array == NULL, 0)) 3949: return NULL; 3950: subtop->lasts = new_array; 3951: } 3952: new_entry = calloc (1, sizeof (re_sub_match_last_t)); 3953: if (BE (new_entry == NULL, 0)) 3954: return NULL; 3955: subtop->lasts[subtop->nlasts] = new_entry; 3956: new_entry->node = node; 3957: new_entry->str_idx = str_idx; 3958: ++subtop->nlasts; 3959: return new_entry; 3960: } 3961: 3962: static void 3963: sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx, 3964: check_subexp) 3965: re_sift_context_t *sctx; 3966: re_dfastate_t **sifted_sts, **limited_sts; 3967: int last_node, last_str_idx, check_subexp; 3968: { 3969: sctx->sifted_states = sifted_sts; 3970: sctx->limited_states = limited_sts; 3971: sctx->last_node = last_node; 3972: sctx->last_str_idx = last_str_idx; 3973: sctx->check_subexp = check_subexp; 3974: sctx->cur_bkref = -1; 3975: sctx->cls_subexp_idx = -1; 3976: re_node_set_init_empty (&sctx->limits); 3977: }