Annotation of embedaddon/smartmontools/regex/regexec.c, revision 1.1.1.1
1.1 misho 1: /* Extended regular expression matching and search library.
2: Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3: This file is part of the GNU C Library.
4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5:
6: The GNU C Library is free software; you can redistribute it and/or
7: modify it under the terms of the GNU Lesser General Public
8: License as published by the Free Software Foundation; either
9: version 2.1 of the License, or (at your option) any later version.
10:
11: The GNU C Library is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14: Lesser General Public License for more details.
15:
16: You should have received a copy of the GNU Lesser General Public
17: License along with the GNU C Library; if not, write to the Free
18: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19: 02111-1307 USA. */
20:
21: static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
22: re_string_t *input, int n);
23: static void match_ctx_clean (re_match_context_t *mctx);
24: static void match_ctx_free (re_match_context_t *cache);
25: static void match_ctx_free_subtops (re_match_context_t *mctx);
26: static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
27: int str_idx, int from, int to);
28: static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx);
29: static void match_ctx_clear_flag (re_match_context_t *mctx);
30: static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
31: int str_idx);
32: static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
33: int node, int str_idx);
34: static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
35: re_dfastate_t **limited_sts, int last_node,
36: int last_str_idx, int check_subexp);
37: static reg_errcode_t re_search_internal (const regex_t *preg,
38: const char *string, int length,
39: int start, int range, int stop,
40: size_t nmatch, regmatch_t pmatch[],
41: int eflags);
42: static int re_search_2_stub (struct re_pattern_buffer *bufp,
43: const char *string1, int length1,
44: const char *string2, int length2,
45: int start, int range, struct re_registers *regs,
46: int stop, int ret_len);
47: static int re_search_stub (struct re_pattern_buffer *bufp,
48: const char *string, int length, int start,
49: int range, int stop, struct re_registers *regs,
50: int ret_len);
51: static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
52: int nregs, int regs_allocated);
53: static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
54: const regex_t *preg,
55: const re_match_context_t *mctx,
56: int idx);
57: static reg_errcode_t prune_impossible_nodes (const regex_t *preg,
58: re_match_context_t *mctx);
59: static int check_matching (const regex_t *preg, re_match_context_t *mctx,
60: int fl_search, int fl_longest_match);
61: static int check_halt_node_context (const re_dfa_t *dfa, int node,
62: unsigned int context);
63: static int check_halt_state_context (const regex_t *preg,
64: const re_dfastate_t *state,
65: const re_match_context_t *mctx, int idx);
66: static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
67: int cur_idx, int nmatch);
68: static int proceed_next_node (const regex_t *preg, int nregs, regmatch_t *regs,
69: const re_match_context_t *mctx,
70: int *pidx, int node, re_node_set *eps_via_nodes,
71: struct re_fail_stack_t *fs);
72: static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
73: int str_idx, int *dests, int nregs,
74: regmatch_t *regs,
75: re_node_set *eps_via_nodes);
76: static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
77: regmatch_t *regs, re_node_set *eps_via_nodes);
78: static reg_errcode_t set_regs (const regex_t *preg,
79: const re_match_context_t *mctx,
80: size_t nmatch, regmatch_t *pmatch,
81: int fl_backtrack);
82: static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
83:
84: #ifdef RE_ENABLE_I18N
85: static int sift_states_iter_mb (const regex_t *preg,
86: const re_match_context_t *mctx,
87: re_sift_context_t *sctx,
88: int node_idx, int str_idx, int max_str_idx);
89: #endif /* RE_ENABLE_I18N */
90: static reg_errcode_t sift_states_backward (const regex_t *preg,
91: re_match_context_t *mctx,
92: re_sift_context_t *sctx);
93: static reg_errcode_t update_cur_sifted_state (const regex_t *preg,
94: re_match_context_t *mctx,
95: re_sift_context_t *sctx,
96: int str_idx,
97: re_node_set *dest_nodes);
98: static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
99: re_node_set *dest_nodes,
100: const re_node_set *candidates);
101: static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
102: re_node_set *dest_nodes,
103: const re_node_set *and_nodes);
104: static int check_dst_limits (re_dfa_t *dfa, re_node_set *limits,
105: re_match_context_t *mctx, int dst_node,
106: int dst_idx, int src_node, int src_idx);
107: static int check_dst_limits_calc_pos (re_dfa_t *dfa, re_match_context_t *mctx,
108: int limit, re_node_set *eclosures,
109: int subexp_idx, int node, int str_idx);
110: static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
111: re_node_set *dest_nodes,
112: const re_node_set *candidates,
113: re_node_set *limits,
114: struct re_backref_cache_entry *bkref_ents,
115: int str_idx);
116: static reg_errcode_t sift_states_bkref (const regex_t *preg,
117: re_match_context_t *mctx,
118: re_sift_context_t *sctx,
119: int str_idx, re_node_set *dest_nodes);
120: static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
121: int next_state_log_idx);
122: static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
123: re_dfastate_t **src, int num);
124: static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
125: re_match_context_t *mctx,
126: re_dfastate_t *state, int fl_search);
127: static reg_errcode_t check_subexp_matching_top (re_dfa_t *dfa,
128: re_match_context_t *mctx,
129: re_node_set *cur_nodes,
130: int str_idx);
131: static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
132: re_dfastate_t *pstate,
133: int fl_search,
134: re_match_context_t *mctx);
135: #ifdef RE_ENABLE_I18N
136: static reg_errcode_t transit_state_mb (const regex_t *preg,
137: re_dfastate_t *pstate,
138: re_match_context_t *mctx);
139: #endif /* RE_ENABLE_I18N */
140: static reg_errcode_t transit_state_bkref (const regex_t *preg,
141: re_node_set *nodes,
142: re_match_context_t *mctx);
143: static reg_errcode_t get_subexp (const regex_t *preg, re_match_context_t *mctx,
144: int bkref_node, int bkref_str_idx);
145: static reg_errcode_t get_subexp_sub (const regex_t *preg,
146: re_match_context_t *mctx,
147: re_sub_match_top_t *sub_top,
148: re_sub_match_last_t *sub_last,
149: int bkref_node, int bkref_str);
150: static int find_subexp_node (re_dfa_t *dfa, re_node_set *nodes,
151: int subexp_idx, int fl_open);
152: static reg_errcode_t check_arrival (const regex_t *preg,
153: re_match_context_t *mctx,
154: state_array_t *path, int top_node,
155: int top_str, int last_node, int last_str,
156: int fl_open);
157: static reg_errcode_t check_arrival_add_next_nodes (const regex_t *preg,
158: re_dfa_t *dfa,
159: re_match_context_t *mctx,
160: int str_idx,
161: re_node_set *cur_nodes,
162: re_node_set *next_nodes);
163: static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
164: re_node_set *cur_nodes,
165: int ex_subexp, int fl_open);
166: static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
167: re_node_set *dst_nodes,
168: int target, int ex_subexp,
169: int fl_open);
170: static reg_errcode_t expand_bkref_cache (const regex_t *preg,
171: re_match_context_t *mctx,
172: re_node_set *cur_nodes, int cur_str,
173: int last_str, int subexp_num,
174: int fl_open);
175: static re_dfastate_t **build_trtable (const regex_t *dfa,
176: const re_dfastate_t *state,
177: int fl_search);
178: #ifdef RE_ENABLE_I18N
179: static int check_node_accept_bytes (const regex_t *preg, int node_idx,
180: const re_string_t *input, int idx);
181: # ifdef _LIBC
182: static unsigned int find_collation_sequence_value (const unsigned char *mbs,
183: size_t name_len);
184: # endif /* _LIBC */
185: #endif /* RE_ENABLE_I18N */
186: static int group_nodes_into_DFAstates (const regex_t *dfa,
187: const re_dfastate_t *state,
188: re_node_set *states_node,
189: bitset *states_ch);
190: static int check_node_accept (const regex_t *preg, const re_token_t *node,
191: const re_match_context_t *mctx, int idx);
192: static reg_errcode_t extend_buffers (re_match_context_t *mctx);
193:
194: /* Entry point for POSIX code. */
195:
196: /* regexec searches for a given pattern, specified by PREG, in the
197: string STRING.
198:
199: If NMATCH is zero or REG_NOSUB was set in the cflags argument to
200: `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
201: least NMATCH elements, and we set them to the offsets of the
202: corresponding matched substrings.
203:
204: EFLAGS specifies `execution flags' which affect matching: if
205: REG_NOTBOL is set, then ^ does not match at the beginning of the
206: string; if REG_NOTEOL is set, then $ does not match at the end.
207:
208: We return 0 if we find a match and REG_NOMATCH if not. */
209:
210: int
211: regexec (preg, string, nmatch, pmatch, eflags)
212: const regex_t *__restrict preg;
213: const char *__restrict string;
214: size_t nmatch;
215: regmatch_t pmatch[];
216: int eflags;
217: {
218: reg_errcode_t err;
219: int length = strlen (string);
220: if (preg->no_sub)
221: err = re_search_internal (preg, string, length, 0, length, length, 0,
222: NULL, eflags);
223: else
224: err = re_search_internal (preg, string, length, 0, length, length, nmatch,
225: pmatch, eflags);
226: return err != REG_NOERROR;
227: }
228: #ifdef _LIBC
229: weak_alias (__regexec, regexec)
230: #endif
231:
232: /* Entry points for GNU code. */
233:
234: /* re_match, re_search, re_match_2, re_search_2
235:
236: The former two functions operate on STRING with length LENGTH,
237: while the later two operate on concatenation of STRING1 and STRING2
238: with lengths LENGTH1 and LENGTH2, respectively.
239:
240: re_match() matches the compiled pattern in BUFP against the string,
241: starting at index START.
242:
243: re_search() first tries matching at index START, then it tries to match
244: starting from index START + 1, and so on. The last start position tried
245: is START + RANGE. (Thus RANGE = 0 forces re_search to operate the same
246: way as re_match().)
247:
248: The parameter STOP of re_{match,search}_2 specifies that no match exceeding
249: the first STOP characters of the concatenation of the strings should be
250: concerned.
251:
252: If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
253: and all groups is stroed in REGS. (For the "_2" variants, the offsets are
254: computed relative to the concatenation, not relative to the individual
255: strings.)
256:
257: On success, re_match* functions return the length of the match, re_search*
258: return the position of the start of the match. Return value -1 means no
259: match was found and -2 indicates an internal error. */
260:
261: int
262: re_match (bufp, string, length, start, regs)
263: struct re_pattern_buffer *bufp;
264: const char *string;
265: int length, start;
266: struct re_registers *regs;
267: {
268: return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
269: }
270: #ifdef _LIBC
271: weak_alias (__re_match, re_match)
272: #endif
273:
274: int
275: re_search (bufp, string, length, start, range, regs)
276: struct re_pattern_buffer *bufp;
277: const char *string;
278: int length, start, range;
279: struct re_registers *regs;
280: {
281: return re_search_stub (bufp, string, length, start, range, length, regs, 0);
282: }
283: #ifdef _LIBC
284: weak_alias (__re_search, re_search)
285: #endif
286:
287: int
288: re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
289: struct re_pattern_buffer *bufp;
290: const char *string1, *string2;
291: int length1, length2, start, stop;
292: struct re_registers *regs;
293: {
294: return re_search_2_stub (bufp, string1, length1, string2, length2,
295: start, 0, regs, stop, 1);
296: }
297: #ifdef _LIBC
298: weak_alias (__re_match_2, re_match_2)
299: #endif
300:
301: int
302: re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
303: struct re_pattern_buffer *bufp;
304: const char *string1, *string2;
305: int length1, length2, start, range, stop;
306: struct re_registers *regs;
307: {
308: return re_search_2_stub (bufp, string1, length1, string2, length2,
309: start, range, regs, stop, 0);
310: }
311: #ifdef _LIBC
312: weak_alias (__re_search_2, re_search_2)
313: #endif
314:
315: static int
316: re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
317: stop, ret_len)
318: struct re_pattern_buffer *bufp;
319: const char *string1, *string2;
320: int length1, length2, start, range, stop, ret_len;
321: struct re_registers *regs;
322: {
323: const char *str;
324: int rval;
325: int len = length1 + length2;
326: int free_str = 0;
327:
328: if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
329: return -2;
330:
331: /* Concatenate the strings. */
332: if (length2 > 0)
333: if (length1 > 0)
334: {
335: char *s = re_malloc (char, len);
336:
337: if (BE (s == NULL, 0))
338: return -2;
339: memcpy (s, string1, length1);
340: memcpy (s + length1, string2, length2);
341: str = s;
342: free_str = 1;
343: }
344: else
345: str = string2;
346: else
347: str = string1;
348:
349: rval = re_search_stub (bufp, str, len, start, range, stop, regs,
350: ret_len);
351: if (free_str)
352: re_free ((char *) str);
353: return rval;
354: }
355:
356: /* The parameters have the same meaning as those of re_search.
357: Additional parameters:
358: If RET_LEN is nonzero the length of the match is returned (re_match style);
359: otherwise the position of the match is returned. */
360:
361: static int
362: re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
363: struct re_pattern_buffer *bufp;
364: const char *string;
365: int length, start, range, stop, ret_len;
366: struct re_registers *regs;
367: {
368: reg_errcode_t result;
369: regmatch_t *pmatch;
370: int nregs, rval;
371: int eflags = 0;
372:
373: /* Check for out-of-range. */
374: if (BE (start < 0 || start > length, 0))
375: return -1;
376: if (BE (start + range > length, 0))
377: range = length - start;
378: else if (BE (start + range < 0, 0))
379: range = -start;
380:
381: eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
382: eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
383:
384: /* Compile fastmap if we haven't yet. */
385: if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
386: re_compile_fastmap (bufp);
387:
388: if (BE (bufp->no_sub, 0))
389: regs = NULL;
390:
391: /* We need at least 1 register. */
392: if (regs == NULL)
393: nregs = 1;
394: else if (BE (bufp->regs_allocated == REGS_FIXED &&
395: regs->num_regs < bufp->re_nsub + 1, 0))
396: {
397: nregs = regs->num_regs;
398: if (BE (nregs < 1, 0))
399: {
400: /* Nothing can be copied to regs. */
401: regs = NULL;
402: nregs = 1;
403: }
404: }
405: else
406: nregs = bufp->re_nsub + 1;
407: pmatch = re_malloc (regmatch_t, nregs);
408: if (BE (pmatch == NULL, 0))
409: return -2;
410:
411: result = re_search_internal (bufp, string, length, start, range, stop,
412: nregs, pmatch, eflags);
413:
414: rval = 0;
415:
416: /* I hope we needn't fill ther regs with -1's when no match was found. */
417: if (result != REG_NOERROR)
418: rval = -1;
419: else if (regs != NULL)
420: {
421: /* If caller wants register contents data back, copy them. */
422: bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
423: bufp->regs_allocated);
424: if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
425: rval = -2;
426: }
427:
428: if (BE (rval == 0, 1))
429: {
430: if (ret_len)
431: {
432: assert (pmatch[0].rm_so == start);
433: rval = pmatch[0].rm_eo - start;
434: }
435: else
436: rval = pmatch[0].rm_so;
437: }
438: re_free (pmatch);
439: return rval;
440: }
441:
442: static unsigned
443: re_copy_regs (regs, pmatch, nregs, regs_allocated)
444: struct re_registers *regs;
445: regmatch_t *pmatch;
446: int nregs, regs_allocated;
447: {
448: int rval = REGS_REALLOCATE;
449: int i;
450: int need_regs = nregs + 1;
451: /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
452: uses. */
453:
454: /* Have the register data arrays been allocated? */
455: if (regs_allocated == REGS_UNALLOCATED)
456: { /* No. So allocate them with malloc. */
457: regs->start = re_malloc (regoff_t, need_regs);
458: if (BE (regs->start == NULL, 0))
459: return REGS_UNALLOCATED;
460: regs->end = re_malloc (regoff_t, need_regs);
461: if (BE (regs->end == NULL, 0))
462: {
463: re_free (regs->start);
464: return REGS_UNALLOCATED;
465: }
466: regs->num_regs = need_regs;
467: }
468: else if (regs_allocated == REGS_REALLOCATE)
469: { /* Yes. If we need more elements than were already
470: allocated, reallocate them. If we need fewer, just
471: leave it alone. */
472: if (need_regs > regs->num_regs)
473: {
474: regs->start = re_realloc (regs->start, regoff_t, need_regs);
475: if (BE (regs->start == NULL, 0))
476: {
477: if (regs->end != NULL)
478: re_free (regs->end);
479: return REGS_UNALLOCATED;
480: }
481: regs->end = re_realloc (regs->end, regoff_t, need_regs);
482: if (BE (regs->end == NULL, 0))
483: {
484: re_free (regs->start);
485: return REGS_UNALLOCATED;
486: }
487: regs->num_regs = need_regs;
488: }
489: }
490: else
491: {
492: assert (regs_allocated == REGS_FIXED);
493: /* This function may not be called with REGS_FIXED and nregs too big. */
494: assert (regs->num_regs >= nregs);
495: rval = REGS_FIXED;
496: }
497:
498: /* Copy the regs. */
499: for (i = 0; i < nregs; ++i)
500: {
501: regs->start[i] = pmatch[i].rm_so;
502: regs->end[i] = pmatch[i].rm_eo;
503: }
504: for ( ; i < regs->num_regs; ++i)
505: regs->start[i] = regs->end[i] = -1;
506:
507: return rval;
508: }
509:
510: /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
511: ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
512: this memory for recording register information. STARTS and ENDS
513: must be allocated using the malloc library routine, and must each
514: be at least NUM_REGS * sizeof (regoff_t) bytes long.
515:
516: If NUM_REGS == 0, then subsequent matches should allocate their own
517: register data.
518:
519: Unless this function is called, the first search or match using
520: PATTERN_BUFFER will allocate its own register data, without
521: freeing the old data. */
522:
523: void
524: re_set_registers (bufp, regs, num_regs, starts, ends)
525: struct re_pattern_buffer *bufp;
526: struct re_registers *regs;
527: unsigned num_regs;
528: regoff_t *starts, *ends;
529: {
530: if (num_regs)
531: {
532: bufp->regs_allocated = REGS_REALLOCATE;
533: regs->num_regs = num_regs;
534: regs->start = starts;
535: regs->end = ends;
536: }
537: else
538: {
539: bufp->regs_allocated = REGS_UNALLOCATED;
540: regs->num_regs = 0;
541: regs->start = regs->end = (regoff_t *) 0;
542: }
543: }
544: #ifdef _LIBC
545: weak_alias (__re_set_registers, re_set_registers)
546: #endif
547:
548: /* Entry points compatible with 4.2 BSD regex library. We don't define
549: them unless specifically requested. */
550:
551: #if defined _REGEX_RE_COMP || defined _LIBC
552: int
553: # ifdef _LIBC
554: weak_function
555: # endif
556: re_exec (s)
557: const char *s;
558: {
559: return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
560: }
561: #endif /* _REGEX_RE_COMP */
562:
563: static re_node_set empty_set;
564:
565: /* Internal entry point. */
566:
567: /* Searches for a compiled pattern PREG in the string STRING, whose
568: length is LENGTH. NMATCH, PMATCH, and EFLAGS have the same
569: mingings with regexec. START, and RANGE have the same meanings
570: with re_search.
571: Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
572: otherwise return the error code.
573: Note: We assume front end functions already check ranges.
574: (START + RANGE >= 0 && START + RANGE <= LENGTH) */
575:
576: static reg_errcode_t
577: re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
578: eflags)
579: const regex_t *preg;
580: const char *string;
581: int length, start, range, stop, eflags;
582: size_t nmatch;
583: regmatch_t pmatch[];
584: {
585: reg_errcode_t err;
586: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
587: re_string_t input;
588: int left_lim, right_lim, incr;
589: int fl_longest_match, match_first, match_last = -1;
590: int fast_translate, sb;
591: re_match_context_t mctx;
592: char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
593: && range && !preg->can_be_null) ? preg->fastmap : NULL);
594:
595: /* Check if the DFA haven't been compiled. */
596: if (BE (preg->used == 0 || dfa->init_state == NULL
597: || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
598: || dfa->init_state_begbuf == NULL, 0))
599: return REG_NOMATCH;
600:
601: re_node_set_init_empty (&empty_set);
602: memset (&mctx, '\0', sizeof (re_match_context_t));
603:
604: /* We must check the longest matching, if nmatch > 0. */
605: fl_longest_match = (nmatch != 0 || dfa->nbackref);
606:
607: err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
608: preg->translate, preg->syntax & RE_ICASE);
609: if (BE (err != REG_NOERROR, 0))
610: goto free_return;
611: input.stop = stop;
612:
613: err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
614: if (BE (err != REG_NOERROR, 0))
615: goto free_return;
616:
617: /* We will log all the DFA states through which the dfa pass,
618: if nmatch > 1, or this dfa has "multibyte node", which is a
619: back-reference or a node which can accept multibyte character or
620: multi character collating element. */
621: if (nmatch > 1 || dfa->has_mb_node)
622: {
623: mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
624: if (BE (mctx.state_log == NULL, 0))
625: {
626: err = REG_ESPACE;
627: goto free_return;
628: }
629: }
630: else
631: mctx.state_log = NULL;
632:
633: #ifdef DEBUG
634: /* We assume front-end functions already check them. */
635: assert (start + range >= 0 && start + range <= length);
636: #endif
637:
638: match_first = start;
639: input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
640: : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
641:
642: /* Check incrementally whether of not the input string match. */
643: incr = (range < 0) ? -1 : 1;
644: left_lim = (range < 0) ? start + range : start;
645: right_lim = (range < 0) ? start : start + range;
646: sb = MB_CUR_MAX == 1;
647: fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
648:
649: for (;;)
650: {
651: /* At first get the current byte from input string. */
652: if (fastmap)
653: {
654: if (BE (fast_translate, 1))
655: {
656: unsigned RE_TRANSLATE_TYPE t
657: = (unsigned RE_TRANSLATE_TYPE) preg->translate;
658: if (BE (range >= 0, 1))
659: {
660: if (BE (t != NULL, 0))
661: {
662: while (BE (match_first < right_lim, 1)
663: && !fastmap[t[(unsigned char) string[match_first]]])
664: ++match_first;
665: }
666: else
667: {
668: while (BE (match_first < right_lim, 1)
669: && !fastmap[(unsigned char) string[match_first]])
670: ++match_first;
671: }
672: if (BE (match_first == right_lim, 0))
673: {
674: int ch = match_first >= length
675: ? 0 : (unsigned char) string[match_first];
676: if (!fastmap[t ? t[ch] : ch])
677: break;
678: }
679: }
680: else
681: {
682: while (match_first >= left_lim)
683: {
684: int ch = match_first >= length
685: ? 0 : (unsigned char) string[match_first];
686: if (fastmap[t ? t[ch] : ch])
687: break;
688: --match_first;
689: }
690: if (match_first < left_lim)
691: break;
692: }
693: }
694: else
695: {
696: int ch;
697:
698: do
699: {
700: /* In this case, we can't determine easily the current byte,
701: since it might be a component byte of a multibyte
702: character. Then we use the constructed buffer
703: instead. */
704: /* If MATCH_FIRST is out of the valid range, reconstruct the
705: buffers. */
706: if (input.raw_mbs_idx + input.valid_len <= match_first
707: || match_first < input.raw_mbs_idx)
708: {
709: err = re_string_reconstruct (&input, match_first, eflags,
710: preg->newline_anchor);
711: if (BE (err != REG_NOERROR, 0))
712: goto free_return;
713: }
714: /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
715: Note that MATCH_FIRST must not be smaller than 0. */
716: ch = ((match_first >= length) ? 0
717: : re_string_byte_at (&input,
718: match_first - input.raw_mbs_idx));
719: if (fastmap[ch])
720: break;
721: match_first += incr;
722: }
723: while (match_first >= left_lim && match_first <= right_lim);
724: if (! fastmap[ch])
725: break;
726: }
727: }
728:
729: /* Reconstruct the buffers so that the matcher can assume that
730: the matching starts from the begining of the buffer. */
731: err = re_string_reconstruct (&input, match_first, eflags,
732: preg->newline_anchor);
733: if (BE (err != REG_NOERROR, 0))
734: goto free_return;
735: #ifdef RE_ENABLE_I18N
736: /* Eliminate it when it is a component of a multibyte character
737: and isn't the head of a multibyte character. */
738: if (sb || re_string_first_byte (&input, 0))
739: #endif
740: {
741: /* It seems to be appropriate one, then use the matcher. */
742: /* We assume that the matching starts from 0. */
743: mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
744: match_last = check_matching (preg, &mctx, 0, fl_longest_match);
745: if (match_last != -1)
746: {
747: if (BE (match_last == -2, 0))
748: {
749: err = REG_ESPACE;
750: goto free_return;
751: }
752: else
753: {
754: mctx.match_last = match_last;
755: if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
756: {
757: re_dfastate_t *pstate = mctx.state_log[match_last];
758: mctx.last_node = check_halt_state_context (preg, pstate,
759: &mctx, match_last);
760: }
761: if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
762: || dfa->nbackref)
763: {
764: err = prune_impossible_nodes (preg, &mctx);
765: if (err == REG_NOERROR)
766: break;
767: if (BE (err != REG_NOMATCH, 0))
768: goto free_return;
769: }
770: else
771: break; /* We found a matching. */
772: }
773: }
774: match_ctx_clean (&mctx);
775: }
776: /* Update counter. */
777: match_first += incr;
778: if (match_first < left_lim || right_lim < match_first)
779: break;
780: }
781:
782: /* Set pmatch[] if we need. */
783: if (match_last != -1 && nmatch > 0)
784: {
785: int reg_idx;
786:
787: /* Initialize registers. */
788: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
789: pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
790:
791: /* Set the points where matching start/end. */
792: pmatch[0].rm_so = 0;
793: pmatch[0].rm_eo = mctx.match_last;
794:
795: if (!preg->no_sub && nmatch > 1)
796: {
797: err = set_regs (preg, &mctx, nmatch, pmatch,
798: dfa->has_plural_match && dfa->nbackref > 0);
799: if (BE (err != REG_NOERROR, 0))
800: goto free_return;
801: }
802:
803: /* At last, add the offset to the each registers, since we slided
804: the buffers so that We can assume that the matching starts from 0. */
805: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
806: if (pmatch[reg_idx].rm_so != -1)
807: {
808: pmatch[reg_idx].rm_so += match_first;
809: pmatch[reg_idx].rm_eo += match_first;
810: }
811: }
812: err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
813: free_return:
814: re_free (mctx.state_log);
815: if (dfa->nbackref)
816: match_ctx_free (&mctx);
817: re_string_destruct (&input);
818: return err;
819: }
820:
821: static reg_errcode_t
822: prune_impossible_nodes (preg, mctx)
823: const regex_t *preg;
824: re_match_context_t *mctx;
825: {
826: int halt_node, match_last;
827: reg_errcode_t ret;
828: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
829: re_dfastate_t **sifted_states;
830: re_dfastate_t **lim_states = NULL;
831: re_sift_context_t sctx;
832: #ifdef DEBUG
833: assert (mctx->state_log != NULL);
834: #endif
835: match_last = mctx->match_last;
836: halt_node = mctx->last_node;
837: sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
838: if (BE (sifted_states == NULL, 0))
839: {
840: ret = REG_ESPACE;
841: goto free_return;
842: }
843: if (dfa->nbackref)
844: {
845: lim_states = re_malloc (re_dfastate_t *, match_last + 1);
846: if (BE (lim_states == NULL, 0))
847: {
848: ret = REG_ESPACE;
849: goto free_return;
850: }
851: while (1)
852: {
853: memset (lim_states, '\0',
854: sizeof (re_dfastate_t *) * (match_last + 1));
855: match_ctx_clear_flag (mctx);
856: sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
857: match_last, 0);
858: ret = sift_states_backward (preg, mctx, &sctx);
859: re_node_set_free (&sctx.limits);
860: if (BE (ret != REG_NOERROR, 0))
861: goto free_return;
862: if (sifted_states[0] != NULL || lim_states[0] != NULL)
863: break;
864: do
865: {
866: --match_last;
867: if (match_last < 0)
868: {
869: ret = REG_NOMATCH;
870: goto free_return;
871: }
872: } while (!mctx->state_log[match_last]->halt);
873: halt_node = check_halt_state_context (preg,
874: mctx->state_log[match_last],
875: mctx, match_last);
876: }
877: ret = merge_state_array (dfa, sifted_states, lim_states,
878: match_last + 1);
879: re_free (lim_states);
880: lim_states = NULL;
881: if (BE (ret != REG_NOERROR, 0))
882: goto free_return;
883: }
884: else
885: {
886: sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
887: match_last, 0);
888: ret = sift_states_backward (preg, mctx, &sctx);
889: re_node_set_free (&sctx.limits);
890: if (BE (ret != REG_NOERROR, 0))
891: goto free_return;
892: }
893: re_free (mctx->state_log);
894: mctx->state_log = sifted_states;
895: sifted_states = NULL;
896: mctx->last_node = halt_node;
897: mctx->match_last = match_last;
898: ret = REG_NOERROR;
899: free_return:
900: re_free (sifted_states);
901: re_free (lim_states);
902: return ret;
903: }
904:
905: /* Acquire an initial state and return it.
906: We must select appropriate initial state depending on the context,
907: since initial states may have constraints like "\<", "^", etc.. */
908:
909: static inline re_dfastate_t *
910: acquire_init_state_context (err, preg, mctx, idx)
911: reg_errcode_t *err;
912: const regex_t *preg;
913: const re_match_context_t *mctx;
914: int idx;
915: {
916: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
917:
918: *err = REG_NOERROR;
919: if (dfa->init_state->has_constraint)
920: {
921: unsigned int context;
922: context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
923: preg->newline_anchor);
924: if (IS_WORD_CONTEXT (context))
925: return dfa->init_state_word;
926: else if (IS_ORDINARY_CONTEXT (context))
927: return dfa->init_state;
928: else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
929: return dfa->init_state_begbuf;
930: else if (IS_NEWLINE_CONTEXT (context))
931: return dfa->init_state_nl;
932: else if (IS_BEGBUF_CONTEXT (context))
933: {
934: /* It is relatively rare case, then calculate on demand. */
935: return re_acquire_state_context (err, dfa,
936: dfa->init_state->entrance_nodes,
937: context);
938: }
939: else
940: /* Must not happen? */
941: return dfa->init_state;
942: }
943: else
944: return dfa->init_state;
945: }
946:
947: /* Check whether the regular expression match input string INPUT or not,
948: and return the index where the matching end, return -1 if not match,
949: or return -2 in case of an error.
950: FL_SEARCH means we must search where the matching starts,
951: FL_LONGEST_MATCH means we want the POSIX longest matching.
952: Note that the matcher assume that the maching starts from the current
953: index of the buffer. */
954:
955: static int
956: check_matching (preg, mctx, fl_search, fl_longest_match)
957: const regex_t *preg;
958: re_match_context_t *mctx;
959: int fl_search, fl_longest_match;
960: {
961: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
962: reg_errcode_t err;
963: int match = 0;
964: int match_last = -1;
965: int cur_str_idx = re_string_cur_idx (mctx->input);
966: re_dfastate_t *cur_state;
967:
968: cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
969: /* An initial state must not be NULL(invalid state). */
970: if (BE (cur_state == NULL, 0))
971: return -2;
972: if (mctx->state_log != NULL)
973: mctx->state_log[cur_str_idx] = cur_state;
974:
975: /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
976: later. E.g. Processing back references. */
977: if (dfa->nbackref)
978: {
979: err = check_subexp_matching_top (dfa, mctx, &cur_state->nodes, 0);
980: if (BE (err != REG_NOERROR, 0))
981: return err;
982: }
983:
984: if (cur_state->has_backref)
985: {
986: err = transit_state_bkref (preg, &cur_state->nodes, mctx);
987: if (BE (err != REG_NOERROR, 0))
988: return err;
989: }
990:
991: /* If the RE accepts NULL string. */
992: if (cur_state->halt)
993: {
994: if (!cur_state->has_constraint
995: || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
996: {
997: if (!fl_longest_match)
998: return cur_str_idx;
999: else
1000: {
1001: match_last = cur_str_idx;
1002: match = 1;
1003: }
1004: }
1005: }
1006:
1007: while (!re_string_eoi (mctx->input))
1008: {
1009: cur_state = transit_state (&err, preg, mctx, cur_state,
1010: fl_search && !match);
1011: if (cur_state == NULL) /* Reached at the invalid state or an error. */
1012: {
1013: cur_str_idx = re_string_cur_idx (mctx->input);
1014: if (BE (err != REG_NOERROR, 0))
1015: return -2;
1016: if (fl_search && !match)
1017: {
1018: /* Restart from initial state, since we are searching
1019: the point from where matching start. */
1020: #ifdef RE_ENABLE_I18N
1021: if (MB_CUR_MAX == 1
1022: || re_string_first_byte (mctx->input, cur_str_idx))
1023: #endif /* RE_ENABLE_I18N */
1024: cur_state = acquire_init_state_context (&err, preg, mctx,
1025: cur_str_idx);
1026: if (BE (cur_state == NULL && err != REG_NOERROR, 0))
1027: return -2;
1028: if (mctx->state_log != NULL)
1029: mctx->state_log[cur_str_idx] = cur_state;
1030: }
1031: else if (!fl_longest_match && match)
1032: break;
1033: else /* (fl_longest_match && match) || (!fl_search && !match) */
1034: {
1035: if (mctx->state_log == NULL)
1036: break;
1037: else
1038: {
1039: int max = mctx->state_log_top;
1040: for (; cur_str_idx <= max; ++cur_str_idx)
1041: if (mctx->state_log[cur_str_idx] != NULL)
1042: break;
1043: if (cur_str_idx > max)
1044: break;
1045: }
1046: }
1047: }
1048:
1049: if (cur_state != NULL && cur_state->halt)
1050: {
1051: /* Reached at a halt state.
1052: Check the halt state can satisfy the current context. */
1053: if (!cur_state->has_constraint
1054: || check_halt_state_context (preg, cur_state, mctx,
1055: re_string_cur_idx (mctx->input)))
1056: {
1057: /* We found an appropriate halt state. */
1058: match_last = re_string_cur_idx (mctx->input);
1059: match = 1;
1060: if (!fl_longest_match)
1061: break;
1062: }
1063: }
1064: }
1065: return match_last;
1066: }
1067:
1068: /* Check NODE match the current context. */
1069:
1070: static int check_halt_node_context (dfa, node, context)
1071: const re_dfa_t *dfa;
1072: int node;
1073: unsigned int context;
1074: {
1075: re_token_type_t type = dfa->nodes[node].type;
1076: unsigned int constraint = dfa->nodes[node].constraint;
1077: if (type != END_OF_RE)
1078: return 0;
1079: if (!constraint)
1080: return 1;
1081: if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1082: return 0;
1083: return 1;
1084: }
1085:
1086: /* Check the halt state STATE match the current context.
1087: Return 0 if not match, if the node, STATE has, is a halt node and
1088: match the context, return the node. */
1089:
1090: static int
1091: check_halt_state_context (preg, state, mctx, idx)
1092: const regex_t *preg;
1093: const re_dfastate_t *state;
1094: const re_match_context_t *mctx;
1095: int idx;
1096: {
1097: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1098: int i;
1099: unsigned int context;
1100: #ifdef DEBUG
1101: assert (state->halt);
1102: #endif
1103: context = re_string_context_at (mctx->input, idx, mctx->eflags,
1104: preg->newline_anchor);
1105: for (i = 0; i < state->nodes.nelem; ++i)
1106: if (check_halt_node_context (dfa, state->nodes.elems[i], context))
1107: return state->nodes.elems[i];
1108: return 0;
1109: }
1110:
1111: /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1112: corresponding to the DFA).
1113: Return the destination node, and update EPS_VIA_NODES, return -1 in case
1114: of errors. */
1115:
1116: static int
1117: proceed_next_node (preg, nregs, regs, mctx, pidx, node, eps_via_nodes, fs)
1118: const regex_t *preg;
1119: regmatch_t *regs;
1120: const re_match_context_t *mctx;
1121: int nregs, *pidx, node;
1122: re_node_set *eps_via_nodes;
1123: struct re_fail_stack_t *fs;
1124: {
1125: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1126: int i, err, dest_node;
1127: dest_node = -1;
1128: if (IS_EPSILON_NODE (dfa->nodes[node].type))
1129: {
1130: re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1131: int ndest, dest_nodes[2];
1132: err = re_node_set_insert (eps_via_nodes, node);
1133: if (BE (err < 0, 0))
1134: return -1;
1135: /* Pick up valid destinations. */
1136: for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1137: {
1138: int candidate = dfa->edests[node].elems[i];
1139: if (!re_node_set_contains (cur_nodes, candidate))
1140: continue;
1141: dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1142: dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1143: ++ndest;
1144: }
1145: if (ndest <= 1)
1146: return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1147: /* In order to avoid infinite loop like "(a*)*". */
1148: if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1149: return dest_nodes[1];
1150: if (fs != NULL)
1151: push_fail_stack (fs, *pidx, dest_nodes, nregs, regs, eps_via_nodes);
1152: return dest_nodes[0];
1153: }
1154: else
1155: {
1156: int naccepted = 0;
1157: re_token_type_t type = dfa->nodes[node].type;
1158:
1159: #ifdef RE_ENABLE_I18N
1160: if (ACCEPT_MB_NODE (type))
1161: naccepted = check_node_accept_bytes (preg, node, mctx->input, *pidx);
1162: else
1163: #endif /* RE_ENABLE_I18N */
1164: if (type == OP_BACK_REF)
1165: {
1166: int subexp_idx = dfa->nodes[node].opr.idx;
1167: naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1168: if (fs != NULL)
1169: {
1170: if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1171: return -1;
1172: else if (naccepted)
1173: {
1174: char *buf = (char *) re_string_get_buffer (mctx->input);
1175: if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1176: naccepted) != 0)
1177: return -1;
1178: }
1179: }
1180:
1181: if (naccepted == 0)
1182: {
1183: err = re_node_set_insert (eps_via_nodes, node);
1184: if (BE (err < 0, 0))
1185: return -2;
1186: dest_node = dfa->edests[node].elems[0];
1187: if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1188: dest_node))
1189: return dest_node;
1190: }
1191: }
1192:
1193: if (naccepted != 0
1194: || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
1195: {
1196: dest_node = dfa->nexts[node];
1197: *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1198: if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1199: || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1200: dest_node)))
1201: return -1;
1202: re_node_set_empty (eps_via_nodes);
1203: return dest_node;
1204: }
1205: }
1206: return -1;
1207: }
1208:
1209: static reg_errcode_t
1210: push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1211: struct re_fail_stack_t *fs;
1212: int str_idx, *dests, nregs;
1213: regmatch_t *regs;
1214: re_node_set *eps_via_nodes;
1215: {
1216: reg_errcode_t err;
1217: int num = fs->num++;
1218: if (fs->num == fs->alloc)
1219: {
1220: struct re_fail_stack_ent_t *new_array;
1221: fs->alloc *= 2;
1222: new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1223: * fs->alloc));
1224: if (new_array == NULL)
1225: return REG_ESPACE;
1226: fs->stack = new_array;
1227: }
1228: fs->stack[num].idx = str_idx;
1229: fs->stack[num].node = dests[1];
1230: fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1231: memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1232: err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1233: return err;
1234: }
1235:
1236: static int
1237: pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1238: struct re_fail_stack_t *fs;
1239: int *pidx, nregs;
1240: regmatch_t *regs;
1241: re_node_set *eps_via_nodes;
1242: {
1243: int num = --fs->num;
1244: assert (num >= 0);
1245: *pidx = fs->stack[num].idx;
1246: memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1247: re_node_set_free (eps_via_nodes);
1248: re_free (fs->stack[num].regs);
1249: *eps_via_nodes = fs->stack[num].eps_via_nodes;
1250: return fs->stack[num].node;
1251: }
1252:
1253: /* Set the positions where the subexpressions are starts/ends to registers
1254: PMATCH.
1255: Note: We assume that pmatch[0] is already set, and
1256: pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1). */
1257:
1258: static reg_errcode_t
1259: set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1260: const regex_t *preg;
1261: const re_match_context_t *mctx;
1262: size_t nmatch;
1263: regmatch_t *pmatch;
1264: int fl_backtrack;
1265: {
1266: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1267: int idx, cur_node, real_nmatch;
1268: re_node_set eps_via_nodes;
1269: struct re_fail_stack_t *fs;
1270: struct re_fail_stack_t fs_body = {0, 2, NULL};
1271: #ifdef DEBUG
1272: assert (nmatch > 1);
1273: assert (mctx->state_log != NULL);
1274: #endif
1275: if (fl_backtrack)
1276: {
1277: fs = &fs_body;
1278: fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1279: }
1280: else
1281: fs = NULL;
1282: cur_node = dfa->init_node;
1283: real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1284: re_node_set_init_empty (&eps_via_nodes);
1285: for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1286: {
1287: update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1288: if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1289: {
1290: int reg_idx;
1291: if (fs)
1292: {
1293: for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1294: if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1295: break;
1296: if (reg_idx == nmatch)
1297: {
1298: re_node_set_free (&eps_via_nodes);
1299: return free_fail_stack_return (fs);
1300: }
1301: cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1302: &eps_via_nodes);
1303: }
1304: else
1305: {
1306: re_node_set_free (&eps_via_nodes);
1307: return REG_NOERROR;
1308: }
1309: }
1310:
1311: /* Proceed to next node. */
1312: cur_node = proceed_next_node (preg, nmatch, pmatch, mctx, &idx, cur_node,
1313: &eps_via_nodes, fs);
1314:
1315: if (BE (cur_node < 0, 0))
1316: {
1317: if (cur_node == -2)
1318: return REG_ESPACE;
1319: if (fs)
1320: cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1321: &eps_via_nodes);
1322: else
1323: {
1324: re_node_set_free (&eps_via_nodes);
1325: return REG_NOMATCH;
1326: }
1327: }
1328: }
1329: re_node_set_free (&eps_via_nodes);
1330: return free_fail_stack_return (fs);
1331: }
1332:
1333: static reg_errcode_t
1334: free_fail_stack_return (fs)
1335: struct re_fail_stack_t *fs;
1336: {
1337: if (fs)
1338: {
1339: int fs_idx;
1340: for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1341: {
1342: re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1343: re_free (fs->stack[fs_idx].regs);
1344: }
1345: re_free (fs->stack);
1346: }
1347: return REG_NOERROR;
1348: }
1349:
1350: static void
1351: update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1352: re_dfa_t *dfa;
1353: regmatch_t *pmatch;
1354: int cur_node, cur_idx, nmatch;
1355: {
1356: int type = dfa->nodes[cur_node].type;
1357: int reg_num;
1358: if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1359: return;
1360: reg_num = dfa->nodes[cur_node].opr.idx + 1;
1361: if (reg_num >= nmatch)
1362: return;
1363: if (type == OP_OPEN_SUBEXP)
1364: {
1365: /* We are at the first node of this sub expression. */
1366: pmatch[reg_num].rm_so = cur_idx;
1367: pmatch[reg_num].rm_eo = -1;
1368: }
1369: else if (type == OP_CLOSE_SUBEXP)
1370: /* We are at the first node of this sub expression. */
1371: pmatch[reg_num].rm_eo = cur_idx;
1372: }
1373:
1374: #define NUMBER_OF_STATE 1
1375:
1376: /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1377: and sift the nodes in each states according to the following rules.
1378: Updated state_log will be wrote to STATE_LOG.
1379:
1380: Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1381: 1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1382: If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1383: the LAST_NODE, we throw away the node `a'.
1384: 2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1385: string `s' and transit to `b':
1386: i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1387: away the node `a'.
1388: ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1389: throwed away, we throw away the node `a'.
1390: 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1391: i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1392: node `a'.
1393: ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1394: we throw away the node `a'. */
1395:
1396: #define STATE_NODE_CONTAINS(state,node) \
1397: ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1398:
1399: static reg_errcode_t
1400: sift_states_backward (preg, mctx, sctx)
1401: const regex_t *preg;
1402: re_match_context_t *mctx;
1403: re_sift_context_t *sctx;
1404: {
1405: reg_errcode_t err;
1406: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1407: int null_cnt = 0;
1408: int str_idx = sctx->last_str_idx;
1409: re_node_set cur_dest;
1410: re_node_set *cur_src; /* Points the state_log[str_idx]->nodes */
1411:
1412: #ifdef DEBUG
1413: assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1414: #endif
1415: cur_src = &mctx->state_log[str_idx]->nodes;
1416:
1417: /* Build sifted state_log[str_idx]. It has the nodes which can epsilon
1418: transit to the last_node and the last_node itself. */
1419: err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1420: if (BE (err != REG_NOERROR, 0))
1421: return err;
1422: err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1423: if (BE (err != REG_NOERROR, 0))
1424: goto free_return;
1425:
1426: /* Then check each states in the state_log. */
1427: while (str_idx > 0)
1428: {
1429: int i, ret;
1430: /* Update counters. */
1431: null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1432: if (null_cnt > mctx->max_mb_elem_len)
1433: {
1434: memset (sctx->sifted_states, '\0',
1435: sizeof (re_dfastate_t *) * str_idx);
1436: re_node_set_free (&cur_dest);
1437: return REG_NOERROR;
1438: }
1439: re_node_set_empty (&cur_dest);
1440: --str_idx;
1441: cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1442: : &mctx->state_log[str_idx]->nodes);
1443:
1444: /* Then build the next sifted state.
1445: We build the next sifted state on `cur_dest', and update
1446: `sifted_states[str_idx]' with `cur_dest'.
1447: Note:
1448: `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1449: `cur_src' points the node_set of the old `state_log[str_idx]'. */
1450: for (i = 0; i < cur_src->nelem; i++)
1451: {
1452: int prev_node = cur_src->elems[i];
1453: int naccepted = 0;
1454: re_token_type_t type = dfa->nodes[prev_node].type;
1455:
1456: if (IS_EPSILON_NODE(type))
1457: continue;
1458: #ifdef RE_ENABLE_I18N
1459: /* If the node may accept `multi byte'. */
1460: if (ACCEPT_MB_NODE (type))
1461: naccepted = sift_states_iter_mb (preg, mctx, sctx, prev_node,
1462: str_idx, sctx->last_str_idx);
1463:
1464: #endif /* RE_ENABLE_I18N */
1465: /* We don't check backreferences here.
1466: See update_cur_sifted_state(). */
1467:
1468: if (!naccepted
1469: && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1470: str_idx)
1471: && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1472: dfa->nexts[prev_node]))
1473: naccepted = 1;
1474:
1475: if (naccepted == 0)
1476: continue;
1477:
1478: if (sctx->limits.nelem)
1479: {
1480: int to_idx = str_idx + naccepted;
1481: if (check_dst_limits (dfa, &sctx->limits, mctx,
1482: dfa->nexts[prev_node], to_idx,
1483: prev_node, str_idx))
1484: continue;
1485: }
1486: ret = re_node_set_insert (&cur_dest, prev_node);
1487: if (BE (ret == -1, 0))
1488: {
1489: err = REG_ESPACE;
1490: goto free_return;
1491: }
1492: }
1493:
1494: /* Add all the nodes which satisfy the following conditions:
1495: - It can epsilon transit to a node in CUR_DEST.
1496: - It is in CUR_SRC.
1497: And update state_log. */
1498: err = update_cur_sifted_state (preg, mctx, sctx, str_idx, &cur_dest);
1499: if (BE (err != REG_NOERROR, 0))
1500: goto free_return;
1501: }
1502: err = REG_NOERROR;
1503: free_return:
1504: re_node_set_free (&cur_dest);
1505: return err;
1506: }
1507:
1508: /* Helper functions. */
1509:
1510: static inline reg_errcode_t
1511: clean_state_log_if_need (mctx, next_state_log_idx)
1512: re_match_context_t *mctx;
1513: int next_state_log_idx;
1514: {
1515: int top = mctx->state_log_top;
1516:
1517: if (next_state_log_idx >= mctx->input->bufs_len
1518: || (next_state_log_idx >= mctx->input->valid_len
1519: && mctx->input->valid_len < mctx->input->len))
1520: {
1521: reg_errcode_t err;
1522: err = extend_buffers (mctx);
1523: if (BE (err != REG_NOERROR, 0))
1524: return err;
1525: }
1526:
1527: if (top < next_state_log_idx)
1528: {
1529: memset (mctx->state_log + top + 1, '\0',
1530: sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1531: mctx->state_log_top = next_state_log_idx;
1532: }
1533: return REG_NOERROR;
1534: }
1535:
1536: static reg_errcode_t
1537: merge_state_array (dfa, dst, src, num)
1538: re_dfa_t *dfa;
1539: re_dfastate_t **dst;
1540: re_dfastate_t **src;
1541: int num;
1542: {
1543: int st_idx;
1544: reg_errcode_t err;
1545: for (st_idx = 0; st_idx < num; ++st_idx)
1546: {
1547: if (dst[st_idx] == NULL)
1548: dst[st_idx] = src[st_idx];
1549: else if (src[st_idx] != NULL)
1550: {
1551: re_node_set merged_set;
1552: err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1553: &src[st_idx]->nodes);
1554: if (BE (err != REG_NOERROR, 0))
1555: return err;
1556: dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1557: re_node_set_free (&merged_set);
1558: if (BE (err != REG_NOERROR, 0))
1559: return err;
1560: }
1561: }
1562: return REG_NOERROR;
1563: }
1564:
1565: static reg_errcode_t
1566: update_cur_sifted_state (preg, mctx, sctx, str_idx, dest_nodes)
1567: const regex_t *preg;
1568: re_match_context_t *mctx;
1569: re_sift_context_t *sctx;
1570: int str_idx;
1571: re_node_set *dest_nodes;
1572: {
1573: reg_errcode_t err;
1574: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1575: const re_node_set *candidates;
1576: candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1577: : &mctx->state_log[str_idx]->nodes);
1578:
1579: /* At first, add the nodes which can epsilon transit to a node in
1580: DEST_NODE. */
1581: if (dest_nodes->nelem)
1582: {
1583: err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1584: if (BE (err != REG_NOERROR, 0))
1585: return err;
1586: }
1587:
1588: /* Then, check the limitations in the current sift_context. */
1589: if (dest_nodes->nelem && sctx->limits.nelem)
1590: {
1591: err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1592: mctx->bkref_ents, str_idx);
1593: if (BE (err != REG_NOERROR, 0))
1594: return err;
1595: }
1596:
1597: /* Update state_log. */
1598: sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1599: if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1600: return err;
1601:
1602: if ((mctx->state_log[str_idx] != NULL
1603: && mctx->state_log[str_idx]->has_backref))
1604: {
1605: err = sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes);
1606: if (BE (err != REG_NOERROR, 0))
1607: return err;
1608: }
1609: return REG_NOERROR;
1610: }
1611:
1612: static reg_errcode_t
1613: add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1614: re_dfa_t *dfa;
1615: re_node_set *dest_nodes;
1616: const re_node_set *candidates;
1617: {
1618: reg_errcode_t err;
1619: int src_idx;
1620: re_node_set src_copy;
1621:
1622: err = re_node_set_init_copy (&src_copy, dest_nodes);
1623: if (BE (err != REG_NOERROR, 0))
1624: return err;
1625: for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1626: {
1627: err = re_node_set_add_intersect (dest_nodes, candidates,
1628: dfa->inveclosures
1629: + src_copy.elems[src_idx]);
1630: if (BE (err != REG_NOERROR, 0))
1631: {
1632: re_node_set_free (&src_copy);
1633: return err;
1634: }
1635: }
1636: re_node_set_free (&src_copy);
1637: return REG_NOERROR;
1638: }
1639:
1640: static reg_errcode_t
1641: sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1642: re_dfa_t *dfa;
1643: int node;
1644: re_node_set *dest_nodes;
1645: const re_node_set *candidates;
1646: {
1647: int ecl_idx;
1648: reg_errcode_t err;
1649: re_node_set *inv_eclosure = dfa->inveclosures + node;
1650: re_node_set except_nodes;
1651: re_node_set_init_empty (&except_nodes);
1652: for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1653: {
1654: int cur_node = inv_eclosure->elems[ecl_idx];
1655: if (cur_node == node)
1656: continue;
1657: if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1658: {
1659: int edst1 = dfa->edests[cur_node].elems[0];
1660: int edst2 = ((dfa->edests[cur_node].nelem > 1)
1661: ? dfa->edests[cur_node].elems[1] : -1);
1662: if ((!re_node_set_contains (inv_eclosure, edst1)
1663: && re_node_set_contains (dest_nodes, edst1))
1664: || (edst2 > 0
1665: && !re_node_set_contains (inv_eclosure, edst2)
1666: && re_node_set_contains (dest_nodes, edst2)))
1667: {
1668: err = re_node_set_add_intersect (&except_nodes, candidates,
1669: dfa->inveclosures + cur_node);
1670: if (BE (err != REG_NOERROR, 0))
1671: {
1672: re_node_set_free (&except_nodes);
1673: return err;
1674: }
1675: }
1676: }
1677: }
1678: for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1679: {
1680: int cur_node = inv_eclosure->elems[ecl_idx];
1681: if (!re_node_set_contains (&except_nodes, cur_node))
1682: {
1683: int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1684: re_node_set_remove_at (dest_nodes, idx);
1685: }
1686: }
1687: re_node_set_free (&except_nodes);
1688: return REG_NOERROR;
1689: }
1690:
1691: static int
1692: check_dst_limits (dfa, limits, mctx, dst_node, dst_idx, src_node, src_idx)
1693: re_dfa_t *dfa;
1694: re_node_set *limits;
1695: re_match_context_t *mctx;
1696: int dst_node, dst_idx, src_node, src_idx;
1697: {
1698: int lim_idx, src_pos, dst_pos;
1699:
1700: for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1701: {
1702: int subexp_idx;
1703: struct re_backref_cache_entry *ent;
1704: ent = mctx->bkref_ents + limits->elems[lim_idx];
1705: subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1706:
1707: dst_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1708: dfa->eclosures + dst_node,
1709: subexp_idx, dst_node, dst_idx);
1710: src_pos = check_dst_limits_calc_pos (dfa, mctx, limits->elems[lim_idx],
1711: dfa->eclosures + src_node,
1712: subexp_idx, src_node, src_idx);
1713:
1714: /* In case of:
1715: <src> <dst> ( <subexp> )
1716: ( <subexp> ) <src> <dst>
1717: ( <subexp1> <src> <subexp2> <dst> <subexp3> ) */
1718: if (src_pos == dst_pos)
1719: continue; /* This is unrelated limitation. */
1720: else
1721: return 1;
1722: }
1723: return 0;
1724: }
1725:
1726: static int
1727: check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
1728: str_idx)
1729: re_dfa_t *dfa;
1730: re_match_context_t *mctx;
1731: re_node_set *eclosures;
1732: int limit, subexp_idx, node, str_idx;
1733: {
1734: struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1735: int pos = (str_idx < lim->subexp_from ? -1
1736: : (lim->subexp_to < str_idx ? 1 : 0));
1737: if (pos == 0
1738: && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
1739: {
1740: int node_idx;
1741: for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1742: {
1743: int node = eclosures->elems[node_idx];
1744: re_token_type_t type= dfa->nodes[node].type;
1745: if (type == OP_BACK_REF)
1746: {
1747: int bi = search_cur_bkref_entry (mctx, str_idx);
1748: for (; bi < mctx->nbkref_ents; ++bi)
1749: {
1750: struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1751: if (ent->str_idx > str_idx)
1752: break;
1753: if (ent->node == node && ent->subexp_from == ent->subexp_to)
1754: {
1755: int cpos, dst;
1756: dst = dfa->edests[node].elems[0];
1757: cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
1758: dfa->eclosures + dst,
1759: subexp_idx, dst,
1760: str_idx);
1761: if ((str_idx == lim->subexp_from && cpos == -1)
1762: || (str_idx == lim->subexp_to && cpos == 0))
1763: return cpos;
1764: }
1765: }
1766: }
1767: if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1768: && str_idx == lim->subexp_from)
1769: {
1770: pos = -1;
1771: break;
1772: }
1773: if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
1774: && str_idx == lim->subexp_to)
1775: break;
1776: }
1777: if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
1778: pos = 1;
1779: }
1780: return pos;
1781: }
1782:
1783: /* Check the limitations of sub expressions LIMITS, and remove the nodes
1784: which are against limitations from DEST_NODES. */
1785:
1786: static reg_errcode_t
1787: check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1788: re_dfa_t *dfa;
1789: re_node_set *dest_nodes;
1790: const re_node_set *candidates;
1791: re_node_set *limits;
1792: struct re_backref_cache_entry *bkref_ents;
1793: int str_idx;
1794: {
1795: reg_errcode_t err;
1796: int node_idx, lim_idx;
1797:
1798: for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1799: {
1800: int subexp_idx;
1801: struct re_backref_cache_entry *ent;
1802: ent = bkref_ents + limits->elems[lim_idx];
1803:
1804: if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1805: continue; /* This is unrelated limitation. */
1806:
1807: subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1808: if (ent->subexp_to == str_idx)
1809: {
1810: int ops_node = -1;
1811: int cls_node = -1;
1812: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1813: {
1814: int node = dest_nodes->elems[node_idx];
1815: re_token_type_t type= dfa->nodes[node].type;
1816: if (type == OP_OPEN_SUBEXP
1817: && subexp_idx == dfa->nodes[node].opr.idx)
1818: ops_node = node;
1819: else if (type == OP_CLOSE_SUBEXP
1820: && subexp_idx == dfa->nodes[node].opr.idx)
1821: cls_node = node;
1822: }
1823:
1824: /* Check the limitation of the open subexpression. */
1825: /* Note that (ent->subexp_to = str_idx != ent->subexp_from). */
1826: if (ops_node >= 0)
1827: {
1828: err = sub_epsilon_src_nodes(dfa, ops_node, dest_nodes,
1829: candidates);
1830: if (BE (err != REG_NOERROR, 0))
1831: return err;
1832: }
1833: /* Check the limitation of the close subexpression. */
1834: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1835: {
1836: int node = dest_nodes->elems[node_idx];
1837: if (!re_node_set_contains (dfa->inveclosures + node, cls_node)
1838: && !re_node_set_contains (dfa->eclosures + node, cls_node))
1839: {
1840: /* It is against this limitation.
1841: Remove it form the current sifted state. */
1842: err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1843: candidates);
1844: if (BE (err != REG_NOERROR, 0))
1845: return err;
1846: --node_idx;
1847: }
1848: }
1849: }
1850: else /* (ent->subexp_to != str_idx) */
1851: {
1852: for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1853: {
1854: int node = dest_nodes->elems[node_idx];
1855: re_token_type_t type= dfa->nodes[node].type;
1856: if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1857: {
1858: if (subexp_idx != dfa->nodes[node].opr.idx)
1859: continue;
1860: if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1861: || (type == OP_OPEN_SUBEXP))
1862: {
1863: /* It is against this limitation.
1864: Remove it form the current sifted state. */
1865: err = sub_epsilon_src_nodes(dfa, node, dest_nodes,
1866: candidates);
1867: if (BE (err != REG_NOERROR, 0))
1868: return err;
1869: }
1870: }
1871: }
1872: }
1873: }
1874: return REG_NOERROR;
1875: }
1876:
1877: static reg_errcode_t
1878: sift_states_bkref (preg, mctx, sctx, str_idx, dest_nodes)
1879: const regex_t *preg;
1880: re_match_context_t *mctx;
1881: re_sift_context_t *sctx;
1882: int str_idx;
1883: re_node_set *dest_nodes;
1884: {
1885: reg_errcode_t err;
1886: re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1887: int node_idx, node;
1888: re_sift_context_t local_sctx;
1889: const re_node_set *candidates;
1890: candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1891: : &mctx->state_log[str_idx]->nodes);
1892: local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized. */
1893:
1894: for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
1895: {
1896: int cur_bkref_idx = re_string_cur_idx (mctx->input);
1897: re_token_type_t type;
1898: node = candidates->elems[node_idx];
1899: type = dfa->nodes[node].type;
1900: if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
1901: continue;
1902: /* Avoid infinite loop for the REs like "()\1+". */
1903: if (node == sctx->last_node && str_idx == sctx->last_str_idx)
1904: continue;
1905: if (type == OP_BACK_REF)
1906: {
1907: int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1908: for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
1909: {
1910: int disabled_idx, subexp_len, to_idx, dst_node;
1911: struct re_backref_cache_entry *entry;
1912: entry = mctx->bkref_ents + enabled_idx;
1913: if (entry->str_idx > str_idx)
1914: break;
1915: if (entry->node != node)
1916: continue;
1917: subexp_len = entry->subexp_to - entry->subexp_from;
1918: to_idx = str_idx + subexp_len;
1919: dst_node = (subexp_len ? dfa->nexts[node]
1920: : dfa->edests[node].elems[0]);
1921:
1922: if (to_idx > sctx->last_str_idx
1923: || sctx->sifted_states[to_idx] == NULL
1924: || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
1925: dst_node)
1926: || check_dst_limits (dfa, &sctx->limits, mctx, node,
1927: str_idx, dst_node, to_idx))
1928: continue;
1929: {
1930: re_dfastate_t *cur_state;
1931: entry->flag = 0;
1932: for (disabled_idx = enabled_idx + 1;
1933: disabled_idx < mctx->nbkref_ents; ++disabled_idx)
1934: {
1935: struct re_backref_cache_entry *entry2;
1936: entry2 = mctx->bkref_ents + disabled_idx;
1937: if (entry2->str_idx > str_idx)
1938: break;
1939: entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
1940: }
1941:
1942: if (local_sctx.sifted_states == NULL)
1943: {
1944: local_sctx = *sctx;
1945: err = re_node_set_init_copy (&local_sctx.limits,
1946: &sctx->limits);
1947: if (BE (err != REG_NOERROR, 0))
1948: goto free_return;
1949: }
1950: local_sctx.last_node = node;
1951: local_sctx.last_str_idx = str_idx;
1952: err = re_node_set_insert (&local_sctx.limits, enabled_idx);
1953: if (BE (err < 0, 0))
1954: {
1955: err = REG_ESPACE;
1956: goto free_return;
1957: }
1958: cur_state = local_sctx.sifted_states[str_idx];
1959: err = sift_states_backward (preg, mctx, &local_sctx);
1960: if (BE (err != REG_NOERROR, 0))
1961: goto free_return;
1962: if (sctx->limited_states != NULL)
1963: {
1964: err = merge_state_array (dfa, sctx->limited_states,
1965: local_sctx.sifted_states,
1966: str_idx + 1);
1967: if (BE (err != REG_NOERROR, 0))
1968: goto free_return;
1969: }
1970: local_sctx.sifted_states[str_idx] = cur_state;
1971: re_node_set_remove (&local_sctx.limits, enabled_idx);
1972: /* We must not use the variable entry here, since
1973: mctx->bkref_ents might be realloced. */
1974: mctx->bkref_ents[enabled_idx].flag = 1;
1975: }
1976: }
1977: enabled_idx = search_cur_bkref_entry (mctx, str_idx);
1978: for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
1979: {
1980: struct re_backref_cache_entry *entry;
1981: entry = mctx->bkref_ents + enabled_idx;
1982: if (entry->str_idx > str_idx)
1983: break;
1984: if (entry->node == node)
1985: entry->flag = 0;
1986: }
1987: }
1988: }
1989: err = REG_NOERROR;
1990: free_return:
1991: if (local_sctx.sifted_states != NULL)
1992: {
1993: re_node_set_free (&local_sctx.limits);
1994: }
1995:
1996: return err;
1997: }
1998:
1999:
2000: #ifdef RE_ENABLE_I18N
2001: static int
2002: sift_states_iter_mb (preg, mctx, sctx, node_idx, str_idx, max_str_idx)
2003: const regex_t *preg;
2004: const re_match_context_t *mctx;
2005: re_sift_context_t *sctx;
2006: int node_idx, str_idx, max_str_idx;
2007: {
2008: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2009: int naccepted;
2010: /* Check the node can accept `multi byte'. */
2011: naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
2012: if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2013: !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2014: dfa->nexts[node_idx]))
2015: /* The node can't accept the `multi byte', or the
2016: destination was already throwed away, then the node
2017: could't accept the current input `multi byte'. */
2018: naccepted = 0;
2019: /* Otherwise, it is sure that the node could accept
2020: `naccepted' bytes input. */
2021: return naccepted;
2022: }
2023: #endif /* RE_ENABLE_I18N */
2024:
2025:
2026: /* Functions for state transition. */
2027:
2028: /* Return the next state to which the current state STATE will transit by
2029: accepting the current input byte, and update STATE_LOG if necessary.
2030: If STATE can accept a multibyte char/collating element/back reference
2031: update the destination of STATE_LOG. */
2032:
2033: static re_dfastate_t *
2034: transit_state (err, preg, mctx, state, fl_search)
2035: reg_errcode_t *err;
2036: const regex_t *preg;
2037: re_match_context_t *mctx;
2038: re_dfastate_t *state;
2039: int fl_search;
2040: {
2041: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2042: re_dfastate_t **trtable, *next_state;
2043: unsigned char ch;
2044: int cur_idx;
2045:
2046: if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
2047: || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
2048: && mctx->input->valid_len < mctx->input->len))
2049: {
2050: *err = extend_buffers (mctx);
2051: if (BE (*err != REG_NOERROR, 0))
2052: return NULL;
2053: }
2054:
2055: *err = REG_NOERROR;
2056: if (state == NULL)
2057: {
2058: next_state = state;
2059: re_string_skip_bytes (mctx->input, 1);
2060: }
2061: else
2062: {
2063: #ifdef RE_ENABLE_I18N
2064: /* If the current state can accept multibyte. */
2065: if (state->accept_mb)
2066: {
2067: *err = transit_state_mb (preg, state, mctx);
2068: if (BE (*err != REG_NOERROR, 0))
2069: return NULL;
2070: }
2071: #endif /* RE_ENABLE_I18N */
2072:
2073: /* Then decide the next state with the single byte. */
2074: if (1)
2075: {
2076: /* Use transition table */
2077: ch = re_string_fetch_byte (mctx->input);
2078: trtable = fl_search ? state->trtable_search : state->trtable;
2079: if (trtable == NULL)
2080: {
2081: trtable = build_trtable (preg, state, fl_search);
2082: if (fl_search)
2083: state->trtable_search = trtable;
2084: else
2085: state->trtable = trtable;
2086: }
2087: next_state = trtable[ch];
2088: }
2089: else
2090: {
2091: /* don't use transition table */
2092: next_state = transit_state_sb (err, preg, state, fl_search, mctx);
2093: if (BE (next_state == NULL && err != REG_NOERROR, 0))
2094: return NULL;
2095: }
2096: }
2097:
2098: cur_idx = re_string_cur_idx (mctx->input);
2099: /* Update the state_log if we need. */
2100: if (mctx->state_log != NULL)
2101: {
2102: if (cur_idx > mctx->state_log_top)
2103: {
2104: mctx->state_log[cur_idx] = next_state;
2105: mctx->state_log_top = cur_idx;
2106: }
2107: else if (mctx->state_log[cur_idx] == 0)
2108: {
2109: mctx->state_log[cur_idx] = next_state;
2110: }
2111: else
2112: {
2113: re_dfastate_t *pstate;
2114: unsigned int context;
2115: re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2116: /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2117: the destination of a multibyte char/collating element/
2118: back reference. Then the next state is the union set of
2119: these destinations and the results of the transition table. */
2120: pstate = mctx->state_log[cur_idx];
2121: log_nodes = pstate->entrance_nodes;
2122: if (next_state != NULL)
2123: {
2124: table_nodes = next_state->entrance_nodes;
2125: *err = re_node_set_init_union (&next_nodes, table_nodes,
2126: log_nodes);
2127: if (BE (*err != REG_NOERROR, 0))
2128: return NULL;
2129: }
2130: else
2131: next_nodes = *log_nodes;
2132: /* Note: We already add the nodes of the initial state,
2133: then we don't need to add them here. */
2134:
2135: context = re_string_context_at (mctx->input,
2136: re_string_cur_idx (mctx->input) - 1,
2137: mctx->eflags, preg->newline_anchor);
2138: next_state = mctx->state_log[cur_idx]
2139: = re_acquire_state_context (err, dfa, &next_nodes, context);
2140: /* We don't need to check errors here, since the return value of
2141: this function is next_state and ERR is already set. */
2142:
2143: if (table_nodes != NULL)
2144: re_node_set_free (&next_nodes);
2145: }
2146: }
2147:
2148: /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2149: later. We must check them here, since the back references in the
2150: next state might use them. */
2151: if (dfa->nbackref && next_state/* && fl_process_bkref */)
2152: {
2153: *err = check_subexp_matching_top (dfa, mctx, &next_state->nodes,
2154: cur_idx);
2155: if (BE (*err != REG_NOERROR, 0))
2156: return NULL;
2157: }
2158:
2159: /* If the next state has back references. */
2160: if (next_state != NULL && next_state->has_backref)
2161: {
2162: *err = transit_state_bkref (preg, &next_state->nodes, mctx);
2163: if (BE (*err != REG_NOERROR, 0))
2164: return NULL;
2165: next_state = mctx->state_log[cur_idx];
2166: }
2167: return next_state;
2168: }
2169:
2170: /* Helper functions for transit_state. */
2171:
2172: /* From the node set CUR_NODES, pick up the nodes whose types are
2173: OP_OPEN_SUBEXP and which have corresponding back references in the regular
2174: expression. And register them to use them later for evaluating the
2175: correspoding back references. */
2176:
2177: static reg_errcode_t
2178: check_subexp_matching_top (dfa, mctx, cur_nodes, str_idx)
2179: re_dfa_t *dfa;
2180: re_match_context_t *mctx;
2181: re_node_set *cur_nodes;
2182: int str_idx;
2183: {
2184: int node_idx;
2185: reg_errcode_t err;
2186:
2187: /* TODO: This isn't efficient.
2188: Because there might be more than one nodes whose types are
2189: OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2190: nodes.
2191: E.g. RE: (a){2} */
2192: for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2193: {
2194: int node = cur_nodes->elems[node_idx];
2195: if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2196: && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2197: {
2198: err = match_ctx_add_subtop (mctx, node, str_idx);
2199: if (BE (err != REG_NOERROR, 0))
2200: return err;
2201: }
2202: }
2203: return REG_NOERROR;
2204: }
2205:
2206: /* Return the next state to which the current state STATE will transit by
2207: accepting the current input byte. */
2208:
2209: static re_dfastate_t *
2210: transit_state_sb (err, preg, state, fl_search, mctx)
2211: reg_errcode_t *err;
2212: const regex_t *preg;
2213: re_dfastate_t *state;
2214: int fl_search;
2215: re_match_context_t *mctx;
2216: {
2217: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218: re_node_set next_nodes;
2219: re_dfastate_t *next_state;
2220: int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
2221: unsigned int context;
2222:
2223: *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2224: if (BE (*err != REG_NOERROR, 0))
2225: return NULL;
2226: for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2227: {
2228: int cur_node = state->nodes.elems[node_cnt];
2229: if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
2230: {
2231: *err = re_node_set_merge (&next_nodes,
2232: dfa->eclosures + dfa->nexts[cur_node]);
2233: if (BE (*err != REG_NOERROR, 0))
2234: {
2235: re_node_set_free (&next_nodes);
2236: return NULL;
2237: }
2238: }
2239: }
2240: if (fl_search)
2241: {
2242: #ifdef RE_ENABLE_I18N
2243: int not_initial = 0;
2244: if (MB_CUR_MAX > 1)
2245: for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
2246: if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
2247: {
2248: not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
2249: break;
2250: }
2251: if (!not_initial)
2252: #endif
2253: {
2254: *err = re_node_set_merge (&next_nodes,
2255: dfa->init_state->entrance_nodes);
2256: if (BE (*err != REG_NOERROR, 0))
2257: {
2258: re_node_set_free (&next_nodes);
2259: return NULL;
2260: }
2261: }
2262: }
2263: context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
2264: preg->newline_anchor);
2265: next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2266: /* We don't need to check errors here, since the return value of
2267: this function is next_state and ERR is already set. */
2268:
2269: re_node_set_free (&next_nodes);
2270: re_string_skip_bytes (mctx->input, 1);
2271: return next_state;
2272: }
2273:
2274: #ifdef RE_ENABLE_I18N
2275: static reg_errcode_t
2276: transit_state_mb (preg, pstate, mctx)
2277: const regex_t *preg;
2278: re_dfastate_t *pstate;
2279: re_match_context_t *mctx;
2280: {
2281: reg_errcode_t err;
2282: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2283: int i;
2284:
2285: for (i = 0; i < pstate->nodes.nelem; ++i)
2286: {
2287: re_node_set dest_nodes, *new_nodes;
2288: int cur_node_idx = pstate->nodes.elems[i];
2289: int naccepted = 0, dest_idx;
2290: unsigned int context;
2291: re_dfastate_t *dest_state;
2292:
2293: if (dfa->nodes[cur_node_idx].constraint)
2294: {
2295: context = re_string_context_at (mctx->input,
2296: re_string_cur_idx (mctx->input),
2297: mctx->eflags, preg->newline_anchor);
2298: if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2299: context))
2300: continue;
2301: }
2302:
2303: /* How many bytes the node can accepts? */
2304: if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2305: naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
2306: re_string_cur_idx (mctx->input));
2307: if (naccepted == 0)
2308: continue;
2309:
2310: /* The node can accepts `naccepted' bytes. */
2311: dest_idx = re_string_cur_idx (mctx->input) + naccepted;
2312: mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2313: : mctx->max_mb_elem_len);
2314: err = clean_state_log_if_need (mctx, dest_idx);
2315: if (BE (err != REG_NOERROR, 0))
2316: return err;
2317: #ifdef DEBUG
2318: assert (dfa->nexts[cur_node_idx] != -1);
2319: #endif
2320: /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2321: then we use pstate->nodes.elems[i] instead. */
2322: new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2323:
2324: dest_state = mctx->state_log[dest_idx];
2325: if (dest_state == NULL)
2326: dest_nodes = *new_nodes;
2327: else
2328: {
2329: err = re_node_set_init_union (&dest_nodes,
2330: dest_state->entrance_nodes, new_nodes);
2331: if (BE (err != REG_NOERROR, 0))
2332: return err;
2333: }
2334: context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
2335: preg->newline_anchor);
2336: mctx->state_log[dest_idx]
2337: = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2338: if (dest_state != NULL)
2339: re_node_set_free (&dest_nodes);
2340: if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2341: return err;
2342: }
2343: return REG_NOERROR;
2344: }
2345: #endif /* RE_ENABLE_I18N */
2346:
2347: static reg_errcode_t
2348: transit_state_bkref (preg, nodes, mctx)
2349: const regex_t *preg;
2350: re_node_set *nodes;
2351: re_match_context_t *mctx;
2352: {
2353: reg_errcode_t err;
2354: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2355: int i;
2356: int cur_str_idx = re_string_cur_idx (mctx->input);
2357:
2358: for (i = 0; i < nodes->nelem; ++i)
2359: {
2360: int dest_str_idx, prev_nelem, bkc_idx;
2361: int node_idx = nodes->elems[i];
2362: unsigned int context;
2363: re_token_t *node = dfa->nodes + node_idx;
2364: re_node_set *new_dest_nodes;
2365:
2366: /* Check whether `node' is a backreference or not. */
2367: if (node->type != OP_BACK_REF)
2368: continue;
2369:
2370: if (node->constraint)
2371: {
2372: context = re_string_context_at (mctx->input, cur_str_idx,
2373: mctx->eflags, preg->newline_anchor);
2374: if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2375: continue;
2376: }
2377:
2378: /* `node' is a backreference.
2379: Check the substring which the substring matched. */
2380: bkc_idx = mctx->nbkref_ents;
2381: err = get_subexp (preg, mctx, node_idx, cur_str_idx);
2382: if (BE (err != REG_NOERROR, 0))
2383: goto free_return;
2384:
2385: /* And add the epsilon closures (which is `new_dest_nodes') of
2386: the backreference to appropriate state_log. */
2387: #ifdef DEBUG
2388: assert (dfa->nexts[node_idx] != -1);
2389: #endif
2390: for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2391: {
2392: int subexp_len;
2393: re_dfastate_t *dest_state;
2394: struct re_backref_cache_entry *bkref_ent;
2395: bkref_ent = mctx->bkref_ents + bkc_idx;
2396: if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2397: continue;
2398: subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2399: new_dest_nodes = (subexp_len == 0
2400: ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2401: : dfa->eclosures + dfa->nexts[node_idx]);
2402: dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2403: - bkref_ent->subexp_from);
2404: context = re_string_context_at (mctx->input, dest_str_idx - 1,
2405: mctx->eflags, preg->newline_anchor);
2406: dest_state = mctx->state_log[dest_str_idx];
2407: prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2408: : mctx->state_log[cur_str_idx]->nodes.nelem);
2409: /* Add `new_dest_node' to state_log. */
2410: if (dest_state == NULL)
2411: {
2412: mctx->state_log[dest_str_idx]
2413: = re_acquire_state_context (&err, dfa, new_dest_nodes,
2414: context);
2415: if (BE (mctx->state_log[dest_str_idx] == NULL
2416: && err != REG_NOERROR, 0))
2417: goto free_return;
2418: }
2419: else
2420: {
2421: re_node_set dest_nodes;
2422: err = re_node_set_init_union (&dest_nodes,
2423: dest_state->entrance_nodes,
2424: new_dest_nodes);
2425: if (BE (err != REG_NOERROR, 0))
2426: {
2427: re_node_set_free (&dest_nodes);
2428: goto free_return;
2429: }
2430: mctx->state_log[dest_str_idx]
2431: = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2432: re_node_set_free (&dest_nodes);
2433: if (BE (mctx->state_log[dest_str_idx] == NULL
2434: && err != REG_NOERROR, 0))
2435: goto free_return;
2436: }
2437: /* We need to check recursively if the backreference can epsilon
2438: transit. */
2439: if (subexp_len == 0
2440: && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2441: {
2442: err = check_subexp_matching_top (dfa, mctx, new_dest_nodes,
2443: cur_str_idx);
2444: if (BE (err != REG_NOERROR, 0))
2445: goto free_return;
2446: err = transit_state_bkref (preg, new_dest_nodes, mctx);
2447: if (BE (err != REG_NOERROR, 0))
2448: goto free_return;
2449: }
2450: }
2451: }
2452: err = REG_NOERROR;
2453: free_return:
2454: return err;
2455: }
2456:
2457: /* Enumerate all the candidates which the backreference BKREF_NODE can match
2458: at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2459: Note that we might collect inappropriate candidates here.
2460: However, the cost of checking them strictly here is too high, then we
2461: delay these checking for prune_impossible_nodes(). */
2462:
2463: static reg_errcode_t
2464: get_subexp (preg, mctx, bkref_node, bkref_str_idx)
2465: const regex_t *preg;
2466: re_match_context_t *mctx;
2467: int bkref_node, bkref_str_idx;
2468: {
2469: int subexp_num, sub_top_idx;
2470: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2471: char *buf = (char *) re_string_get_buffer (mctx->input);
2472: /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX. */
2473: int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2474: for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2475: {
2476: struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2477: if (entry->str_idx > bkref_str_idx)
2478: break;
2479: if (entry->node == bkref_node)
2480: return REG_NOERROR; /* We already checked it. */
2481: }
2482: subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2483:
2484: /* For each sub expression */
2485: for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2486: {
2487: reg_errcode_t err;
2488: re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2489: re_sub_match_last_t *sub_last;
2490: int sub_last_idx, sl_str;
2491: char *bkref_str;
2492:
2493: if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2494: continue; /* It isn't related. */
2495:
2496: sl_str = sub_top->str_idx;
2497: bkref_str = buf + bkref_str_idx;
2498: /* At first, check the last node of sub expressions we already
2499: evaluated. */
2500: for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2501: {
2502: int sl_str_diff;
2503: sub_last = sub_top->lasts[sub_last_idx];
2504: sl_str_diff = sub_last->str_idx - sl_str;
2505: /* The matched string by the sub expression match with the substring
2506: at the back reference? */
2507: if (sl_str_diff > 0
2508: && memcmp (bkref_str, buf + sl_str, sl_str_diff) != 0)
2509: break; /* We don't need to search this sub expression any more. */
2510: bkref_str += sl_str_diff;
2511: sl_str += sl_str_diff;
2512: err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2513: bkref_str_idx);
2514: if (err == REG_NOMATCH)
2515: continue;
2516: if (BE (err != REG_NOERROR, 0))
2517: return err;
2518: }
2519: if (sub_last_idx < sub_top->nlasts)
2520: continue;
2521: if (sub_last_idx > 0)
2522: ++sl_str;
2523: /* Then, search for the other last nodes of the sub expression. */
2524: for (; sl_str <= bkref_str_idx; ++sl_str)
2525: {
2526: int cls_node, sl_str_off;
2527: re_node_set *nodes;
2528: sl_str_off = sl_str - sub_top->str_idx;
2529: /* The matched string by the sub expression match with the substring
2530: at the back reference? */
2531: if (sl_str_off > 0
2532: && memcmp (bkref_str++, buf + sl_str - 1, 1) != 0)
2533: break; /* We don't need to search this sub expression any more. */
2534: if (mctx->state_log[sl_str] == NULL)
2535: continue;
2536: /* Does this state have a ')' of the sub expression? */
2537: nodes = &mctx->state_log[sl_str]->nodes;
2538: cls_node = find_subexp_node (dfa, nodes, subexp_num, 0);
2539: if (cls_node == -1)
2540: continue; /* No. */
2541: if (sub_top->path == NULL)
2542: {
2543: sub_top->path = calloc (sizeof (state_array_t),
2544: sl_str - sub_top->str_idx + 1);
2545: if (sub_top->path == NULL)
2546: return REG_ESPACE;
2547: }
2548: /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2549: in the current context? */
2550: err = check_arrival (preg, mctx, sub_top->path, sub_top->node,
2551: sub_top->str_idx, cls_node, sl_str, 0);
2552: if (err == REG_NOMATCH)
2553: continue;
2554: if (BE (err != REG_NOERROR, 0))
2555: return err;
2556: sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2557: if (BE (sub_last == NULL, 0))
2558: return REG_ESPACE;
2559: err = get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node,
2560: bkref_str_idx);
2561: if (err == REG_NOMATCH)
2562: continue;
2563: }
2564: }
2565: return REG_NOERROR;
2566: }
2567:
2568: /* Helper functions for get_subexp(). */
2569:
2570: /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2571: If it can arrive, register the sub expression expressed with SUB_TOP
2572: and SUB_LAST. */
2573:
2574: static reg_errcode_t
2575: get_subexp_sub (preg, mctx, sub_top, sub_last, bkref_node, bkref_str)
2576: const regex_t *preg;
2577: re_match_context_t *mctx;
2578: re_sub_match_top_t *sub_top;
2579: re_sub_match_last_t *sub_last;
2580: int bkref_node, bkref_str;
2581: {
2582: reg_errcode_t err;
2583: int to_idx;
2584: /* Can the subexpression arrive the back reference? */
2585: err = check_arrival (preg, mctx, &sub_last->path, sub_last->node,
2586: sub_last->str_idx, bkref_node, bkref_str, 1);
2587: if (err != REG_NOERROR)
2588: return err;
2589: err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2590: sub_last->str_idx);
2591: if (BE (err != REG_NOERROR, 0))
2592: return err;
2593: to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2594: clean_state_log_if_need (mctx, to_idx);
2595: return REG_NOERROR;
2596: }
2597:
2598: /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2599: Search '(' if FL_OPEN, or search ')' otherwise.
2600: TODO: This function isn't efficient...
2601: Because there might be more than one nodes whose types are
2602: OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2603: nodes.
2604: E.g. RE: (a){2} */
2605:
2606: static int
2607: find_subexp_node (dfa, nodes, subexp_idx, fl_open)
2608: re_dfa_t *dfa;
2609: re_node_set *nodes;
2610: int subexp_idx, fl_open;
2611: {
2612: int cls_idx;
2613: for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2614: {
2615: int cls_node = nodes->elems[cls_idx];
2616: re_token_t *node = dfa->nodes + cls_node;
2617: if (((fl_open && node->type == OP_OPEN_SUBEXP)
2618: || (!fl_open && node->type == OP_CLOSE_SUBEXP))
2619: && node->opr.idx == subexp_idx)
2620: return cls_node;
2621: }
2622: return -1;
2623: }
2624:
2625: /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2626: LAST_NODE at LAST_STR. We record the path onto PATH since it will be
2627: heavily reused.
2628: Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise. */
2629:
2630: static reg_errcode_t
2631: check_arrival (preg, mctx, path, top_node, top_str, last_node, last_str,
2632: fl_open)
2633: const regex_t *preg;
2634: re_match_context_t *mctx;
2635: state_array_t *path;
2636: int top_node, top_str, last_node, last_str, fl_open;
2637: {
2638: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2639: reg_errcode_t err;
2640: int subexp_num, backup_cur_idx, str_idx, null_cnt;
2641: re_dfastate_t *cur_state = NULL;
2642: re_node_set *cur_nodes, next_nodes;
2643: re_dfastate_t **backup_state_log;
2644: unsigned int context;
2645:
2646: subexp_num = dfa->nodes[top_node].opr.idx;
2647: /* Extend the buffer if we need. */
2648: if (path->alloc < last_str + mctx->max_mb_elem_len + 1)
2649: {
2650: re_dfastate_t **new_array;
2651: int old_alloc = path->alloc;
2652: path->alloc += last_str + mctx->max_mb_elem_len + 1;
2653: new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2654: if (new_array == NULL)
2655: return REG_ESPACE;
2656: path->array = new_array;
2657: memset (new_array + old_alloc, '\0',
2658: sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2659: }
2660:
2661: str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2662:
2663: /* Temporary modify MCTX. */
2664: backup_state_log = mctx->state_log;
2665: backup_cur_idx = mctx->input->cur_idx;
2666: mctx->state_log = path->array;
2667: mctx->input->cur_idx = str_idx;
2668:
2669: /* Setup initial node set. */
2670: context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2671: preg->newline_anchor);
2672: if (str_idx == top_str)
2673: {
2674: err = re_node_set_init_1 (&next_nodes, top_node);
2675: if (BE (err != REG_NOERROR, 0))
2676: return err;
2677: err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, fl_open);
2678: if (BE (err != REG_NOERROR, 0))
2679: {
2680: re_node_set_free (&next_nodes);
2681: return err;
2682: }
2683: }
2684: else
2685: {
2686: cur_state = mctx->state_log[str_idx];
2687: if (cur_state && cur_state->has_backref)
2688: {
2689: err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2690: if (BE ( err != REG_NOERROR, 0))
2691: return err;
2692: }
2693: else
2694: re_node_set_init_empty (&next_nodes);
2695: }
2696: if (str_idx == top_str || (cur_state && cur_state->has_backref))
2697: {
2698: if (next_nodes.nelem)
2699: {
2700: err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2701: subexp_num, fl_open);
2702: if (BE ( err != REG_NOERROR, 0))
2703: {
2704: re_node_set_free (&next_nodes);
2705: return err;
2706: }
2707: }
2708: cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2709: if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2710: {
2711: re_node_set_free (&next_nodes);
2712: return err;
2713: }
2714: mctx->state_log[str_idx] = cur_state;
2715: }
2716:
2717: for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2718: {
2719: re_node_set_empty (&next_nodes);
2720: if (mctx->state_log[str_idx + 1])
2721: {
2722: err = re_node_set_merge (&next_nodes,
2723: &mctx->state_log[str_idx + 1]->nodes);
2724: if (BE (err != REG_NOERROR, 0))
2725: {
2726: re_node_set_free (&next_nodes);
2727: return err;
2728: }
2729: }
2730: if (cur_state)
2731: {
2732: err = check_arrival_add_next_nodes(preg, dfa, mctx, str_idx,
2733: &cur_state->nodes, &next_nodes);
2734: if (BE (err != REG_NOERROR, 0))
2735: {
2736: re_node_set_free (&next_nodes);
2737: return err;
2738: }
2739: }
2740: ++str_idx;
2741: if (next_nodes.nelem)
2742: {
2743: err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num,
2744: fl_open);
2745: if (BE (err != REG_NOERROR, 0))
2746: {
2747: re_node_set_free (&next_nodes);
2748: return err;
2749: }
2750: err = expand_bkref_cache (preg, mctx, &next_nodes, str_idx, last_str,
2751: subexp_num, fl_open);
2752: if (BE ( err != REG_NOERROR, 0))
2753: {
2754: re_node_set_free (&next_nodes);
2755: return err;
2756: }
2757: }
2758: context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
2759: preg->newline_anchor);
2760: cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2761: if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2762: {
2763: re_node_set_free (&next_nodes);
2764: return err;
2765: }
2766: mctx->state_log[str_idx] = cur_state;
2767: null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2768: }
2769: re_node_set_free (&next_nodes);
2770: cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2771: : &mctx->state_log[last_str]->nodes);
2772: path->next_idx = str_idx;
2773:
2774: /* Fix MCTX. */
2775: mctx->state_log = backup_state_log;
2776: mctx->input->cur_idx = backup_cur_idx;
2777:
2778: if (cur_nodes == NULL)
2779: return REG_NOMATCH;
2780: /* Then check the current node set has the node LAST_NODE. */
2781: return (re_node_set_contains (cur_nodes, last_node)
2782: || re_node_set_contains (cur_nodes, last_node) ? REG_NOERROR
2783: : REG_NOMATCH);
2784: }
2785:
2786: /* Helper functions for check_arrival. */
2787:
2788: /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2789: to NEXT_NODES.
2790: TODO: This function is similar to the functions transit_state*(),
2791: however this function has many additional works.
2792: Can't we unify them? */
2793:
2794: static reg_errcode_t
2795: check_arrival_add_next_nodes (preg, dfa, mctx, str_idx, cur_nodes, next_nodes)
2796: const regex_t *preg;
2797: re_dfa_t *dfa;
2798: re_match_context_t *mctx;
2799: int str_idx;
2800: re_node_set *cur_nodes, *next_nodes;
2801: {
2802: int cur_idx;
2803: reg_errcode_t err;
2804: re_node_set union_set;
2805: re_node_set_init_empty (&union_set);
2806: for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2807: {
2808: int naccepted = 0;
2809: int cur_node = cur_nodes->elems[cur_idx];
2810: re_token_type_t type = dfa->nodes[cur_node].type;
2811: if (IS_EPSILON_NODE(type))
2812: continue;
2813: #ifdef RE_ENABLE_I18N
2814: /* If the node may accept `multi byte'. */
2815: if (ACCEPT_MB_NODE (type))
2816: {
2817: naccepted = check_node_accept_bytes (preg, cur_node, mctx->input,
2818: str_idx);
2819: if (naccepted > 1)
2820: {
2821: re_dfastate_t *dest_state;
2822: int next_node = dfa->nexts[cur_node];
2823: int next_idx = str_idx + naccepted;
2824: dest_state = mctx->state_log[next_idx];
2825: re_node_set_empty (&union_set);
2826: if (dest_state)
2827: {
2828: err = re_node_set_merge (&union_set, &dest_state->nodes);
2829: if (BE (err != REG_NOERROR, 0))
2830: {
2831: re_node_set_free (&union_set);
2832: return err;
2833: }
2834: err = re_node_set_insert (&union_set, next_node);
2835: if (BE (err < 0, 0))
2836: {
2837: re_node_set_free (&union_set);
2838: return REG_ESPACE;
2839: }
2840: }
2841: else
2842: {
2843: err = re_node_set_insert (&union_set, next_node);
2844: if (BE (err < 0, 0))
2845: {
2846: re_node_set_free (&union_set);
2847: return REG_ESPACE;
2848: }
2849: }
2850: mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
2851: &union_set);
2852: if (BE (mctx->state_log[next_idx] == NULL
2853: && err != REG_NOERROR, 0))
2854: {
2855: re_node_set_free (&union_set);
2856: return err;
2857: }
2858: }
2859: }
2860: #endif /* RE_ENABLE_I18N */
2861: if (naccepted
2862: || check_node_accept (preg, dfa->nodes + cur_node, mctx,
2863: str_idx))
2864: {
2865: err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
2866: if (BE (err < 0, 0))
2867: {
2868: re_node_set_free (&union_set);
2869: return REG_ESPACE;
2870: }
2871: }
2872: }
2873: re_node_set_free (&union_set);
2874: return REG_NOERROR;
2875: }
2876:
2877: /* For all the nodes in CUR_NODES, add the epsilon closures of them to
2878: CUR_NODES, however exclude the nodes which are:
2879: - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
2880: - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
2881: */
2882:
2883: static reg_errcode_t
2884: check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, fl_open)
2885: re_dfa_t *dfa;
2886: re_node_set *cur_nodes;
2887: int ex_subexp, fl_open;
2888: {
2889: reg_errcode_t err;
2890: int idx, outside_node;
2891: re_node_set new_nodes;
2892: #ifdef DEBUG
2893: assert (cur_nodes->nelem);
2894: #endif
2895: err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
2896: if (BE (err != REG_NOERROR, 0))
2897: return err;
2898: /* Create a new node set NEW_NODES with the nodes which are epsilon
2899: closures of the node in CUR_NODES. */
2900:
2901: for (idx = 0; idx < cur_nodes->nelem; ++idx)
2902: {
2903: int cur_node = cur_nodes->elems[idx];
2904: re_node_set *eclosure = dfa->eclosures + cur_node;
2905: outside_node = find_subexp_node (dfa, eclosure, ex_subexp, fl_open);
2906: if (outside_node == -1)
2907: {
2908: /* There are no problematic nodes, just merge them. */
2909: err = re_node_set_merge (&new_nodes, eclosure);
2910: if (BE (err != REG_NOERROR, 0))
2911: {
2912: re_node_set_free (&new_nodes);
2913: return err;
2914: }
2915: }
2916: else
2917: {
2918: /* There are problematic nodes, re-calculate incrementally. */
2919: err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
2920: ex_subexp, fl_open);
2921: if (BE (err != REG_NOERROR, 0))
2922: {
2923: re_node_set_free (&new_nodes);
2924: return err;
2925: }
2926: }
2927: }
2928: re_node_set_free (cur_nodes);
2929: *cur_nodes = new_nodes;
2930: return REG_NOERROR;
2931: }
2932:
2933: /* Helper function for check_arrival_expand_ecl.
2934: Check incrementally the epsilon closure of TARGET, and if it isn't
2935: problematic append it to DST_NODES. */
2936:
2937: static reg_errcode_t
2938: check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, fl_open)
2939: re_dfa_t *dfa;
2940: int target, ex_subexp, fl_open;
2941: re_node_set *dst_nodes;
2942: {
2943: int cur_node, type;
2944: for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
2945: {
2946: int err;
2947: type = dfa->nodes[cur_node].type;
2948:
2949: if (((type == OP_OPEN_SUBEXP && fl_open)
2950: || (type == OP_CLOSE_SUBEXP && !fl_open))
2951: && dfa->nodes[cur_node].opr.idx == ex_subexp)
2952: {
2953: if (!fl_open)
2954: {
2955: err = re_node_set_insert (dst_nodes, cur_node);
2956: if (BE (err == -1, 0))
2957: return REG_ESPACE;
2958: }
2959: break;
2960: }
2961: err = re_node_set_insert (dst_nodes, cur_node);
2962: if (BE (err == -1, 0))
2963: return REG_ESPACE;
2964: if (dfa->edests[cur_node].nelem == 0)
2965: break;
2966: if (dfa->edests[cur_node].nelem == 2)
2967: {
2968: err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
2969: dfa->edests[cur_node].elems[1],
2970: ex_subexp, fl_open);
2971: if (BE (err != REG_NOERROR, 0))
2972: return err;
2973: }
2974: cur_node = dfa->edests[cur_node].elems[0];
2975: }
2976: return REG_NOERROR;
2977: }
2978:
2979:
2980: /* For all the back references in the current state, calculate the
2981: destination of the back references by the appropriate entry
2982: in MCTX->BKREF_ENTS. */
2983:
2984: static reg_errcode_t
2985: expand_bkref_cache (preg, mctx, cur_nodes, cur_str, last_str, subexp_num,
2986: fl_open)
2987: const regex_t *preg;
2988: re_match_context_t *mctx;
2989: int cur_str, last_str, subexp_num, fl_open;
2990: re_node_set *cur_nodes;
2991: {
2992: reg_errcode_t err;
2993: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2994: int cache_idx, cache_idx_start;
2995: /* The current state. */
2996:
2997: cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
2998: for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
2999: {
3000: int to_idx, next_node;
3001: struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3002: if (ent->str_idx > cur_str)
3003: break;
3004: /* Is this entry ENT is appropriate? */
3005: if (!re_node_set_contains (cur_nodes, ent->node))
3006: continue; /* No. */
3007:
3008: to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3009: /* Calculate the destination of the back reference, and append it
3010: to MCTX->STATE_LOG. */
3011: if (to_idx == cur_str)
3012: {
3013: /* The backreference did epsilon transit, we must re-check all the
3014: node in the current state. */
3015: re_node_set new_dests;
3016: reg_errcode_t err2, err3;
3017: next_node = dfa->edests[ent->node].elems[0];
3018: if (re_node_set_contains (cur_nodes, next_node))
3019: continue;
3020: err = re_node_set_init_1 (&new_dests, next_node);
3021: err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num,
3022: fl_open);
3023: err3 = re_node_set_merge (cur_nodes, &new_dests);
3024: re_node_set_free (&new_dests);
3025: if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3026: || err3 != REG_NOERROR, 0))
3027: {
3028: err = (err != REG_NOERROR ? err
3029: : (err2 != REG_NOERROR ? err2 : err3));
3030: return err;
3031: }
3032: /* TODO: It is still inefficient... */
3033: cache_idx = cache_idx_start - 1;
3034: continue;
3035: }
3036: else
3037: {
3038: re_node_set union_set;
3039: next_node = dfa->nexts[ent->node];
3040: if (mctx->state_log[to_idx])
3041: {
3042: int ret;
3043: if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3044: next_node))
3045: continue;
3046: err = re_node_set_init_copy (&union_set,
3047: &mctx->state_log[to_idx]->nodes);
3048: ret = re_node_set_insert (&union_set, next_node);
3049: if (BE (err != REG_NOERROR || ret < 0, 0))
3050: {
3051: re_node_set_free (&union_set);
3052: err = err != REG_NOERROR ? err : REG_ESPACE;
3053: return err;
3054: }
3055: }
3056: else
3057: {
3058: err = re_node_set_init_1 (&union_set, next_node);
3059: if (BE (err != REG_NOERROR, 0))
3060: return err;
3061: }
3062: mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3063: re_node_set_free (&union_set);
3064: if (BE (mctx->state_log[to_idx] == NULL
3065: && err != REG_NOERROR, 0))
3066: return err;
3067: }
3068: }
3069: return REG_NOERROR;
3070: }
3071:
3072: /* Build transition table for the state.
3073: Return the new table if succeeded, otherwise return NULL. */
3074:
3075: static re_dfastate_t **
3076: build_trtable (preg, state, fl_search)
3077: const regex_t *preg;
3078: const re_dfastate_t *state;
3079: int fl_search;
3080: {
3081: reg_errcode_t err;
3082: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3083: int i, j, k, ch;
3084: int dests_node_malloced = 0, dest_states_malloced = 0;
3085: int ndests; /* Number of the destination states from `state'. */
3086: re_dfastate_t **trtable;
3087: re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3088: re_node_set follows, *dests_node;
3089: bitset *dests_ch;
3090: bitset acceptable;
3091:
3092: /* We build DFA states which corresponds to the destination nodes
3093: from `state'. `dests_node[i]' represents the nodes which i-th
3094: destination state contains, and `dests_ch[i]' represents the
3095: characters which i-th destination state accepts. */
3096: #ifdef _LIBC
3097: if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3098: dests_node = (re_node_set *)
3099: alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3100: else
3101: #endif
3102: {
3103: dests_node = (re_node_set *)
3104: malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3105: if (BE (dests_node == NULL, 0))
3106: return NULL;
3107: dests_node_malloced = 1;
3108: }
3109: dests_ch = (bitset *) (dests_node + SBC_MAX);
3110:
3111: /* Initialize transiton table. */
3112: trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3113: if (BE (trtable == NULL, 0))
3114: {
3115: if (dests_node_malloced)
3116: free (dests_node);
3117: return NULL;
3118: }
3119:
3120: /* At first, group all nodes belonging to `state' into several
3121: destinations. */
3122: ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
3123: if (BE (ndests <= 0, 0))
3124: {
3125: if (dests_node_malloced)
3126: free (dests_node);
3127: /* Return NULL in case of an error, trtable otherwise. */
3128: if (ndests == 0)
3129: return trtable;
3130: free (trtable);
3131: return NULL;
3132: }
3133:
3134: err = re_node_set_alloc (&follows, ndests + 1);
3135: if (BE (err != REG_NOERROR, 0))
3136: goto out_free;
3137:
3138: #ifdef _LIBC
3139: if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3140: + ndests * 3 * sizeof (re_dfastate_t *)))
3141: dest_states = (re_dfastate_t **)
3142: alloca (ndests * 3 * sizeof (re_dfastate_t *));
3143: else
3144: #endif
3145: {
3146: dest_states = (re_dfastate_t **)
3147: malloc (ndests * 3 * sizeof (re_dfastate_t *));
3148: if (BE (dest_states == NULL, 0))
3149: {
3150: out_free:
3151: if (dest_states_malloced)
3152: free (dest_states);
3153: re_node_set_free (&follows);
3154: for (i = 0; i < ndests; ++i)
3155: re_node_set_free (dests_node + i);
3156: free (trtable);
3157: if (dests_node_malloced)
3158: free (dests_node);
3159: return NULL;
3160: }
3161: dest_states_malloced = 1;
3162: }
3163: dest_states_word = dest_states + ndests;
3164: dest_states_nl = dest_states_word + ndests;
3165: bitset_empty (acceptable);
3166:
3167: /* Then build the states for all destinations. */
3168: for (i = 0; i < ndests; ++i)
3169: {
3170: int next_node;
3171: re_node_set_empty (&follows);
3172: /* Merge the follows of this destination states. */
3173: for (j = 0; j < dests_node[i].nelem; ++j)
3174: {
3175: next_node = dfa->nexts[dests_node[i].elems[j]];
3176: if (next_node != -1)
3177: {
3178: err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3179: if (BE (err != REG_NOERROR, 0))
3180: goto out_free;
3181: }
3182: }
3183: /* If search flag is set, merge the initial state. */
3184: if (fl_search)
3185: {
3186: #ifdef RE_ENABLE_I18N
3187: int not_initial = 0;
3188: for (j = 0; j < follows.nelem; ++j)
3189: if (dfa->nodes[follows.elems[j]].type == CHARACTER)
3190: {
3191: not_initial = dfa->nodes[follows.elems[j]].mb_partial;
3192: break;
3193: }
3194: if (!not_initial)
3195: #endif
3196: {
3197: err = re_node_set_merge (&follows,
3198: dfa->init_state->entrance_nodes);
3199: if (BE (err != REG_NOERROR, 0))
3200: goto out_free;
3201: }
3202: }
3203: dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3204: if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3205: goto out_free;
3206: /* If the new state has context constraint,
3207: build appropriate states for these contexts. */
3208: if (dest_states[i]->has_constraint)
3209: {
3210: dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3211: CONTEXT_WORD);
3212: if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3213: goto out_free;
3214: dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3215: CONTEXT_NEWLINE);
3216: if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3217: goto out_free;
3218: }
3219: else
3220: {
3221: dest_states_word[i] = dest_states[i];
3222: dest_states_nl[i] = dest_states[i];
3223: }
3224: bitset_merge (acceptable, dests_ch[i]);
3225: }
3226:
3227: /* Update the transition table. */
3228: /* For all characters ch...: */
3229: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
3230: for (j = 0; j < UINT_BITS; ++j, ++ch)
3231: if ((acceptable[i] >> j) & 1)
3232: {
3233: /* The current state accepts the character ch. */
3234: if (IS_WORD_CHAR (ch))
3235: {
3236: for (k = 0; k < ndests; ++k)
3237: if ((dests_ch[k][i] >> j) & 1)
3238: {
3239: /* k-th destination accepts the word character ch. */
3240: trtable[ch] = dest_states_word[k];
3241: /* There must be only one destination which accepts
3242: character ch. See group_nodes_into_DFAstates. */
3243: break;
3244: }
3245: }
3246: else /* not WORD_CHAR */
3247: {
3248: for (k = 0; k < ndests; ++k)
3249: if ((dests_ch[k][i] >> j) & 1)
3250: {
3251: /* k-th destination accepts the non-word character ch. */
3252: trtable[ch] = dest_states[k];
3253: /* There must be only one destination which accepts
3254: character ch. See group_nodes_into_DFAstates. */
3255: break;
3256: }
3257: }
3258: }
3259: /* new line */
3260: if (bitset_contain (acceptable, NEWLINE_CHAR))
3261: {
3262: /* The current state accepts newline character. */
3263: for (k = 0; k < ndests; ++k)
3264: if (bitset_contain (dests_ch[k], NEWLINE_CHAR))
3265: {
3266: /* k-th destination accepts newline character. */
3267: trtable[NEWLINE_CHAR] = dest_states_nl[k];
3268: /* There must be only one destination which accepts
3269: newline. See group_nodes_into_DFAstates. */
3270: break;
3271: }
3272: }
3273:
3274: if (dest_states_malloced)
3275: free (dest_states);
3276:
3277: re_node_set_free (&follows);
3278: for (i = 0; i < ndests; ++i)
3279: re_node_set_free (dests_node + i);
3280:
3281: if (dests_node_malloced)
3282: free (dests_node);
3283:
3284: return trtable;
3285: }
3286:
3287: /* Group all nodes belonging to STATE into several destinations.
3288: Then for all destinations, set the nodes belonging to the destination
3289: to DESTS_NODE[i] and set the characters accepted by the destination
3290: to DEST_CH[i]. This function return the number of destinations. */
3291:
3292: static int
3293: group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
3294: const regex_t *preg;
3295: const re_dfastate_t *state;
3296: re_node_set *dests_node;
3297: bitset *dests_ch;
3298: {
3299: reg_errcode_t err;
3300: const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3301: int i, j, k;
3302: int ndests; /* Number of the destinations from `state'. */
3303: bitset accepts; /* Characters a node can accept. */
3304: const re_node_set *cur_nodes = &state->nodes;
3305: bitset_empty (accepts);
3306: ndests = 0;
3307:
3308: /* For all the nodes belonging to `state', */
3309: for (i = 0; i < cur_nodes->nelem; ++i)
3310: {
3311: re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3312: re_token_type_t type = node->type;
3313: unsigned int constraint = node->constraint;
3314:
3315: /* Enumerate all single byte character this node can accept. */
3316: if (type == CHARACTER)
3317: bitset_set (accepts, node->opr.c);
3318: else if (type == SIMPLE_BRACKET)
3319: {
3320: bitset_merge (accepts, node->opr.sbcset);
3321: }
3322: else if (type == OP_PERIOD)
3323: {
3324: bitset_set_all (accepts);
3325: if (!(preg->syntax & RE_DOT_NEWLINE))
3326: bitset_clear (accepts, '\n');
3327: if (preg->syntax & RE_DOT_NOT_NULL)
3328: bitset_clear (accepts, '\0');
3329: }
3330: else
3331: continue;
3332:
3333: /* Check the `accepts' and sift the characters which are not
3334: match it the context. */
3335: if (constraint)
3336: {
3337: if (constraint & NEXT_WORD_CONSTRAINT)
3338: for (j = 0; j < BITSET_UINTS; ++j)
3339: accepts[j] &= dfa->word_char[j];
3340: if (constraint & NEXT_NOTWORD_CONSTRAINT)
3341: for (j = 0; j < BITSET_UINTS; ++j)
3342: accepts[j] &= ~dfa->word_char[j];
3343: if (constraint & NEXT_NEWLINE_CONSTRAINT)
3344: {
3345: int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3346: bitset_empty (accepts);
3347: if (accepts_newline)
3348: bitset_set (accepts, NEWLINE_CHAR);
3349: else
3350: continue;
3351: }
3352: }
3353:
3354: /* Then divide `accepts' into DFA states, or create a new
3355: state. */
3356: for (j = 0; j < ndests; ++j)
3357: {
3358: bitset intersec; /* Intersection sets, see below. */
3359: bitset remains;
3360: /* Flags, see below. */
3361: int has_intersec, not_subset, not_consumed;
3362:
3363: /* Optimization, skip if this state doesn't accept the character. */
3364: if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3365: continue;
3366:
3367: /* Enumerate the intersection set of this state and `accepts'. */
3368: has_intersec = 0;
3369: for (k = 0; k < BITSET_UINTS; ++k)
3370: has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3371: /* And skip if the intersection set is empty. */
3372: if (!has_intersec)
3373: continue;
3374:
3375: /* Then check if this state is a subset of `accepts'. */
3376: not_subset = not_consumed = 0;
3377: for (k = 0; k < BITSET_UINTS; ++k)
3378: {
3379: not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3380: not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3381: }
3382:
3383: /* If this state isn't a subset of `accepts', create a
3384: new group state, which has the `remains'. */
3385: if (not_subset)
3386: {
3387: bitset_copy (dests_ch[ndests], remains);
3388: bitset_copy (dests_ch[j], intersec);
3389: err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3390: if (BE (err != REG_NOERROR, 0))
3391: goto error_return;
3392: ++ndests;
3393: }
3394:
3395: /* Put the position in the current group. */
3396: err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3397: if (BE (err < 0, 0))
3398: goto error_return;
3399:
3400: /* If all characters are consumed, go to next node. */
3401: if (!not_consumed)
3402: break;
3403: }
3404: /* Some characters remain, create a new group. */
3405: if (j == ndests)
3406: {
3407: bitset_copy (dests_ch[ndests], accepts);
3408: err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3409: if (BE (err != REG_NOERROR, 0))
3410: goto error_return;
3411: ++ndests;
3412: bitset_empty (accepts);
3413: }
3414: }
3415: return ndests;
3416: error_return:
3417: for (j = 0; j < ndests; ++j)
3418: re_node_set_free (dests_node + j);
3419: return -1;
3420: }
3421:
3422: #ifdef RE_ENABLE_I18N
3423: /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3424: Return the number of the bytes the node accepts.
3425: STR_IDX is the current index of the input string.
3426:
3427: This function handles the nodes which can accept one character, or
3428: one collating element like '.', '[a-z]', opposite to the other nodes
3429: can only accept one byte. */
3430:
3431: static int
3432: check_node_accept_bytes (preg, node_idx, input, str_idx)
3433: const regex_t *preg;
3434: int node_idx, str_idx;
3435: const re_string_t *input;
3436: {
3437: const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
3438: const re_token_t *node = dfa->nodes + node_idx;
3439: int elem_len = re_string_elem_size_at (input, str_idx);
3440: int char_len = re_string_char_size_at (input, str_idx);
3441: int i;
3442: # ifdef _LIBC
3443: int j;
3444: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3445: # endif /* _LIBC */
3446: if (elem_len <= 1 && char_len <= 1)
3447: return 0;
3448: if (node->type == OP_PERIOD)
3449: {
3450: /* '.' accepts any one character except the following two cases. */
3451: if ((!(preg->syntax & RE_DOT_NEWLINE) &&
3452: re_string_byte_at (input, str_idx) == '\n') ||
3453: ((preg->syntax & RE_DOT_NOT_NULL) &&
3454: re_string_byte_at (input, str_idx) == '\0'))
3455: return 0;
3456: return char_len;
3457: }
3458: else if (node->type == COMPLEX_BRACKET)
3459: {
3460: const re_charset_t *cset = node->opr.mbcset;
3461: # ifdef _LIBC
3462: const unsigned char *pin = ((char *) re_string_get_buffer (input)
3463: + str_idx);
3464: # endif /* _LIBC */
3465: int match_len = 0;
3466: wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3467: ? re_string_wchar_at (input, str_idx) : 0);
3468:
3469: /* match with multibyte character? */
3470: for (i = 0; i < cset->nmbchars; ++i)
3471: if (wc == cset->mbchars[i])
3472: {
3473: match_len = char_len;
3474: goto check_node_accept_bytes_match;
3475: }
3476: /* match with character_class? */
3477: for (i = 0; i < cset->nchar_classes; ++i)
3478: {
3479: wctype_t wt = cset->char_classes[i];
3480: if (__iswctype (wc, wt))
3481: {
3482: match_len = char_len;
3483: goto check_node_accept_bytes_match;
3484: }
3485: }
3486:
3487: # ifdef _LIBC
3488: if (nrules != 0)
3489: {
3490: unsigned int in_collseq = 0;
3491: const int32_t *table, *indirect;
3492: const unsigned char *weights, *extra;
3493: const char *collseqwc;
3494: int32_t idx;
3495: /* This #include defines a local function! */
3496: # include <locale/weight.h>
3497:
3498: /* match with collating_symbol? */
3499: if (cset->ncoll_syms)
3500: extra = (const unsigned char *)
3501: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3502: for (i = 0; i < cset->ncoll_syms; ++i)
3503: {
3504: const unsigned char *coll_sym = extra + cset->coll_syms[i];
3505: /* Compare the length of input collating element and
3506: the length of current collating element. */
3507: if (*coll_sym != elem_len)
3508: continue;
3509: /* Compare each bytes. */
3510: for (j = 0; j < *coll_sym; j++)
3511: if (pin[j] != coll_sym[1 + j])
3512: break;
3513: if (j == *coll_sym)
3514: {
3515: /* Match if every bytes is equal. */
3516: match_len = j;
3517: goto check_node_accept_bytes_match;
3518: }
3519: }
3520:
3521: if (cset->nranges)
3522: {
3523: if (elem_len <= char_len)
3524: {
3525: collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3526: in_collseq = collseq_table_lookup (collseqwc, wc);
3527: }
3528: else
3529: in_collseq = find_collation_sequence_value (pin, elem_len);
3530: }
3531: /* match with range expression? */
3532: for (i = 0; i < cset->nranges; ++i)
3533: if (cset->range_starts[i] <= in_collseq
3534: && in_collseq <= cset->range_ends[i])
3535: {
3536: match_len = elem_len;
3537: goto check_node_accept_bytes_match;
3538: }
3539:
3540: /* match with equivalence_class? */
3541: if (cset->nequiv_classes)
3542: {
3543: const unsigned char *cp = pin;
3544: table = (const int32_t *)
3545: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3546: weights = (const unsigned char *)
3547: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3548: extra = (const unsigned char *)
3549: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3550: indirect = (const int32_t *)
3551: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3552: idx = findidx (&cp);
3553: if (idx > 0)
3554: for (i = 0; i < cset->nequiv_classes; ++i)
3555: {
3556: int32_t equiv_class_idx = cset->equiv_classes[i];
3557: size_t weight_len = weights[idx];
3558: if (weight_len == weights[equiv_class_idx])
3559: {
3560: int cnt = 0;
3561: while (cnt <= weight_len
3562: && (weights[equiv_class_idx + 1 + cnt]
3563: == weights[idx + 1 + cnt]))
3564: ++cnt;
3565: if (cnt > weight_len)
3566: {
3567: match_len = elem_len;
3568: goto check_node_accept_bytes_match;
3569: }
3570: }
3571: }
3572: }
3573: }
3574: else
3575: # endif /* _LIBC */
3576: {
3577: /* match with range expression? */
3578: #if __GNUC__ >= 2
3579: wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3580: #else
3581: wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3582: cmp_buf[2] = wc;
3583: #endif
3584: for (i = 0; i < cset->nranges; ++i)
3585: {
3586: cmp_buf[0] = cset->range_starts[i];
3587: cmp_buf[4] = cset->range_ends[i];
3588: if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3589: && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3590: {
3591: match_len = char_len;
3592: goto check_node_accept_bytes_match;
3593: }
3594: }
3595: }
3596: check_node_accept_bytes_match:
3597: if (!cset->non_match)
3598: return match_len;
3599: else
3600: {
3601: if (match_len > 0)
3602: return 0;
3603: else
3604: return (elem_len > char_len) ? elem_len : char_len;
3605: }
3606: }
3607: return 0;
3608: }
3609:
3610: # ifdef _LIBC
3611: static unsigned int
3612: find_collation_sequence_value (mbs, mbs_len)
3613: const unsigned char *mbs;
3614: size_t mbs_len;
3615: {
3616: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3617: if (nrules == 0)
3618: {
3619: if (mbs_len == 1)
3620: {
3621: /* No valid character. Match it as a single byte character. */
3622: const unsigned char *collseq = (const unsigned char *)
3623: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3624: return collseq[mbs[0]];
3625: }
3626: return UINT_MAX;
3627: }
3628: else
3629: {
3630: int32_t idx;
3631: const unsigned char *extra = (const unsigned char *)
3632: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3633:
3634: for (idx = 0; ;)
3635: {
3636: int mbs_cnt, found = 0;
3637: int32_t elem_mbs_len;
3638: /* Skip the name of collating element name. */
3639: idx = idx + extra[idx] + 1;
3640: elem_mbs_len = extra[idx++];
3641: if (mbs_len == elem_mbs_len)
3642: {
3643: for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3644: if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3645: break;
3646: if (mbs_cnt == elem_mbs_len)
3647: /* Found the entry. */
3648: found = 1;
3649: }
3650: /* Skip the byte sequence of the collating element. */
3651: idx += elem_mbs_len;
3652: /* Adjust for the alignment. */
3653: idx = (idx + 3) & ~3;
3654: /* Skip the collation sequence value. */
3655: idx += sizeof (uint32_t);
3656: /* Skip the wide char sequence of the collating element. */
3657: idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3658: /* If we found the entry, return the sequence value. */
3659: if (found)
3660: return *(uint32_t *) (extra + idx);
3661: /* Skip the collation sequence value. */
3662: idx += sizeof (uint32_t);
3663: }
3664: }
3665: }
3666: # endif /* _LIBC */
3667: #endif /* RE_ENABLE_I18N */
3668:
3669: /* Check whether the node accepts the byte which is IDX-th
3670: byte of the INPUT. */
3671:
3672: static int
3673: check_node_accept (preg, node, mctx, idx)
3674: const regex_t *preg;
3675: const re_token_t *node;
3676: const re_match_context_t *mctx;
3677: int idx;
3678: {
3679: unsigned char ch;
3680: if (node->constraint)
3681: {
3682: /* The node has constraints. Check whether the current context
3683: satisfies the constraints. */
3684: unsigned int context = re_string_context_at (mctx->input, idx,
3685: mctx->eflags,
3686: preg->newline_anchor);
3687: if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3688: return 0;
3689: }
3690: ch = re_string_byte_at (mctx->input, idx);
3691: if (node->type == CHARACTER)
3692: return node->opr.c == ch;
3693: else if (node->type == SIMPLE_BRACKET)
3694: return bitset_contain (node->opr.sbcset, ch);
3695: else if (node->type == OP_PERIOD)
3696: return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
3697: || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
3698: else
3699: return 0;
3700: }
3701:
3702: /* Extend the buffers, if the buffers have run out. */
3703:
3704: static reg_errcode_t
3705: extend_buffers (mctx)
3706: re_match_context_t *mctx;
3707: {
3708: reg_errcode_t ret;
3709: re_string_t *pstr = mctx->input;
3710:
3711: /* Double the lengthes of the buffers. */
3712: ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3713: if (BE (ret != REG_NOERROR, 0))
3714: return ret;
3715:
3716: if (mctx->state_log != NULL)
3717: {
3718: /* And double the length of state_log. */
3719: re_dfastate_t **new_array;
3720: new_array = re_realloc (mctx->state_log, re_dfastate_t *,
3721: pstr->bufs_len * 2);
3722: if (BE (new_array == NULL, 0))
3723: return REG_ESPACE;
3724: mctx->state_log = new_array;
3725: }
3726:
3727: /* Then reconstruct the buffers. */
3728: if (pstr->icase)
3729: {
3730: #ifdef RE_ENABLE_I18N
3731: if (MB_CUR_MAX > 1)
3732: build_wcs_upper_buffer (pstr);
3733: else
3734: #endif /* RE_ENABLE_I18N */
3735: build_upper_buffer (pstr);
3736: }
3737: else
3738: {
3739: #ifdef RE_ENABLE_I18N
3740: if (MB_CUR_MAX > 1)
3741: build_wcs_buffer (pstr);
3742: else
3743: #endif /* RE_ENABLE_I18N */
3744: {
3745: if (pstr->trans != NULL)
3746: re_string_translate_buffer (pstr);
3747: else
3748: pstr->valid_len = pstr->bufs_len;
3749: }
3750: }
3751: return REG_NOERROR;
3752: }
3753:
3754:
3755: /* Functions for matching context. */
3756:
3757: /* Initialize MCTX. */
3758:
3759: static reg_errcode_t
3760: match_ctx_init (mctx, eflags, input, n)
3761: re_match_context_t *mctx;
3762: int eflags, n;
3763: re_string_t *input;
3764: {
3765: mctx->eflags = eflags;
3766: mctx->input = input;
3767: mctx->match_last = -1;
3768: if (n > 0)
3769: {
3770: mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
3771: mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
3772: if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
3773: return REG_ESPACE;
3774: }
3775: else
3776: mctx->bkref_ents = NULL;
3777: mctx->nbkref_ents = 0;
3778: mctx->abkref_ents = n;
3779: mctx->max_mb_elem_len = 1;
3780: mctx->nsub_tops = 0;
3781: mctx->asub_tops = n;
3782: return REG_NOERROR;
3783: }
3784:
3785: /* Clean the entries which depend on the current input in MCTX.
3786: This function must be invoked when the matcher changes the start index
3787: of the input, or changes the input string. */
3788:
3789: static void
3790: match_ctx_clean (mctx)
3791: re_match_context_t *mctx;
3792: {
3793: match_ctx_free_subtops (mctx);
3794: mctx->nsub_tops = 0;
3795: mctx->nbkref_ents = 0;
3796: }
3797:
3798: /* Free all the memory associated with MCTX. */
3799:
3800: static void
3801: match_ctx_free (mctx)
3802: re_match_context_t *mctx;
3803: {
3804: match_ctx_free_subtops (mctx);
3805: re_free (mctx->sub_tops);
3806: re_free (mctx->bkref_ents);
3807: }
3808:
3809: /* Free all the memory associated with MCTX->SUB_TOPS. */
3810:
3811: static void
3812: match_ctx_free_subtops (mctx)
3813: re_match_context_t *mctx;
3814: {
3815: int st_idx;
3816: for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
3817: {
3818: int sl_idx;
3819: re_sub_match_top_t *top = mctx->sub_tops[st_idx];
3820: for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
3821: {
3822: re_sub_match_last_t *last = top->lasts[sl_idx];
3823: re_free (last->path.array);
3824: re_free (last);
3825: }
3826: re_free (top->lasts);
3827: if (top->path)
3828: {
3829: re_free (top->path->array);
3830: re_free (top->path);
3831: }
3832: free (top);
3833: }
3834: }
3835:
3836: /* Add a new backreference entry to MCTX.
3837: Note that we assume that caller never call this function with duplicate
3838: entry, and call with STR_IDX which isn't smaller than any existing entry.
3839: */
3840:
3841: static reg_errcode_t
3842: match_ctx_add_entry (mctx, node, str_idx, from, to)
3843: re_match_context_t *mctx;
3844: int node, str_idx, from, to;
3845: {
3846: if (mctx->nbkref_ents >= mctx->abkref_ents)
3847: {
3848: struct re_backref_cache_entry* new_entry;
3849: new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
3850: mctx->abkref_ents * 2);
3851: if (BE (new_entry == NULL, 0))
3852: {
3853: re_free (mctx->bkref_ents);
3854: return REG_ESPACE;
3855: }
3856: mctx->bkref_ents = new_entry;
3857: memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
3858: sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
3859: mctx->abkref_ents *= 2;
3860: }
3861: mctx->bkref_ents[mctx->nbkref_ents].node = node;
3862: mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
3863: mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
3864: mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
3865: mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
3866: if (mctx->max_mb_elem_len < to - from)
3867: mctx->max_mb_elem_len = to - from;
3868: return REG_NOERROR;
3869: }
3870:
3871: /* Search for the first entry which has the same str_idx.
3872: Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX. */
3873:
3874: static int
3875: search_cur_bkref_entry (mctx, str_idx)
3876: re_match_context_t *mctx;
3877: int str_idx;
3878: {
3879: int left, right, mid;
3880: right = mctx->nbkref_ents;
3881: for (left = 0; left < right;)
3882: {
3883: mid = (left + right) / 2;
3884: if (mctx->bkref_ents[mid].str_idx < str_idx)
3885: left = mid + 1;
3886: else
3887: right = mid;
3888: }
3889: return left;
3890: }
3891:
3892: static void
3893: match_ctx_clear_flag (mctx)
3894: re_match_context_t *mctx;
3895: {
3896: int i;
3897: for (i = 0; i < mctx->nbkref_ents; ++i)
3898: {
3899: mctx->bkref_ents[i].flag = 0;
3900: }
3901: }
3902:
3903: /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
3904: at STR_IDX. */
3905:
3906: static reg_errcode_t
3907: match_ctx_add_subtop (mctx, node, str_idx)
3908: re_match_context_t *mctx;
3909: int node, str_idx;
3910: {
3911: #ifdef DEBUG
3912: assert (mctx->sub_tops != NULL);
3913: assert (mctx->asub_tops > 0);
3914: #endif
3915: if (mctx->nsub_tops == mctx->asub_tops)
3916: {
3917: re_sub_match_top_t **new_array;
3918: mctx->asub_tops *= 2;
3919: new_array = re_realloc (mctx->sub_tops, re_sub_match_top_t *,
3920: mctx->asub_tops);
3921: if (BE (new_array == NULL, 0))
3922: return REG_ESPACE;
3923: mctx->sub_tops = new_array;
3924: }
3925: mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
3926: if (mctx->sub_tops[mctx->nsub_tops] == NULL)
3927: return REG_ESPACE;
3928: mctx->sub_tops[mctx->nsub_tops]->node = node;
3929: mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
3930: return REG_NOERROR;
3931: }
3932:
3933: /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
3934: at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP. */
3935:
3936: static re_sub_match_last_t *
3937: match_ctx_add_sublast (subtop, node, str_idx)
3938: re_sub_match_top_t *subtop;
3939: int node, str_idx;
3940: {
3941: re_sub_match_last_t *new_entry;
3942: if (subtop->nlasts == subtop->alasts)
3943: {
3944: re_sub_match_last_t **new_array;
3945: subtop->alasts = 2 * subtop->alasts + 1;
3946: new_array = re_realloc (subtop->lasts, re_sub_match_last_t *,
3947: subtop->alasts);
3948: if (BE (new_array == NULL, 0))
3949: return NULL;
3950: subtop->lasts = new_array;
3951: }
3952: new_entry = calloc (1, sizeof (re_sub_match_last_t));
3953: if (BE (new_entry == NULL, 0))
3954: return NULL;
3955: subtop->lasts[subtop->nlasts] = new_entry;
3956: new_entry->node = node;
3957: new_entry->str_idx = str_idx;
3958: ++subtop->nlasts;
3959: return new_entry;
3960: }
3961:
3962: static void
3963: sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
3964: check_subexp)
3965: re_sift_context_t *sctx;
3966: re_dfastate_t **sifted_sts, **limited_sts;
3967: int last_node, last_str_idx, check_subexp;
3968: {
3969: sctx->sifted_states = sifted_sts;
3970: sctx->limited_states = limited_sts;
3971: sctx->last_node = last_node;
3972: sctx->last_str_idx = last_str_idx;
3973: sctx->check_subexp = check_subexp;
3974: sctx->cur_bkref = -1;
3975: sctx->cls_subexp_idx = -1;
3976: re_node_set_init_empty (&sctx->limits);
3977: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>