Annotation of embedaddon/smartmontools/regex/regcomp.c, revision 1.1.1.1
1.1 misho 1: /* Extended regular expression matching and search library.
2: Copyright (C) 2002, 2003 Free Software Foundation, Inc.
3: This file is part of the GNU C Library.
4: Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5:
6: The GNU C Library is free software; you can redistribute it and/or
7: modify it under the terms of the GNU Lesser General Public
8: License as published by the Free Software Foundation; either
9: version 2.1 of the License, or (at your option) any later version.
10:
11: The GNU C Library is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14: Lesser General Public License for more details.
15:
16: You should have received a copy of the GNU Lesser General Public
17: License along with the GNU C Library; if not, write to the Free
18: Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19: 02111-1307 USA. */
20:
21: static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22: int length, reg_syntax_t syntax);
23: static void re_compile_fastmap_iter (regex_t *bufp,
24: const re_dfastate_t *init_state,
25: char *fastmap);
26: static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27: static reg_errcode_t init_word_char (re_dfa_t *dfa);
28: #ifdef RE_ENABLE_I18N
29: static void free_charset (re_charset_t *cset);
30: #endif /* RE_ENABLE_I18N */
31: static void free_workarea_compile (regex_t *preg);
32: static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33: static reg_errcode_t analyze (re_dfa_t *dfa);
34: static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
35: static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
36: static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
37: static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
38: static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
39: int top_clone_node, int root_node,
40: unsigned int constraint);
41: static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
42: unsigned int constraint);
43: static int search_duplicated_node (re_dfa_t *dfa, int org_node,
44: unsigned int constraint);
45: static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
46: static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
47: int node, int root);
48: static void calc_inveclosure (re_dfa_t *dfa);
49: static int fetch_number (re_string_t *input, re_token_t *token,
50: reg_syntax_t syntax);
51: static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
52: static int peek_token (re_token_t *token, re_string_t *input,
53: reg_syntax_t syntax);
54: static int peek_token_bracket (re_token_t *token, re_string_t *input,
55: reg_syntax_t syntax);
56: static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
57: reg_syntax_t syntax, reg_errcode_t *err);
58: static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
59: re_token_t *token, reg_syntax_t syntax,
60: int nest, reg_errcode_t *err);
61: static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
62: re_token_t *token, reg_syntax_t syntax,
63: int nest, reg_errcode_t *err);
64: static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
65: re_token_t *token, reg_syntax_t syntax,
66: int nest, reg_errcode_t *err);
67: static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
68: re_token_t *token, reg_syntax_t syntax,
69: int nest, reg_errcode_t *err);
70: static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
71: re_dfa_t *dfa, re_token_t *token,
72: reg_syntax_t syntax, reg_errcode_t *err);
73: static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
74: re_token_t *token, reg_syntax_t syntax,
75: reg_errcode_t *err);
76: static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
77: re_string_t *regexp,
78: re_token_t *token, int token_len,
79: re_dfa_t *dfa,
80: reg_syntax_t syntax);
81: static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
82: re_string_t *regexp,
83: re_token_t *token);
84: #ifndef _LIBC
85: # ifdef RE_ENABLE_I18N
86: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
87: re_charset_t *mbcset, int *range_alloc,
88: bracket_elem_t *start_elem,
89: bracket_elem_t *end_elem);
90: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
91: re_charset_t *mbcset,
92: int *coll_sym_alloc,
93: const unsigned char *name);
94: # else /* not RE_ENABLE_I18N */
95: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
96: bracket_elem_t *start_elem,
97: bracket_elem_t *end_elem);
98: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
99: const unsigned char *name);
100: # endif /* not RE_ENABLE_I18N */
101: #endif /* not _LIBC */
102: #ifdef RE_ENABLE_I18N
103: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
104: re_charset_t *mbcset,
105: int *equiv_class_alloc,
106: const unsigned char *name);
107: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
108: re_charset_t *mbcset,
109: int *char_class_alloc,
110: const unsigned char *class_name,
111: reg_syntax_t syntax);
112: #else /* not RE_ENABLE_I18N */
113: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
114: const unsigned char *name);
115: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
116: const unsigned char *class_name,
117: reg_syntax_t syntax);
118: #endif /* not RE_ENABLE_I18N */
119: static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
120: static void free_bin_tree (bin_tree_t *tree);
121: static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
122: re_token_type_t type, int index);
123: static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
124:
125: /* This table gives an error message for each of the error codes listed
126: in regex.h. Obviously the order here has to be same as there.
127: POSIX doesn't require that we do anything for REG_NOERROR,
128: but why not be nice? */
129:
130: const char __re_error_msgid[] attribute_hidden =
131: {
132: #define REG_NOERROR_IDX 0
133: gettext_noop ("Success") /* REG_NOERROR */
134: "\0"
135: #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136: gettext_noop ("No match") /* REG_NOMATCH */
137: "\0"
138: #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139: gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140: "\0"
141: #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142: gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143: "\0"
144: #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145: gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146: "\0"
147: #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148: gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149: "\0"
150: #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151: gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152: "\0"
153: #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154: gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155: "\0"
156: #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157: gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158: "\0"
159: #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160: gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161: "\0"
162: #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163: gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164: "\0"
165: #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166: gettext_noop ("Invalid range end") /* REG_ERANGE */
167: "\0"
168: #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169: gettext_noop ("Memory exhausted") /* REG_ESPACE */
170: "\0"
171: #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172: gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173: "\0"
174: #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175: gettext_noop ("Premature end of regular expression") /* REG_EEND */
176: "\0"
177: #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178: gettext_noop ("Regular expression too big") /* REG_ESIZE */
179: "\0"
180: #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181: gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182: };
183:
184: const size_t __re_error_msgid_idx[] attribute_hidden =
185: {
186: REG_NOERROR_IDX,
187: REG_NOMATCH_IDX,
188: REG_BADPAT_IDX,
189: REG_ECOLLATE_IDX,
190: REG_ECTYPE_IDX,
191: REG_EESCAPE_IDX,
192: REG_ESUBREG_IDX,
193: REG_EBRACK_IDX,
194: REG_EPAREN_IDX,
195: REG_EBRACE_IDX,
196: REG_BADBR_IDX,
197: REG_ERANGE_IDX,
198: REG_ESPACE_IDX,
199: REG_BADRPT_IDX,
200: REG_EEND_IDX,
201: REG_ESIZE_IDX,
202: REG_ERPAREN_IDX
203: };
204:
205: /* Entry points for GNU code. */
206:
207: /* re_compile_pattern is the GNU regular expression compiler: it
208: compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209: Returns 0 if the pattern was valid, otherwise an error string.
210:
211: Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212: are set in BUFP on entry. */
213:
214: const char *
215: re_compile_pattern (pattern, length, bufp)
216: const char *pattern;
217: size_t length;
218: struct re_pattern_buffer *bufp;
219: {
220: reg_errcode_t ret;
221:
222: /* And GNU code determines whether or not to get register information
223: by passing null for the REGS argument to re_match, etc., not by
224: setting no_sub. */
225: bufp->no_sub = 0;
226:
227: /* Match anchors at newline. */
228: bufp->newline_anchor = 1;
229:
230: ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231:
232: if (!ret)
233: return NULL;
234: return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235: }
236: #ifdef _LIBC
237: weak_alias (__re_compile_pattern, re_compile_pattern)
238: #endif
239:
240: /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241: also be assigned to arbitrarily: each pattern buffer stores its own
242: syntax, so it can be changed between regex compilations. */
243: /* This has no initializer because initialized variables in Emacs
244: become read-only after dumping. */
245: reg_syntax_t re_syntax_options;
246:
247:
248: /* Specify the precise syntax of regexps for compilation. This provides
249: for compatibility for various utilities which historically have
250: different, incompatible syntaxes.
251:
252: The argument SYNTAX is a bit mask comprised of the various bits
253: defined in regex.h. We return the old syntax. */
254:
255: reg_syntax_t
256: re_set_syntax (syntax)
257: reg_syntax_t syntax;
258: {
259: reg_syntax_t ret = re_syntax_options;
260:
261: re_syntax_options = syntax;
262: return ret;
263: }
264: #ifdef _LIBC
265: weak_alias (__re_set_syntax, re_set_syntax)
266: #endif
267:
268: int
269: re_compile_fastmap (bufp)
270: struct re_pattern_buffer *bufp;
271: {
272: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273: char *fastmap = bufp->fastmap;
274:
275: memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276: re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277: if (dfa->init_state != dfa->init_state_word)
278: re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279: if (dfa->init_state != dfa->init_state_nl)
280: re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281: if (dfa->init_state != dfa->init_state_begbuf)
282: re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283: bufp->fastmap_accurate = 1;
284: return 0;
285: }
286: #ifdef _LIBC
287: weak_alias (__re_compile_fastmap, re_compile_fastmap)
288: #endif
289:
290: static inline void
291: re_set_fastmap (char *fastmap, int icase, int ch)
292: {
293: fastmap[ch] = 1;
294: if (icase)
295: fastmap[tolower (ch)] = 1;
296: }
297:
298: /* Helper function for re_compile_fastmap.
299: Compile fastmap for the initial_state INIT_STATE. */
300:
301: static void
302: re_compile_fastmap_iter (bufp, init_state, fastmap)
303: regex_t *bufp;
304: const re_dfastate_t *init_state;
305: char *fastmap;
306: {
307: re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
308: int node_cnt;
309: int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
310: for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311: {
312: int node = init_state->nodes.elems[node_cnt];
313: re_token_type_t type = dfa->nodes[node].type;
314:
315: if (type == CHARACTER)
316: re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317: else if (type == SIMPLE_BRACKET)
318: {
319: int i, j, ch;
320: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
321: for (j = 0; j < UINT_BITS; ++j, ++ch)
322: if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
323: re_set_fastmap (fastmap, icase, ch);
324: }
325: #ifdef RE_ENABLE_I18N
326: else if (type == COMPLEX_BRACKET)
327: {
328: int i;
329: re_charset_t *cset = dfa->nodes[node].opr.mbcset;
330: if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
331: || cset->nranges || cset->nchar_classes)
332: {
333: # ifdef _LIBC
334: if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
335: {
336: /* In this case we want to catch the bytes which are
337: the first byte of any collation elements.
338: e.g. In da_DK, we want to catch 'a' since "aa"
339: is a valid collation element, and don't catch
340: 'b' since 'b' is the only collation element
341: which starts from 'b'. */
342: int j, ch;
343: const int32_t *table = (const int32_t *)
344: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
346: for (j = 0; j < UINT_BITS; ++j, ++ch)
347: if (table[ch] < 0)
348: re_set_fastmap (fastmap, icase, ch);
349: }
350: # else
351: if (MB_CUR_MAX > 1)
352: for (i = 0; i < SBC_MAX; ++i)
353: if (__btowc (i) == WEOF)
354: re_set_fastmap (fastmap, icase, i);
355: # endif /* not _LIBC */
356: }
357: for (i = 0; i < cset->nmbchars; ++i)
358: {
359: char buf[256];
360: mbstate_t state;
361: memset (&state, '\0', sizeof (state));
362: __wcrtomb (buf, cset->mbchars[i], &state);
363: re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
364: }
365: }
366: #endif /* RE_ENABLE_I18N */
367: else if (type == END_OF_RE || type == OP_PERIOD)
368: {
369: memset (fastmap, '\1', sizeof (char) * SBC_MAX);
370: if (type == END_OF_RE)
371: bufp->can_be_null = 1;
372: return;
373: }
374: }
375: }
376:
377: /* Entry point for POSIX code. */
378: /* regcomp takes a regular expression as a string and compiles it.
379:
380: PREG is a regex_t *. We do not expect any fields to be initialized,
381: since POSIX says we shouldn't. Thus, we set
382:
383: `buffer' to the compiled pattern;
384: `used' to the length of the compiled pattern;
385: `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
386: REG_EXTENDED bit in CFLAGS is set; otherwise, to
387: RE_SYNTAX_POSIX_BASIC;
388: `newline_anchor' to REG_NEWLINE being set in CFLAGS;
389: `fastmap' to an allocated space for the fastmap;
390: `fastmap_accurate' to zero;
391: `re_nsub' to the number of subexpressions in PATTERN.
392:
393: PATTERN is the address of the pattern string.
394:
395: CFLAGS is a series of bits which affect compilation.
396:
397: If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
398: use POSIX basic syntax.
399:
400: If REG_NEWLINE is set, then . and [^...] don't match newline.
401: Also, regexec will try a match beginning after every newline.
402:
403: If REG_ICASE is set, then we considers upper- and lowercase
404: versions of letters to be equivalent when matching.
405:
406: If REG_NOSUB is set, then when PREG is passed to regexec, that
407: routine will report only success or failure, and nothing about the
408: registers.
409:
410: It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
411: the return codes and their meanings.) */
412:
413: int
414: regcomp (preg, pattern, cflags)
415: regex_t *__restrict preg;
416: const char *__restrict pattern;
417: int cflags;
418: {
419: reg_errcode_t ret;
420: reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
421: : RE_SYNTAX_POSIX_BASIC);
422:
423: preg->buffer = NULL;
424: preg->allocated = 0;
425: preg->used = 0;
426:
427: /* Try to allocate space for the fastmap. */
428: preg->fastmap = re_malloc (char, SBC_MAX);
429: if (BE (preg->fastmap == NULL, 0))
430: return REG_ESPACE;
431:
432: syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
433:
434: /* If REG_NEWLINE is set, newlines are treated differently. */
435: if (cflags & REG_NEWLINE)
436: { /* REG_NEWLINE implies neither . nor [^...] match newline. */
437: syntax &= ~RE_DOT_NEWLINE;
438: syntax |= RE_HAT_LISTS_NOT_NEWLINE;
439: /* It also changes the matching behavior. */
440: preg->newline_anchor = 1;
441: }
442: else
443: preg->newline_anchor = 0;
444: preg->no_sub = !!(cflags & REG_NOSUB);
445: preg->translate = NULL;
446:
447: ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
448:
449: /* POSIX doesn't distinguish between an unmatched open-group and an
450: unmatched close-group: both are REG_EPAREN. */
451: if (ret == REG_ERPAREN)
452: ret = REG_EPAREN;
453:
454: /* We have already checked preg->fastmap != NULL. */
455: if (BE (ret == REG_NOERROR, 1))
456: /* Compute the fastmap now, since regexec cannot modify the pattern
457: buffer. This function nevers fails in this implementation. */
458: (void) re_compile_fastmap (preg);
459: else
460: {
461: /* Some error occurred while compiling the expression. */
462: re_free (preg->fastmap);
463: preg->fastmap = NULL;
464: }
465:
466: return (int) ret;
467: }
468: #ifdef _LIBC
469: weak_alias (__regcomp, regcomp)
470: #endif
471:
472: /* Returns a message corresponding to an error code, ERRCODE, returned
473: from either regcomp or regexec. We don't use PREG here. */
474:
475: size_t
476: regerror (errcode, preg, errbuf, errbuf_size)
477: int errcode;
478: const regex_t *preg;
479: char *errbuf;
480: size_t errbuf_size;
481: {
482: const char *msg;
483: size_t msg_size;
484:
485: if (BE (errcode < 0
486: || errcode >= (int) (sizeof (__re_error_msgid_idx)
487: / sizeof (__re_error_msgid_idx[0])), 0))
488: /* Only error codes returned by the rest of the code should be passed
489: to this routine. If we are given anything else, or if other regex
490: code generates an invalid error code, then the program has a bug.
491: Dump core so we can fix it. */
492: abort ();
493:
494: msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
495:
496: msg_size = strlen (msg) + 1; /* Includes the null. */
497:
498: if (BE (errbuf_size != 0, 1))
499: {
500: if (BE (msg_size > errbuf_size, 0))
501: {
502: #if defined HAVE_MEMPCPY || defined _LIBC
503: *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
504: #else
505: memcpy (errbuf, msg, errbuf_size - 1);
506: errbuf[errbuf_size - 1] = 0;
507: #endif
508: }
509: else
510: memcpy (errbuf, msg, msg_size);
511: }
512:
513: return msg_size;
514: }
515: #ifdef _LIBC
516: weak_alias (__regerror, regerror)
517: #endif
518:
519:
520: static void
521: free_dfa_content (re_dfa_t *dfa)
522: {
523: int i, j;
524:
525: re_free (dfa->subexps);
526:
527: for (i = 0; i < dfa->nodes_len; ++i)
528: {
529: re_token_t *node = dfa->nodes + i;
530: #ifdef RE_ENABLE_I18N
531: if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
532: free_charset (node->opr.mbcset);
533: else
534: #endif /* RE_ENABLE_I18N */
535: if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
536: re_free (node->opr.sbcset);
537: }
538: re_free (dfa->nexts);
539: for (i = 0; i < dfa->nodes_len; ++i)
540: {
541: if (dfa->eclosures != NULL)
542: re_node_set_free (dfa->eclosures + i);
543: if (dfa->inveclosures != NULL)
544: re_node_set_free (dfa->inveclosures + i);
545: if (dfa->edests != NULL)
546: re_node_set_free (dfa->edests + i);
547: }
548: re_free (dfa->edests);
549: re_free (dfa->eclosures);
550: re_free (dfa->inveclosures);
551: re_free (dfa->nodes);
552:
553: for (i = 0; i <= dfa->state_hash_mask; ++i)
554: {
555: struct re_state_table_entry *entry = dfa->state_table + i;
556: for (j = 0; j < entry->num; ++j)
557: {
558: re_dfastate_t *state = entry->array[j];
559: free_state (state);
560: }
561: re_free (entry->array);
562: }
563: re_free (dfa->state_table);
564:
565: if (dfa->word_char != NULL)
566: re_free (dfa->word_char);
567: #ifdef DEBUG
568: re_free (dfa->re_str);
569: #endif
570:
571: re_free (dfa);
572: }
573:
574:
575: /* Free dynamically allocated space used by PREG. */
576:
577: void
578: regfree (preg)
579: regex_t *preg;
580: {
581: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
582: if (BE (dfa != NULL, 1))
583: free_dfa_content (dfa);
584:
585: re_free (preg->fastmap);
586: }
587: #ifdef _LIBC
588: weak_alias (__regfree, regfree)
589: #endif
590:
591: /* Entry points compatible with 4.2 BSD regex library. We don't define
592: them unless specifically requested. */
593:
594: #if defined _REGEX_RE_COMP || defined _LIBC
595:
596: /* BSD has one and only one pattern buffer. */
597: static struct re_pattern_buffer re_comp_buf;
598:
599: char *
600: # ifdef _LIBC
601: /* Make these definitions weak in libc, so POSIX programs can redefine
602: these names if they don't use our functions, and still use
603: regcomp/regexec above without link errors. */
604: weak_function
605: # endif
606: re_comp (s)
607: const char *s;
608: {
609: reg_errcode_t ret;
610: char *fastmap;
611:
612: if (!s)
613: {
614: if (!re_comp_buf.buffer)
615: return gettext ("No previous regular expression");
616: return 0;
617: }
618:
619: if (re_comp_buf.buffer)
620: {
621: fastmap = re_comp_buf.fastmap;
622: re_comp_buf.fastmap = NULL;
623: __regfree (&re_comp_buf);
624: memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
625: re_comp_buf.fastmap = fastmap;
626: }
627:
628: if (re_comp_buf.fastmap == NULL)
629: {
630: re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
631: if (re_comp_buf.fastmap == NULL)
632: return (char *) gettext (__re_error_msgid
633: + __re_error_msgid_idx[(int) REG_ESPACE]);
634: }
635:
636: /* Since `re_exec' always passes NULL for the `regs' argument, we
637: don't need to initialize the pattern buffer fields which affect it. */
638:
639: /* Match anchors at newlines. */
640: re_comp_buf.newline_anchor = 1;
641:
642: ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
643:
644: if (!ret)
645: return NULL;
646:
647: /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
648: return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
649: }
650:
651: #ifdef _LIBC
652: libc_freeres_fn (free_mem)
653: {
654: __regfree (&re_comp_buf);
655: }
656: #endif
657:
658: #endif /* _REGEX_RE_COMP */
659:
660: /* Internal entry point.
661: Compile the regular expression PATTERN, whose length is LENGTH.
662: SYNTAX indicate regular expression's syntax. */
663:
664: static reg_errcode_t
665: re_compile_internal (preg, pattern, length, syntax)
666: regex_t *preg;
667: const char * pattern;
668: int length;
669: reg_syntax_t syntax;
670: {
671: reg_errcode_t err = REG_NOERROR;
672: re_dfa_t *dfa;
673: re_string_t regexp;
674:
675: /* Initialize the pattern buffer. */
676: preg->fastmap_accurate = 0;
677: preg->syntax = syntax;
678: preg->not_bol = preg->not_eol = 0;
679: preg->used = 0;
680: preg->re_nsub = 0;
681: preg->can_be_null = 0;
682: preg->regs_allocated = REGS_UNALLOCATED;
683:
684: /* Initialize the dfa. */
685: dfa = (re_dfa_t *) preg->buffer;
686: if (preg->allocated < sizeof (re_dfa_t))
687: {
688: /* If zero allocated, but buffer is non-null, try to realloc
689: enough space. This loses if buffer's address is bogus, but
690: that is the user's responsibility. If ->buffer is NULL this
691: is a simple allocation. */
692: dfa = re_realloc (preg->buffer, re_dfa_t, 1);
693: if (dfa == NULL)
694: return REG_ESPACE;
695: preg->allocated = sizeof (re_dfa_t);
696: }
697: preg->buffer = (unsigned char *) dfa;
698: preg->used = sizeof (re_dfa_t);
699:
700: err = init_dfa (dfa, length);
701: if (BE (err != REG_NOERROR, 0))
702: {
703: re_free (dfa);
704: preg->buffer = NULL;
705: preg->allocated = 0;
706: return err;
707: }
708: #ifdef DEBUG
709: dfa->re_str = re_malloc (char, length + 1);
710: strncpy (dfa->re_str, pattern, length + 1);
711: #endif
712:
713: err = re_string_construct (®exp, pattern, length, preg->translate,
714: syntax & RE_ICASE);
715: if (BE (err != REG_NOERROR, 0))
716: {
717: re_free (dfa);
718: preg->buffer = NULL;
719: preg->allocated = 0;
720: return err;
721: }
722:
723: /* Parse the regular expression, and build a structure tree. */
724: preg->re_nsub = 0;
725: dfa->str_tree = parse (®exp, preg, syntax, &err);
726: if (BE (dfa->str_tree == NULL, 0))
727: goto re_compile_internal_free_return;
728:
729: /* Analyze the tree and collect information which is necessary to
730: create the dfa. */
731: err = analyze (dfa);
732: if (BE (err != REG_NOERROR, 0))
733: goto re_compile_internal_free_return;
734:
735: /* Then create the initial state of the dfa. */
736: err = create_initial_state (dfa);
737:
738: /* Release work areas. */
739: free_workarea_compile (preg);
740: re_string_destruct (®exp);
741:
742: if (BE (err != REG_NOERROR, 0))
743: {
744: re_compile_internal_free_return:
745: free_dfa_content (dfa);
746: preg->buffer = NULL;
747: preg->allocated = 0;
748: }
749:
750: return err;
751: }
752:
753: /* Initialize DFA. We use the length of the regular expression PAT_LEN
754: as the initial length of some arrays. */
755:
756: static reg_errcode_t
757: init_dfa (dfa, pat_len)
758: re_dfa_t *dfa;
759: int pat_len;
760: {
761: int table_size;
762:
763: memset (dfa, '\0', sizeof (re_dfa_t));
764:
765: dfa->nodes_alloc = pat_len + 1;
766: dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
767:
768: dfa->states_alloc = pat_len + 1;
769:
770: /* table_size = 2 ^ ceil(log pat_len) */
771: for (table_size = 1; table_size > 0; table_size <<= 1)
772: if (table_size > pat_len)
773: break;
774:
775: dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
776: dfa->state_hash_mask = table_size - 1;
777:
778: dfa->subexps_alloc = 1;
779: dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
780: dfa->word_char = NULL;
781:
782: if (BE (dfa->nodes == NULL || dfa->state_table == NULL
783: || dfa->subexps == NULL, 0))
784: {
785: /* We don't bother to free anything which was allocated. Very
786: soon the process will go down anyway. */
787: dfa->subexps = NULL;
788: dfa->state_table = NULL;
789: dfa->nodes = NULL;
790: return REG_ESPACE;
791: }
792: return REG_NOERROR;
793: }
794:
795: /* Initialize WORD_CHAR table, which indicate which character is
796: "word". In this case "word" means that it is the word construction
797: character used by some operators like "\<", "\>", etc. */
798:
799: static reg_errcode_t
800: init_word_char (dfa)
801: re_dfa_t *dfa;
802: {
803: int i, j, ch;
804: dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
805: if (BE (dfa->word_char == NULL, 0))
806: return REG_ESPACE;
807: for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
808: for (j = 0; j < UINT_BITS; ++j, ++ch)
809: if (isalnum (ch) || ch == '_')
810: dfa->word_char[i] |= 1 << j;
811: return REG_NOERROR;
812: }
813:
814: /* Free the work area which are only used while compiling. */
815:
816: static void
817: free_workarea_compile (preg)
818: regex_t *preg;
819: {
820: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
821: free_bin_tree (dfa->str_tree);
822: dfa->str_tree = NULL;
823: re_free (dfa->org_indices);
824: dfa->org_indices = NULL;
825: }
826:
827: /* Create initial states for all contexts. */
828:
829: static reg_errcode_t
830: create_initial_state (dfa)
831: re_dfa_t *dfa;
832: {
833: int first, i;
834: reg_errcode_t err;
835: re_node_set init_nodes;
836:
837: /* Initial states have the epsilon closure of the node which is
838: the first node of the regular expression. */
839: first = dfa->str_tree->first;
840: dfa->init_node = first;
841: err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
842: if (BE (err != REG_NOERROR, 0))
843: return err;
844:
845: /* The back-references which are in initial states can epsilon transit,
846: since in this case all of the subexpressions can be null.
847: Then we add epsilon closures of the nodes which are the next nodes of
848: the back-references. */
849: if (dfa->nbackref > 0)
850: for (i = 0; i < init_nodes.nelem; ++i)
851: {
852: int node_idx = init_nodes.elems[i];
853: re_token_type_t type = dfa->nodes[node_idx].type;
854:
855: int clexp_idx;
856: if (type != OP_BACK_REF)
857: continue;
858: for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
859: {
860: re_token_t *clexp_node;
861: clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
862: if (clexp_node->type == OP_CLOSE_SUBEXP
863: && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
864: break;
865: }
866: if (clexp_idx == init_nodes.nelem)
867: continue;
868:
869: if (type == OP_BACK_REF)
870: {
871: int dest_idx = dfa->edests[node_idx].elems[0];
872: if (!re_node_set_contains (&init_nodes, dest_idx))
873: {
874: re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
875: i = 0;
876: }
877: }
878: }
879:
880: /* It must be the first time to invoke acquire_state. */
881: dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
882: /* We don't check ERR here, since the initial state must not be NULL. */
883: if (BE (dfa->init_state == NULL, 0))
884: return err;
885: if (dfa->init_state->has_constraint)
886: {
887: dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
888: CONTEXT_WORD);
889: dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
890: CONTEXT_NEWLINE);
891: dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
892: &init_nodes,
893: CONTEXT_NEWLINE
894: | CONTEXT_BEGBUF);
895: if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
896: || dfa->init_state_begbuf == NULL, 0))
897: return err;
898: }
899: else
900: dfa->init_state_word = dfa->init_state_nl
901: = dfa->init_state_begbuf = dfa->init_state;
902:
903: re_node_set_free (&init_nodes);
904: return REG_NOERROR;
905: }
906:
907: /* Analyze the structure tree, and calculate "first", "next", "edest",
908: "eclosure", and "inveclosure". */
909:
910: static reg_errcode_t
911: analyze (dfa)
912: re_dfa_t *dfa;
913: {
914: int i;
915: reg_errcode_t ret;
916:
917: /* Allocate arrays. */
918: dfa->nexts = re_malloc (int, dfa->nodes_alloc);
919: dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
920: dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
921: dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
922: dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
923: if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
924: || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
925: return REG_ESPACE;
926: /* Initialize them. */
927: for (i = 0; i < dfa->nodes_len; ++i)
928: {
929: dfa->nexts[i] = -1;
930: re_node_set_init_empty (dfa->edests + i);
931: re_node_set_init_empty (dfa->eclosures + i);
932: re_node_set_init_empty (dfa->inveclosures + i);
933: }
934:
935: ret = analyze_tree (dfa, dfa->str_tree);
936: if (BE (ret == REG_NOERROR, 1))
937: {
938: ret = calc_eclosure (dfa);
939: if (ret == REG_NOERROR)
940: calc_inveclosure (dfa);
941: }
942: return ret;
943: }
944:
945: /* Helper functions for analyze.
946: This function calculate "first", "next", and "edest" for the subtree
947: whose root is NODE. */
948:
949: static reg_errcode_t
950: analyze_tree (dfa, node)
951: re_dfa_t *dfa;
952: bin_tree_t *node;
953: {
954: reg_errcode_t ret;
955: if (node->first == -1)
956: calc_first (dfa, node);
957: if (node->next == -1)
958: calc_next (dfa, node);
959: if (node->eclosure.nelem == 0)
960: calc_epsdest (dfa, node);
961: /* Calculate "first" etc. for the left child. */
962: if (node->left != NULL)
963: {
964: ret = analyze_tree (dfa, node->left);
965: if (BE (ret != REG_NOERROR, 0))
966: return ret;
967: }
968: /* Calculate "first" etc. for the right child. */
969: if (node->right != NULL)
970: {
971: ret = analyze_tree (dfa, node->right);
972: if (BE (ret != REG_NOERROR, 0))
973: return ret;
974: }
975: return REG_NOERROR;
976: }
977:
978: /* Calculate "first" for the node NODE. */
979: static void
980: calc_first (dfa, node)
981: re_dfa_t *dfa;
982: bin_tree_t *node;
983: {
984: int idx, type;
985: idx = node->node_idx;
986: type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
987:
988: switch (type)
989: {
990: #ifdef DEBUG
991: case OP_OPEN_BRACKET:
992: case OP_CLOSE_BRACKET:
993: case OP_OPEN_DUP_NUM:
994: case OP_CLOSE_DUP_NUM:
995: case OP_NON_MATCH_LIST:
996: case OP_OPEN_COLL_ELEM:
997: case OP_CLOSE_COLL_ELEM:
998: case OP_OPEN_EQUIV_CLASS:
999: case OP_CLOSE_EQUIV_CLASS:
1000: case OP_OPEN_CHAR_CLASS:
1001: case OP_CLOSE_CHAR_CLASS:
1002: /* These must not be appeared here. */
1003: assert (0);
1004: #endif
1005: case END_OF_RE:
1006: case CHARACTER:
1007: case OP_PERIOD:
1008: case OP_DUP_ASTERISK:
1009: case OP_DUP_QUESTION:
1010: #ifdef RE_ENABLE_I18N
1011: case COMPLEX_BRACKET:
1012: #endif /* RE_ENABLE_I18N */
1013: case SIMPLE_BRACKET:
1014: case OP_BACK_REF:
1015: case ANCHOR:
1016: case OP_OPEN_SUBEXP:
1017: case OP_CLOSE_SUBEXP:
1018: node->first = idx;
1019: break;
1020: case OP_DUP_PLUS:
1021: #ifdef DEBUG
1022: assert (node->left != NULL);
1023: #endif
1024: if (node->left->first == -1)
1025: calc_first (dfa, node->left);
1026: node->first = node->left->first;
1027: break;
1028: case OP_ALT:
1029: node->first = idx;
1030: break;
1031: /* else fall through */
1032: default:
1033: #ifdef DEBUG
1034: assert (node->left != NULL);
1035: #endif
1036: if (node->left->first == -1)
1037: calc_first (dfa, node->left);
1038: node->first = node->left->first;
1039: break;
1040: }
1041: }
1042:
1043: /* Calculate "next" for the node NODE. */
1044:
1045: static void
1046: calc_next (dfa, node)
1047: re_dfa_t *dfa;
1048: bin_tree_t *node;
1049: {
1050: int idx, type;
1051: bin_tree_t *parent = node->parent;
1052: if (parent == NULL)
1053: {
1054: node->next = -1;
1055: idx = node->node_idx;
1056: if (node->type == 0)
1057: dfa->nexts[idx] = node->next;
1058: return;
1059: }
1060:
1061: idx = parent->node_idx;
1062: type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1063:
1064: switch (type)
1065: {
1066: case OP_DUP_ASTERISK:
1067: case OP_DUP_PLUS:
1068: node->next = idx;
1069: break;
1070: case CONCAT:
1071: if (parent->left == node)
1072: {
1073: if (parent->right->first == -1)
1074: calc_first (dfa, parent->right);
1075: node->next = parent->right->first;
1076: break;
1077: }
1078: /* else fall through */
1079: default:
1080: if (parent->next == -1)
1081: calc_next (dfa, parent);
1082: node->next = parent->next;
1083: break;
1084: }
1085: idx = node->node_idx;
1086: if (node->type == 0)
1087: dfa->nexts[idx] = node->next;
1088: }
1089:
1090: /* Calculate "edest" for the node NODE. */
1091:
1092: static void
1093: calc_epsdest (dfa, node)
1094: re_dfa_t *dfa;
1095: bin_tree_t *node;
1096: {
1097: int idx;
1098: idx = node->node_idx;
1099: if (node->type == 0)
1100: {
1101: if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1102: || dfa->nodes[idx].type == OP_DUP_PLUS
1103: || dfa->nodes[idx].type == OP_DUP_QUESTION)
1104: {
1105: if (node->left->first == -1)
1106: calc_first (dfa, node->left);
1107: if (node->next == -1)
1108: calc_next (dfa, node);
1109: re_node_set_init_2 (dfa->edests + idx, node->left->first,
1110: node->next);
1111: }
1112: else if (dfa->nodes[idx].type == OP_ALT)
1113: {
1114: int left, right;
1115: if (node->left != NULL)
1116: {
1117: if (node->left->first == -1)
1118: calc_first (dfa, node->left);
1119: left = node->left->first;
1120: }
1121: else
1122: {
1123: if (node->next == -1)
1124: calc_next (dfa, node);
1125: left = node->next;
1126: }
1127: if (node->right != NULL)
1128: {
1129: if (node->right->first == -1)
1130: calc_first (dfa, node->right);
1131: right = node->right->first;
1132: }
1133: else
1134: {
1135: if (node->next == -1)
1136: calc_next (dfa, node);
1137: right = node->next;
1138: }
1139: re_node_set_init_2 (dfa->edests + idx, left, right);
1140: }
1141: else if (dfa->nodes[idx].type == ANCHOR
1142: || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1143: || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1144: || dfa->nodes[idx].type == OP_BACK_REF)
1145: re_node_set_init_1 (dfa->edests + idx, node->next);
1146: }
1147: }
1148:
1149: /* Duplicate the epsilon closure of the node ROOT_NODE.
1150: Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1151: to their own constraint. */
1152:
1153: static reg_errcode_t
1154: duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1155: init_constraint)
1156: re_dfa_t *dfa;
1157: int top_org_node, top_clone_node, root_node;
1158: unsigned int init_constraint;
1159: {
1160: reg_errcode_t err;
1161: int org_node, clone_node, ret;
1162: unsigned int constraint = init_constraint;
1163: for (org_node = top_org_node, clone_node = top_clone_node;;)
1164: {
1165: int org_dest, clone_dest;
1166: if (dfa->nodes[org_node].type == OP_BACK_REF)
1167: {
1168: /* If the back reference epsilon-transit, its destination must
1169: also have the constraint. Then duplicate the epsilon closure
1170: of the destination of the back reference, and store it in
1171: edests of the back reference. */
1172: org_dest = dfa->nexts[org_node];
1173: re_node_set_empty (dfa->edests + clone_node);
1174: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1175: if (BE (err != REG_NOERROR, 0))
1176: return err;
1177: dfa->nexts[clone_node] = dfa->nexts[org_node];
1178: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1179: if (BE (ret < 0, 0))
1180: return REG_ESPACE;
1181: }
1182: else if (dfa->edests[org_node].nelem == 0)
1183: {
1184: /* In case of the node can't epsilon-transit, don't duplicate the
1185: destination and store the original destination as the
1186: destination of the node. */
1187: dfa->nexts[clone_node] = dfa->nexts[org_node];
1188: break;
1189: }
1190: else if (dfa->edests[org_node].nelem == 1)
1191: {
1192: /* In case of the node can epsilon-transit, and it has only one
1193: destination. */
1194: org_dest = dfa->edests[org_node].elems[0];
1195: re_node_set_empty (dfa->edests + clone_node);
1196: if (dfa->nodes[org_node].type == ANCHOR)
1197: {
1198: /* In case of the node has another constraint, append it. */
1199: if (org_node == root_node && clone_node != org_node)
1200: {
1201: /* ...but if the node is root_node itself, it means the
1202: epsilon closure have a loop, then tie it to the
1203: destination of the root_node. */
1204: ret = re_node_set_insert (dfa->edests + clone_node,
1205: org_dest);
1206: if (BE (ret < 0, 0))
1207: return REG_ESPACE;
1208: break;
1209: }
1210: constraint |= dfa->nodes[org_node].opr.ctx_type;
1211: }
1212: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1213: if (BE (err != REG_NOERROR, 0))
1214: return err;
1215: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216: if (BE (ret < 0, 0))
1217: return REG_ESPACE;
1218: }
1219: else /* dfa->edests[org_node].nelem == 2 */
1220: {
1221: /* In case of the node can epsilon-transit, and it has two
1222: destinations. E.g. '|', '*', '+', '?'. */
1223: org_dest = dfa->edests[org_node].elems[0];
1224: re_node_set_empty (dfa->edests + clone_node);
1225: /* Search for a duplicated node which satisfies the constraint. */
1226: clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1227: if (clone_dest == -1)
1228: {
1229: /* There are no such a duplicated node, create a new one. */
1230: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1231: if (BE (err != REG_NOERROR, 0))
1232: return err;
1233: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1234: if (BE (ret < 0, 0))
1235: return REG_ESPACE;
1236: err = duplicate_node_closure (dfa, org_dest, clone_dest,
1237: root_node, constraint);
1238: if (BE (err != REG_NOERROR, 0))
1239: return err;
1240: }
1241: else
1242: {
1243: /* There are a duplicated node which satisfy the constraint,
1244: use it to avoid infinite loop. */
1245: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1246: if (BE (ret < 0, 0))
1247: return REG_ESPACE;
1248: }
1249:
1250: org_dest = dfa->edests[org_node].elems[1];
1251: err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1252: if (BE (err != REG_NOERROR, 0))
1253: return err;
1254: ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1255: if (BE (ret < 0, 0))
1256: return REG_ESPACE;
1257: }
1258: org_node = org_dest;
1259: clone_node = clone_dest;
1260: }
1261: return REG_NOERROR;
1262: }
1263:
1264: /* Search for a node which is duplicated from the node ORG_NODE, and
1265: satisfies the constraint CONSTRAINT. */
1266:
1267: static int
1268: search_duplicated_node (dfa, org_node, constraint)
1269: re_dfa_t *dfa;
1270: int org_node;
1271: unsigned int constraint;
1272: {
1273: int idx;
1274: for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1275: {
1276: if (org_node == dfa->org_indices[idx]
1277: && constraint == dfa->nodes[idx].constraint)
1278: return idx; /* Found. */
1279: }
1280: return -1; /* Not found. */
1281: }
1282:
1283: /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1284: The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1285: otherwise return the error code. */
1286:
1287: static reg_errcode_t
1288: duplicate_node (new_idx, dfa, org_idx, constraint)
1289: re_dfa_t *dfa;
1290: int *new_idx, org_idx;
1291: unsigned int constraint;
1292: {
1293: re_token_t dup;
1294: int dup_idx;
1295:
1296: dup = dfa->nodes[org_idx];
1297: dup_idx = re_dfa_add_node (dfa, dup, 1);
1298: if (BE (dup_idx == -1, 0))
1299: return REG_ESPACE;
1300: dfa->nodes[dup_idx].constraint = constraint;
1301: if (dfa->nodes[org_idx].type == ANCHOR)
1302: dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1303: dfa->nodes[dup_idx].duplicated = 1;
1304: re_node_set_init_empty (dfa->edests + dup_idx);
1305: re_node_set_init_empty (dfa->eclosures + dup_idx);
1306: re_node_set_init_empty (dfa->inveclosures + dup_idx);
1307:
1308: /* Store the index of the original node. */
1309: dfa->org_indices[dup_idx] = org_idx;
1310: *new_idx = dup_idx;
1311: return REG_NOERROR;
1312: }
1313:
1314: static void
1315: calc_inveclosure (dfa)
1316: re_dfa_t *dfa;
1317: {
1318: int src, idx, dest;
1319: for (src = 0; src < dfa->nodes_len; ++src)
1320: {
1321: for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1322: {
1323: dest = dfa->eclosures[src].elems[idx];
1324: re_node_set_insert (dfa->inveclosures + dest, src);
1325: }
1326: }
1327: }
1328:
1329: /* Calculate "eclosure" for all the node in DFA. */
1330:
1331: static reg_errcode_t
1332: calc_eclosure (dfa)
1333: re_dfa_t *dfa;
1334: {
1335: int node_idx, incomplete;
1336: #ifdef DEBUG
1337: assert (dfa->nodes_len > 0);
1338: #endif
1339: incomplete = 0;
1340: /* For each nodes, calculate epsilon closure. */
1341: for (node_idx = 0; ; ++node_idx)
1342: {
1343: reg_errcode_t err;
1344: re_node_set eclosure_elem;
1345: if (node_idx == dfa->nodes_len)
1346: {
1347: if (!incomplete)
1348: break;
1349: incomplete = 0;
1350: node_idx = 0;
1351: }
1352:
1353: #ifdef DEBUG
1354: assert (dfa->eclosures[node_idx].nelem != -1);
1355: #endif
1356: /* If we have already calculated, skip it. */
1357: if (dfa->eclosures[node_idx].nelem != 0)
1358: continue;
1359: /* Calculate epsilon closure of `node_idx'. */
1360: err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1361: if (BE (err != REG_NOERROR, 0))
1362: return err;
1363:
1364: if (dfa->eclosures[node_idx].nelem == 0)
1365: {
1366: incomplete = 1;
1367: re_node_set_free (&eclosure_elem);
1368: }
1369: }
1370: return REG_NOERROR;
1371: }
1372:
1373: /* Calculate epsilon closure of NODE. */
1374:
1375: static reg_errcode_t
1376: calc_eclosure_iter (new_set, dfa, node, root)
1377: re_node_set *new_set;
1378: re_dfa_t *dfa;
1379: int node, root;
1380: {
1381: reg_errcode_t err;
1382: unsigned int constraint;
1383: int i, incomplete;
1384: re_node_set eclosure;
1385: incomplete = 0;
1386: err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1387: if (BE (err != REG_NOERROR, 0))
1388: return err;
1389:
1390: /* This indicates that we are calculating this node now.
1391: We reference this value to avoid infinite loop. */
1392: dfa->eclosures[node].nelem = -1;
1393:
1394: constraint = ((dfa->nodes[node].type == ANCHOR)
1395: ? dfa->nodes[node].opr.ctx_type : 0);
1396: /* If the current node has constraints, duplicate all nodes.
1397: Since they must inherit the constraints. */
1398: if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1399: {
1400: int org_node, cur_node;
1401: org_node = cur_node = node;
1402: err = duplicate_node_closure (dfa, node, node, node, constraint);
1403: if (BE (err != REG_NOERROR, 0))
1404: return err;
1405: }
1406:
1407: /* Expand each epsilon destination nodes. */
1408: if (IS_EPSILON_NODE(dfa->nodes[node].type))
1409: for (i = 0; i < dfa->edests[node].nelem; ++i)
1410: {
1411: re_node_set eclosure_elem;
1412: int edest = dfa->edests[node].elems[i];
1413: /* If calculating the epsilon closure of `edest' is in progress,
1414: return intermediate result. */
1415: if (dfa->eclosures[edest].nelem == -1)
1416: {
1417: incomplete = 1;
1418: continue;
1419: }
1420: /* If we haven't calculated the epsilon closure of `edest' yet,
1421: calculate now. Otherwise use calculated epsilon closure. */
1422: if (dfa->eclosures[edest].nelem == 0)
1423: {
1424: err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1425: if (BE (err != REG_NOERROR, 0))
1426: return err;
1427: }
1428: else
1429: eclosure_elem = dfa->eclosures[edest];
1430: /* Merge the epsilon closure of `edest'. */
1431: re_node_set_merge (&eclosure, &eclosure_elem);
1432: /* If the epsilon closure of `edest' is incomplete,
1433: the epsilon closure of this node is also incomplete. */
1434: if (dfa->eclosures[edest].nelem == 0)
1435: {
1436: incomplete = 1;
1437: re_node_set_free (&eclosure_elem);
1438: }
1439: }
1440:
1441: /* Epsilon closures include itself. */
1442: re_node_set_insert (&eclosure, node);
1443: if (incomplete && !root)
1444: dfa->eclosures[node].nelem = 0;
1445: else
1446: dfa->eclosures[node] = eclosure;
1447: *new_set = eclosure;
1448: return REG_NOERROR;
1449: }
1450:
1451: /* Functions for token which are used in the parser. */
1452:
1453: /* Fetch a token from INPUT.
1454: We must not use this function inside bracket expressions. */
1455:
1456: static re_token_t
1457: fetch_token (input, syntax)
1458: re_string_t *input;
1459: reg_syntax_t syntax;
1460: {
1461: re_token_t token;
1462: int consumed_byte;
1463: consumed_byte = peek_token (&token, input, syntax);
1464: re_string_skip_bytes (input, consumed_byte);
1465: return token;
1466: }
1467:
1468: /* Peek a token from INPUT, and return the length of the token.
1469: We must not use this function inside bracket expressions. */
1470:
1471: static int
1472: peek_token (token, input, syntax)
1473: re_token_t *token;
1474: re_string_t *input;
1475: reg_syntax_t syntax;
1476: {
1477: unsigned char c;
1478:
1479: if (re_string_eoi (input))
1480: {
1481: token->type = END_OF_RE;
1482: return 0;
1483: }
1484:
1485: c = re_string_peek_byte (input, 0);
1486: token->opr.c = c;
1487:
1488: #ifdef RE_ENABLE_I18N
1489: token->mb_partial = 0;
1490: if (MB_CUR_MAX > 1 &&
1491: !re_string_first_byte (input, re_string_cur_idx (input)))
1492: {
1493: token->type = CHARACTER;
1494: token->mb_partial = 1;
1495: return 1;
1496: }
1497: #endif
1498: if (c == '\\')
1499: {
1500: unsigned char c2;
1501: if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1502: {
1503: token->type = BACK_SLASH;
1504: return 1;
1505: }
1506:
1507: c2 = re_string_peek_byte_case (input, 1);
1508: token->opr.c = c2;
1509: token->type = CHARACTER;
1510: switch (c2)
1511: {
1512: case '|':
1513: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1514: token->type = OP_ALT;
1515: break;
1516: case '1': case '2': case '3': case '4': case '5':
1517: case '6': case '7': case '8': case '9':
1518: if (!(syntax & RE_NO_BK_REFS))
1519: {
1520: token->type = OP_BACK_REF;
1521: token->opr.idx = c2 - '0';
1522: }
1523: break;
1524: case '<':
1525: if (!(syntax & RE_NO_GNU_OPS))
1526: {
1527: token->type = ANCHOR;
1528: token->opr.idx = WORD_FIRST;
1529: }
1530: break;
1531: case '>':
1532: if (!(syntax & RE_NO_GNU_OPS))
1533: {
1534: token->type = ANCHOR;
1535: token->opr.idx = WORD_LAST;
1536: }
1537: break;
1538: case 'b':
1539: if (!(syntax & RE_NO_GNU_OPS))
1540: {
1541: token->type = ANCHOR;
1542: token->opr.idx = WORD_DELIM;
1543: }
1544: break;
1545: case 'B':
1546: if (!(syntax & RE_NO_GNU_OPS))
1547: {
1548: token->type = ANCHOR;
1549: token->opr.idx = INSIDE_WORD;
1550: }
1551: break;
1552: case 'w':
1553: if (!(syntax & RE_NO_GNU_OPS))
1554: token->type = OP_WORD;
1555: break;
1556: case 'W':
1557: if (!(syntax & RE_NO_GNU_OPS))
1558: token->type = OP_NOTWORD;
1559: break;
1560: case '`':
1561: if (!(syntax & RE_NO_GNU_OPS))
1562: {
1563: token->type = ANCHOR;
1564: token->opr.idx = BUF_FIRST;
1565: }
1566: break;
1567: case '\'':
1568: if (!(syntax & RE_NO_GNU_OPS))
1569: {
1570: token->type = ANCHOR;
1571: token->opr.idx = BUF_LAST;
1572: }
1573: break;
1574: case '(':
1575: if (!(syntax & RE_NO_BK_PARENS))
1576: token->type = OP_OPEN_SUBEXP;
1577: break;
1578: case ')':
1579: if (!(syntax & RE_NO_BK_PARENS))
1580: token->type = OP_CLOSE_SUBEXP;
1581: break;
1582: case '+':
1583: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1584: token->type = OP_DUP_PLUS;
1585: break;
1586: case '?':
1587: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1588: token->type = OP_DUP_QUESTION;
1589: break;
1590: case '{':
1591: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1592: token->type = OP_OPEN_DUP_NUM;
1593: break;
1594: case '}':
1595: if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1596: token->type = OP_CLOSE_DUP_NUM;
1597: break;
1598: default:
1599: break;
1600: }
1601: return 2;
1602: }
1603:
1604: token->type = CHARACTER;
1605: switch (c)
1606: {
1607: case '\n':
1608: if (syntax & RE_NEWLINE_ALT)
1609: token->type = OP_ALT;
1610: break;
1611: case '|':
1612: if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1613: token->type = OP_ALT;
1614: break;
1615: case '*':
1616: token->type = OP_DUP_ASTERISK;
1617: break;
1618: case '+':
1619: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1620: token->type = OP_DUP_PLUS;
1621: break;
1622: case '?':
1623: if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1624: token->type = OP_DUP_QUESTION;
1625: break;
1626: case '{':
1627: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1628: token->type = OP_OPEN_DUP_NUM;
1629: break;
1630: case '}':
1631: if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1632: token->type = OP_CLOSE_DUP_NUM;
1633: break;
1634: case '(':
1635: if (syntax & RE_NO_BK_PARENS)
1636: token->type = OP_OPEN_SUBEXP;
1637: break;
1638: case ')':
1639: if (syntax & RE_NO_BK_PARENS)
1640: token->type = OP_CLOSE_SUBEXP;
1641: break;
1642: case '[':
1643: token->type = OP_OPEN_BRACKET;
1644: break;
1645: case '.':
1646: token->type = OP_PERIOD;
1647: break;
1648: case '^':
1649: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1650: re_string_cur_idx (input) != 0)
1651: {
1652: char prev = re_string_peek_byte (input, -1);
1653: if (prev != '|' && prev != '(' &&
1654: (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1655: break;
1656: }
1657: token->type = ANCHOR;
1658: token->opr.idx = LINE_FIRST;
1659: break;
1660: case '$':
1661: if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1662: re_string_cur_idx (input) + 1 != re_string_length (input))
1663: {
1664: re_token_t next;
1665: re_string_skip_bytes (input, 1);
1666: peek_token (&next, input, syntax);
1667: re_string_skip_bytes (input, -1);
1668: if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1669: break;
1670: }
1671: token->type = ANCHOR;
1672: token->opr.idx = LINE_LAST;
1673: break;
1674: default:
1675: break;
1676: }
1677: return 1;
1678: }
1679:
1680: /* Peek a token from INPUT, and return the length of the token.
1681: We must not use this function out of bracket expressions. */
1682:
1683: static int
1684: peek_token_bracket (token, input, syntax)
1685: re_token_t *token;
1686: re_string_t *input;
1687: reg_syntax_t syntax;
1688: {
1689: unsigned char c;
1690: if (re_string_eoi (input))
1691: {
1692: token->type = END_OF_RE;
1693: return 0;
1694: }
1695: c = re_string_peek_byte (input, 0);
1696: token->opr.c = c;
1697:
1698: #ifdef RE_ENABLE_I18N
1699: if (MB_CUR_MAX > 1 &&
1700: !re_string_first_byte (input, re_string_cur_idx (input)))
1701: {
1702: token->type = CHARACTER;
1703: return 1;
1704: }
1705: #endif /* RE_ENABLE_I18N */
1706:
1707: if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1708: {
1709: /* In this case, '\' escape a character. */
1710: unsigned char c2;
1711: re_string_skip_bytes (input, 1);
1712: c2 = re_string_peek_byte (input, 0);
1713: token->opr.c = c2;
1714: token->type = CHARACTER;
1715: return 1;
1716: }
1717: if (c == '[') /* '[' is a special char in a bracket exps. */
1718: {
1719: unsigned char c2;
1720: int token_len;
1721: c2 = re_string_peek_byte (input, 1);
1722: token->opr.c = c2;
1723: token_len = 2;
1724: switch (c2)
1725: {
1726: case '.':
1727: token->type = OP_OPEN_COLL_ELEM;
1728: break;
1729: case '=':
1730: token->type = OP_OPEN_EQUIV_CLASS;
1731: break;
1732: case ':':
1733: if (syntax & RE_CHAR_CLASSES)
1734: {
1735: token->type = OP_OPEN_CHAR_CLASS;
1736: break;
1737: }
1738: /* else fall through. */
1739: default:
1740: token->type = CHARACTER;
1741: token->opr.c = c;
1742: token_len = 1;
1743: break;
1744: }
1745: return token_len;
1746: }
1747: switch (c)
1748: {
1749: case '-':
1750: token->type = OP_CHARSET_RANGE;
1751: break;
1752: case ']':
1753: token->type = OP_CLOSE_BRACKET;
1754: break;
1755: case '^':
1756: token->type = OP_NON_MATCH_LIST;
1757: break;
1758: default:
1759: token->type = CHARACTER;
1760: }
1761: return 1;
1762: }
1763:
1764: /* Functions for parser. */
1765:
1766: /* Entry point of the parser.
1767: Parse the regular expression REGEXP and return the structure tree.
1768: If an error is occured, ERR is set by error code, and return NULL.
1769: This function build the following tree, from regular expression <reg_exp>:
1770: CAT
1771: / \
1772: / \
1773: <reg_exp> EOR
1774:
1775: CAT means concatenation.
1776: EOR means end of regular expression. */
1777:
1778: static bin_tree_t *
1779: parse (regexp, preg, syntax, err)
1780: re_string_t *regexp;
1781: regex_t *preg;
1782: reg_syntax_t syntax;
1783: reg_errcode_t *err;
1784: {
1785: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1786: bin_tree_t *tree, *eor, *root;
1787: re_token_t current_token;
1788: int new_idx;
1789: current_token = fetch_token (regexp, syntax);
1790: tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err);
1791: if (BE (*err != REG_NOERROR && tree == NULL, 0))
1792: return NULL;
1793: new_idx = re_dfa_add_node (dfa, current_token, 0);
1794: eor = create_tree (NULL, NULL, 0, new_idx);
1795: if (tree != NULL)
1796: root = create_tree (tree, eor, CONCAT, 0);
1797: else
1798: root = eor;
1799: if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1800: {
1801: *err = REG_ESPACE;
1802: return NULL;
1803: }
1804: return root;
1805: }
1806:
1807: /* This function build the following tree, from regular expression
1808: <branch1>|<branch2>:
1809: ALT
1810: / \
1811: / \
1812: <branch1> <branch2>
1813:
1814: ALT means alternative, which represents the operator `|'. */
1815:
1816: static bin_tree_t *
1817: parse_reg_exp (regexp, preg, token, syntax, nest, err)
1818: re_string_t *regexp;
1819: regex_t *preg;
1820: re_token_t *token;
1821: reg_syntax_t syntax;
1822: int nest;
1823: reg_errcode_t *err;
1824: {
1825: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1826: bin_tree_t *tree, *branch = NULL;
1827: int new_idx;
1828: tree = parse_branch (regexp, preg, token, syntax, nest, err);
1829: if (BE (*err != REG_NOERROR && tree == NULL, 0))
1830: return NULL;
1831:
1832: while (token->type == OP_ALT)
1833: {
1834: re_token_t alt_token = *token;
1835: new_idx = re_dfa_add_node (dfa, alt_token, 0);
1836: *token = fetch_token (regexp, syntax);
1837: if (token->type != OP_ALT && token->type != END_OF_RE
1838: && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1839: {
1840: branch = parse_branch (regexp, preg, token, syntax, nest, err);
1841: if (BE (*err != REG_NOERROR && branch == NULL, 0))
1842: {
1843: free_bin_tree (tree);
1844: return NULL;
1845: }
1846: }
1847: else
1848: branch = NULL;
1849: tree = create_tree (tree, branch, 0, new_idx);
1850: if (BE (new_idx == -1 || tree == NULL, 0))
1851: {
1852: *err = REG_ESPACE;
1853: return NULL;
1854: }
1855: dfa->has_plural_match = 1;
1856: }
1857: return tree;
1858: }
1859:
1860: /* This function build the following tree, from regular expression
1861: <exp1><exp2>:
1862: CAT
1863: / \
1864: / \
1865: <exp1> <exp2>
1866:
1867: CAT means concatenation. */
1868:
1869: static bin_tree_t *
1870: parse_branch (regexp, preg, token, syntax, nest, err)
1871: re_string_t *regexp;
1872: regex_t *preg;
1873: re_token_t *token;
1874: reg_syntax_t syntax;
1875: int nest;
1876: reg_errcode_t *err;
1877: {
1878: bin_tree_t *tree, *exp;
1879: tree = parse_expression (regexp, preg, token, syntax, nest, err);
1880: if (BE (*err != REG_NOERROR && tree == NULL, 0))
1881: return NULL;
1882:
1883: while (token->type != OP_ALT && token->type != END_OF_RE
1884: && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1885: {
1886: exp = parse_expression (regexp, preg, token, syntax, nest, err);
1887: if (BE (*err != REG_NOERROR && exp == NULL, 0))
1888: {
1889: free_bin_tree (tree);
1890: return NULL;
1891: }
1892: if (tree != NULL && exp != NULL)
1893: {
1894: tree = create_tree (tree, exp, CONCAT, 0);
1895: if (tree == NULL)
1896: {
1897: *err = REG_ESPACE;
1898: return NULL;
1899: }
1900: }
1901: else if (tree == NULL)
1902: tree = exp;
1903: /* Otherwise exp == NULL, we don't need to create new tree. */
1904: }
1905: return tree;
1906: }
1907:
1908: /* This function build the following tree, from regular expression a*:
1909: *
1910: |
1911: a
1912: */
1913:
1914: static bin_tree_t *
1915: parse_expression (regexp, preg, token, syntax, nest, err)
1916: re_string_t *regexp;
1917: regex_t *preg;
1918: re_token_t *token;
1919: reg_syntax_t syntax;
1920: int nest;
1921: reg_errcode_t *err;
1922: {
1923: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1924: bin_tree_t *tree;
1925: int new_idx;
1926: switch (token->type)
1927: {
1928: case CHARACTER:
1929: new_idx = re_dfa_add_node (dfa, *token, 0);
1930: tree = create_tree (NULL, NULL, 0, new_idx);
1931: if (BE (new_idx == -1 || tree == NULL, 0))
1932: {
1933: *err = REG_ESPACE;
1934: return NULL;
1935: }
1936: #ifdef RE_ENABLE_I18N
1937: if (MB_CUR_MAX > 1)
1938: {
1939: while (!re_string_eoi (regexp)
1940: && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1941: {
1942: bin_tree_t *mbc_remain;
1943: *token = fetch_token (regexp, syntax);
1944: new_idx = re_dfa_add_node (dfa, *token, 0);
1945: mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1946: tree = create_tree (tree, mbc_remain, CONCAT, 0);
1947: if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1948: {
1949: *err = REG_ESPACE;
1950: return NULL;
1951: }
1952: }
1953: }
1954: #endif
1955: break;
1956: case OP_OPEN_SUBEXP:
1957: tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958: if (BE (*err != REG_NOERROR && tree == NULL, 0))
1959: return NULL;
1960: break;
1961: case OP_OPEN_BRACKET:
1962: tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963: if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964: return NULL;
1965: break;
1966: case OP_BACK_REF:
1967: if (BE (preg->re_nsub < token->opr.idx
1968: || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1969: {
1970: *err = REG_ESUBREG;
1971: return NULL;
1972: }
1973: dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
1974: new_idx = re_dfa_add_node (dfa, *token, 0);
1975: tree = create_tree (NULL, NULL, 0, new_idx);
1976: if (BE (new_idx == -1 || tree == NULL, 0))
1977: {
1978: *err = REG_ESPACE;
1979: return NULL;
1980: }
1981: ++dfa->nbackref;
1982: dfa->has_mb_node = 1;
1983: break;
1984: case OP_DUP_ASTERISK:
1985: case OP_DUP_PLUS:
1986: case OP_DUP_QUESTION:
1987: case OP_OPEN_DUP_NUM:
1988: if (syntax & RE_CONTEXT_INVALID_OPS)
1989: {
1990: *err = REG_BADRPT;
1991: return NULL;
1992: }
1993: else if (syntax & RE_CONTEXT_INDEP_OPS)
1994: {
1995: *token = fetch_token (regexp, syntax);
1996: return parse_expression (regexp, preg, token, syntax, nest, err);
1997: }
1998: /* else fall through */
1999: case OP_CLOSE_SUBEXP:
2000: if ((token->type == OP_CLOSE_SUBEXP) &&
2001: !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2002: {
2003: *err = REG_ERPAREN;
2004: return NULL;
2005: }
2006: /* else fall through */
2007: case OP_CLOSE_DUP_NUM:
2008: /* We treat it as a normal character. */
2009:
2010: /* Then we can these characters as normal characters. */
2011: token->type = CHARACTER;
2012: new_idx = re_dfa_add_node (dfa, *token, 0);
2013: tree = create_tree (NULL, NULL, 0, new_idx);
2014: if (BE (new_idx == -1 || tree == NULL, 0))
2015: {
2016: *err = REG_ESPACE;
2017: return NULL;
2018: }
2019: break;
2020: case ANCHOR:
2021: if (dfa->word_char == NULL)
2022: {
2023: *err = init_word_char (dfa);
2024: if (BE (*err != REG_NOERROR, 0))
2025: return NULL;
2026: }
2027: if (token->opr.ctx_type == WORD_DELIM)
2028: {
2029: bin_tree_t *tree_first, *tree_last;
2030: int idx_first, idx_last;
2031: token->opr.ctx_type = WORD_FIRST;
2032: idx_first = re_dfa_add_node (dfa, *token, 0);
2033: tree_first = create_tree (NULL, NULL, 0, idx_first);
2034: token->opr.ctx_type = WORD_LAST;
2035: idx_last = re_dfa_add_node (dfa, *token, 0);
2036: tree_last = create_tree (NULL, NULL, 0, idx_last);
2037: token->type = OP_ALT;
2038: new_idx = re_dfa_add_node (dfa, *token, 0);
2039: tree = create_tree (tree_first, tree_last, 0, new_idx);
2040: if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2041: || tree_first == NULL || tree_last == NULL
2042: || tree == NULL, 0))
2043: {
2044: *err = REG_ESPACE;
2045: return NULL;
2046: }
2047: }
2048: else
2049: {
2050: new_idx = re_dfa_add_node (dfa, *token, 0);
2051: tree = create_tree (NULL, NULL, 0, new_idx);
2052: if (BE (new_idx == -1 || tree == NULL, 0))
2053: {
2054: *err = REG_ESPACE;
2055: return NULL;
2056: }
2057: }
2058: /* We must return here, since ANCHORs can't be followed
2059: by repetition operators.
2060: eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2061: it must not be "<ANCHOR(^)><REPEAT(*)>". */
2062: *token = fetch_token (regexp, syntax);
2063: return tree;
2064: case OP_PERIOD:
2065: new_idx = re_dfa_add_node (dfa, *token, 0);
2066: tree = create_tree (NULL, NULL, 0, new_idx);
2067: if (BE (new_idx == -1 || tree == NULL, 0))
2068: {
2069: *err = REG_ESPACE;
2070: return NULL;
2071: }
2072: if (MB_CUR_MAX > 1)
2073: dfa->has_mb_node = 1;
2074: break;
2075: case OP_WORD:
2076: tree = build_word_op (dfa, 0, err);
2077: if (BE (*err != REG_NOERROR && tree == NULL, 0))
2078: return NULL;
2079: break;
2080: case OP_NOTWORD:
2081: tree = build_word_op (dfa, 1, err);
2082: if (BE (*err != REG_NOERROR && tree == NULL, 0))
2083: return NULL;
2084: break;
2085: case OP_ALT:
2086: case END_OF_RE:
2087: return NULL;
2088: case BACK_SLASH:
2089: *err = REG_EESCAPE;
2090: return NULL;
2091: default:
2092: /* Must not happen? */
2093: #ifdef DEBUG
2094: assert (0);
2095: #endif
2096: return NULL;
2097: }
2098: *token = fetch_token (regexp, syntax);
2099:
2100: while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2101: || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2102: {
2103: tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2104: if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105: return NULL;
2106: dfa->has_plural_match = 1;
2107: }
2108:
2109: return tree;
2110: }
2111:
2112: /* This function build the following tree, from regular expression
2113: (<reg_exp>):
2114: SUBEXP
2115: |
2116: <reg_exp>
2117: */
2118:
2119: static bin_tree_t *
2120: parse_sub_exp (regexp, preg, token, syntax, nest, err)
2121: re_string_t *regexp;
2122: regex_t *preg;
2123: re_token_t *token;
2124: reg_syntax_t syntax;
2125: int nest;
2126: reg_errcode_t *err;
2127: {
2128: re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129: bin_tree_t *tree, *left_par, *right_par;
2130: size_t cur_nsub;
2131: int new_idx;
2132: cur_nsub = preg->re_nsub++;
2133: if (dfa->subexps_alloc < preg->re_nsub)
2134: {
2135: re_subexp_t *new_array;
2136: dfa->subexps_alloc *= 2;
2137: new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2138: if (BE (new_array == NULL, 0))
2139: {
2140: dfa->subexps_alloc /= 2;
2141: *err = REG_ESPACE;
2142: return NULL;
2143: }
2144: dfa->subexps = new_array;
2145: }
2146: dfa->subexps[cur_nsub].start = dfa->nodes_len;
2147: dfa->subexps[cur_nsub].end = -1;
2148:
2149: new_idx = re_dfa_add_node (dfa, *token, 0);
2150: left_par = create_tree (NULL, NULL, 0, new_idx);
2151: if (BE (new_idx == -1 || left_par == NULL, 0))
2152: {
2153: *err = REG_ESPACE;
2154: return NULL;
2155: }
2156: dfa->nodes[new_idx].opr.idx = cur_nsub;
2157: *token = fetch_token (regexp, syntax);
2158:
2159: /* The subexpression may be a null string. */
2160: if (token->type == OP_CLOSE_SUBEXP)
2161: tree = NULL;
2162: else
2163: {
2164: tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2165: if (BE (*err != REG_NOERROR && tree == NULL, 0))
2166: return NULL;
2167: }
2168: if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2169: {
2170: free_bin_tree (tree);
2171: *err = REG_BADPAT;
2172: return NULL;
2173: }
2174: new_idx = re_dfa_add_node (dfa, *token, 0);
2175: dfa->subexps[cur_nsub].end = dfa->nodes_len;
2176: right_par = create_tree (NULL, NULL, 0, new_idx);
2177: tree = ((tree == NULL) ? right_par
2178: : create_tree (tree, right_par, CONCAT, 0));
2179: tree = create_tree (left_par, tree, CONCAT, 0);
2180: if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2181: {
2182: *err = REG_ESPACE;
2183: return NULL;
2184: }
2185: dfa->nodes[new_idx].opr.idx = cur_nsub;
2186:
2187: return tree;
2188: }
2189:
2190: /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2191:
2192: static bin_tree_t *
2193: parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2194: bin_tree_t *dup_elem;
2195: re_string_t *regexp;
2196: re_dfa_t *dfa;
2197: re_token_t *token;
2198: reg_syntax_t syntax;
2199: reg_errcode_t *err;
2200: {
2201: re_token_t dup_token;
2202: bin_tree_t *tree = dup_elem, *work_tree;
2203: int new_idx, start_idx = re_string_cur_idx (regexp);
2204: re_token_t start_token = *token;
2205: if (token->type == OP_OPEN_DUP_NUM)
2206: {
2207: int i;
2208: int end = 0;
2209: int start = fetch_number (regexp, token, syntax);
2210: bin_tree_t *elem;
2211: if (start == -1)
2212: {
2213: if (token->type == CHARACTER && token->opr.c == ',')
2214: start = 0; /* We treat "{,m}" as "{0,m}". */
2215: else
2216: {
2217: *err = REG_BADBR; /* <re>{} is invalid. */
2218: return NULL;
2219: }
2220: }
2221: if (BE (start != -2, 1))
2222: {
2223: /* We treat "{n}" as "{n,n}". */
2224: end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2225: : ((token->type == CHARACTER && token->opr.c == ',')
2226: ? fetch_number (regexp, token, syntax) : -2));
2227: }
2228: if (BE (start == -2 || end == -2, 0))
2229: {
2230: /* Invalid sequence. */
2231: if (token->type == OP_CLOSE_DUP_NUM)
2232: goto parse_dup_op_invalid_interval;
2233: else
2234: goto parse_dup_op_ebrace;
2235: }
2236: if (BE (start == 0 && end == 0, 0))
2237: {
2238: /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2239: *token = fetch_token (regexp, syntax);
2240: free_bin_tree (dup_elem);
2241: return NULL;
2242: }
2243:
2244: /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2245: elem = tree;
2246: for (i = 0; i < start; ++i)
2247: if (i != 0)
2248: {
2249: work_tree = duplicate_tree (elem, dfa);
2250: tree = create_tree (tree, work_tree, CONCAT, 0);
2251: if (BE (work_tree == NULL || tree == NULL, 0))
2252: goto parse_dup_op_espace;
2253: }
2254:
2255: if (end == -1)
2256: {
2257: /* We treat "<re>{0,}" as "<re>*". */
2258: dup_token.type = OP_DUP_ASTERISK;
2259: if (start > 0)
2260: {
2261: elem = duplicate_tree (elem, dfa);
2262: new_idx = re_dfa_add_node (dfa, dup_token, 0);
2263: work_tree = create_tree (elem, NULL, 0, new_idx);
2264: tree = create_tree (tree, work_tree, CONCAT, 0);
2265: if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2266: || tree == NULL, 0))
2267: goto parse_dup_op_espace;
2268: }
2269: else
2270: {
2271: new_idx = re_dfa_add_node (dfa, dup_token, 0);
2272: tree = create_tree (elem, NULL, 0, new_idx);
2273: if (BE (new_idx == -1 || tree == NULL, 0))
2274: goto parse_dup_op_espace;
2275: }
2276: }
2277: else if (end - start > 0)
2278: {
2279: /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2280: dup_token.type = OP_DUP_QUESTION;
2281: if (start > 0)
2282: {
2283: elem = duplicate_tree (elem, dfa);
2284: new_idx = re_dfa_add_node (dfa, dup_token, 0);
2285: elem = create_tree (elem, NULL, 0, new_idx);
2286: tree = create_tree (tree, elem, CONCAT, 0);
2287: if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2288: goto parse_dup_op_espace;
2289: }
2290: else
2291: {
2292: new_idx = re_dfa_add_node (dfa, dup_token, 0);
2293: tree = elem = create_tree (elem, NULL, 0, new_idx);
2294: if (BE (new_idx == -1 || tree == NULL, 0))
2295: goto parse_dup_op_espace;
2296: }
2297: for (i = 1; i < end - start; ++i)
2298: {
2299: work_tree = duplicate_tree (elem, dfa);
2300: tree = create_tree (tree, work_tree, CONCAT, 0);
2301: if (BE (work_tree == NULL || tree == NULL, 0))
2302: {
2303: *err = REG_ESPACE;
2304: return NULL;
2305: }
2306: }
2307: }
2308: }
2309: else
2310: {
2311: new_idx = re_dfa_add_node (dfa, *token, 0);
2312: tree = create_tree (tree, NULL, 0, new_idx);
2313: if (BE (new_idx == -1 || tree == NULL, 0))
2314: {
2315: *err = REG_ESPACE;
2316: return NULL;
2317: }
2318: }
2319: *token = fetch_token (regexp, syntax);
2320: return tree;
2321:
2322: parse_dup_op_espace:
2323: free_bin_tree (tree);
2324: *err = REG_ESPACE;
2325: return NULL;
2326:
2327: parse_dup_op_ebrace:
2328: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2329: {
2330: *err = REG_EBRACE;
2331: return NULL;
2332: }
2333: goto parse_dup_op_rollback;
2334: parse_dup_op_invalid_interval:
2335: if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2336: {
2337: *err = REG_BADBR;
2338: return NULL;
2339: }
2340: parse_dup_op_rollback:
2341: re_string_set_index (regexp, start_idx);
2342: *token = start_token;
2343: token->type = CHARACTER;
2344: return dup_elem;
2345: }
2346:
2347: /* Size of the names for collating symbol/equivalence_class/character_class.
2348: I'm not sure, but maybe enough. */
2349: #define BRACKET_NAME_BUF_SIZE 32
2350:
2351: #ifndef _LIBC
2352: /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2353: Build the range expression which starts from START_ELEM, and ends
2354: at END_ELEM. The result are written to MBCSET and SBCSET.
2355: RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2356: mbcset->range_ends, is a pointer argument sinse we may
2357: update it. */
2358:
2359: static reg_errcode_t
2360: # ifdef RE_ENABLE_I18N
2361: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2362: re_charset_t *mbcset;
2363: int *range_alloc;
2364: # else /* not RE_ENABLE_I18N */
2365: build_range_exp (sbcset, start_elem, end_elem)
2366: # endif /* not RE_ENABLE_I18N */
2367: re_bitset_ptr_t sbcset;
2368: bracket_elem_t *start_elem, *end_elem;
2369: {
2370: unsigned int start_ch, end_ch;
2371: /* Equivalence Classes and Character Classes can't be a range start/end. */
2372: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2373: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2374: 0))
2375: return REG_ERANGE;
2376:
2377: /* We can handle no multi character collating elements without libc
2378: support. */
2379: if (BE ((start_elem->type == COLL_SYM
2380: && strlen ((char *) start_elem->opr.name) > 1)
2381: || (end_elem->type == COLL_SYM
2382: && strlen ((char *) end_elem->opr.name) > 1), 0))
2383: return REG_ECOLLATE;
2384:
2385: # ifdef RE_ENABLE_I18N
2386: {
2387: wchar_t wc, start_wc, end_wc;
2388: wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2389:
2390: start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2391: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2392: : 0));
2393: end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2394: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2395: : 0));
2396: start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2397: ? __btowc (start_ch) : start_elem->opr.wch);
2398: end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2399: ? __btowc (end_ch) : end_elem->opr.wch);
2400: cmp_buf[0] = start_wc;
2401: cmp_buf[4] = end_wc;
2402: if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2403: return REG_ERANGE;
2404:
2405: /* Check the space of the arrays. */
2406: if (*range_alloc == mbcset->nranges)
2407: {
2408: /* There are not enough space, need realloc. */
2409: wchar_t *new_array_start, *new_array_end;
2410: int new_nranges;
2411:
2412: /* +1 in case of mbcset->nranges is 0. */
2413: new_nranges = 2 * mbcset->nranges + 1;
2414: /* Use realloc since mbcset->range_starts and mbcset->range_ends
2415: are NULL if *range_alloc == 0. */
2416: new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2417: new_nranges);
2418: new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2419: new_nranges);
2420:
2421: if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2422: return REG_ESPACE;
2423:
2424: mbcset->range_starts = new_array_start;
2425: mbcset->range_ends = new_array_end;
2426: *range_alloc = new_nranges;
2427: }
2428:
2429: mbcset->range_starts[mbcset->nranges] = start_wc;
2430: mbcset->range_ends[mbcset->nranges++] = end_wc;
2431:
2432: /* Build the table for single byte characters. */
2433: for (wc = 0; wc <= SBC_MAX; ++wc)
2434: {
2435: cmp_buf[2] = wc;
2436: if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2437: && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2438: bitset_set (sbcset, wc);
2439: }
2440: }
2441: # else /* not RE_ENABLE_I18N */
2442: {
2443: unsigned int ch;
2444: start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2445: : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2446: : 0));
2447: end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2448: : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2449: : 0));
2450: if (start_ch > end_ch)
2451: return REG_ERANGE;
2452: /* Build the table for single byte characters. */
2453: for (ch = 0; ch <= SBC_MAX; ++ch)
2454: if (start_ch <= ch && ch <= end_ch)
2455: bitset_set (sbcset, ch);
2456: }
2457: # endif /* not RE_ENABLE_I18N */
2458: return REG_NOERROR;
2459: }
2460: #endif /* not _LIBC */
2461:
2462: #ifndef _LIBC
2463: /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2464: Build the collating element which is represented by NAME.
2465: The result are written to MBCSET and SBCSET.
2466: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2467: pointer argument since we may update it. */
2468:
2469: static reg_errcode_t
2470: # ifdef RE_ENABLE_I18N
2471: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2472: re_charset_t *mbcset;
2473: int *coll_sym_alloc;
2474: # else /* not RE_ENABLE_I18N */
2475: build_collating_symbol (sbcset, name)
2476: # endif /* not RE_ENABLE_I18N */
2477: re_bitset_ptr_t sbcset;
2478: const unsigned char *name;
2479: {
2480: size_t name_len = strlen ((const char *) name);
2481: if (BE (name_len != 1, 0))
2482: return REG_ECOLLATE;
2483: else
2484: {
2485: bitset_set (sbcset, name[0]);
2486: return REG_NOERROR;
2487: }
2488: }
2489: #endif /* not _LIBC */
2490:
2491: /* This function parse bracket expression like "[abc]", "[a-c]",
2492: "[[.a-a.]]" etc. */
2493:
2494: static bin_tree_t *
2495: parse_bracket_exp (regexp, dfa, token, syntax, err)
2496: re_string_t *regexp;
2497: re_dfa_t *dfa;
2498: re_token_t *token;
2499: reg_syntax_t syntax;
2500: reg_errcode_t *err;
2501: {
2502: #ifdef _LIBC
2503: const unsigned char *collseqmb;
2504: const char *collseqwc;
2505: uint32_t nrules;
2506: int32_t table_size;
2507: const int32_t *symb_table;
2508: const unsigned char *extra;
2509:
2510: /* Local function for parse_bracket_exp used in _LIBC environement.
2511: Seek the collating symbol entry correspondings to NAME.
2512: Return the index of the symbol in the SYMB_TABLE. */
2513:
2514: static inline int32_t
2515: seek_collating_symbol_entry (name, name_len)
2516: const unsigned char *name;
2517: size_t name_len;
2518: {
2519: int32_t hash = elem_hash ((const char *) name, name_len);
2520: int32_t elem = hash % table_size;
2521: int32_t second = hash % (table_size - 2);
2522: while (symb_table[2 * elem] != 0)
2523: {
2524: /* First compare the hashing value. */
2525: if (symb_table[2 * elem] == hash
2526: /* Compare the length of the name. */
2527: && name_len == extra[symb_table[2 * elem + 1]]
2528: /* Compare the name. */
2529: && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2530: name_len) == 0)
2531: {
2532: /* Yep, this is the entry. */
2533: break;
2534: }
2535:
2536: /* Next entry. */
2537: elem += second;
2538: }
2539: return elem;
2540: }
2541:
2542: /* Local function for parse_bracket_exp used in _LIBC environement.
2543: Look up the collation sequence value of BR_ELEM.
2544: Return the value if succeeded, UINT_MAX otherwise. */
2545:
2546: static inline unsigned int
2547: lookup_collation_sequence_value (br_elem)
2548: bracket_elem_t *br_elem;
2549: {
2550: if (br_elem->type == SB_CHAR)
2551: {
2552: /*
2553: if (MB_CUR_MAX == 1)
2554: */
2555: if (nrules == 0)
2556: return collseqmb[br_elem->opr.ch];
2557: else
2558: {
2559: wint_t wc = __btowc (br_elem->opr.ch);
2560: return collseq_table_lookup (collseqwc, wc);
2561: }
2562: }
2563: else if (br_elem->type == MB_CHAR)
2564: {
2565: return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2566: }
2567: else if (br_elem->type == COLL_SYM)
2568: {
2569: size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2570: if (nrules != 0)
2571: {
2572: int32_t elem, idx;
2573: elem = seek_collating_symbol_entry (br_elem->opr.name,
2574: sym_name_len);
2575: if (symb_table[2 * elem] != 0)
2576: {
2577: /* We found the entry. */
2578: idx = symb_table[2 * elem + 1];
2579: /* Skip the name of collating element name. */
2580: idx += 1 + extra[idx];
2581: /* Skip the byte sequence of the collating element. */
2582: idx += 1 + extra[idx];
2583: /* Adjust for the alignment. */
2584: idx = (idx + 3) & ~3;
2585: /* Skip the multibyte collation sequence value. */
2586: idx += sizeof (unsigned int);
2587: /* Skip the wide char sequence of the collating element. */
2588: idx += sizeof (unsigned int) *
2589: (1 + *(unsigned int *) (extra + idx));
2590: /* Return the collation sequence value. */
2591: return *(unsigned int *) (extra + idx);
2592: }
2593: else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2594: {
2595: /* No valid character. Match it as a single byte
2596: character. */
2597: return collseqmb[br_elem->opr.name[0]];
2598: }
2599: }
2600: else if (sym_name_len == 1)
2601: return collseqmb[br_elem->opr.name[0]];
2602: }
2603: return UINT_MAX;
2604: }
2605:
2606: /* Local function for parse_bracket_exp used in _LIBC environement.
2607: Build the range expression which starts from START_ELEM, and ends
2608: at END_ELEM. The result are written to MBCSET and SBCSET.
2609: RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2610: mbcset->range_ends, is a pointer argument sinse we may
2611: update it. */
2612:
2613: static inline reg_errcode_t
2614: # ifdef RE_ENABLE_I18N
2615: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2616: re_charset_t *mbcset;
2617: int *range_alloc;
2618: # else /* not RE_ENABLE_I18N */
2619: build_range_exp (sbcset, start_elem, end_elem)
2620: # endif /* not RE_ENABLE_I18N */
2621: re_bitset_ptr_t sbcset;
2622: bracket_elem_t *start_elem, *end_elem;
2623: {
2624: unsigned int ch;
2625: uint32_t start_collseq;
2626: uint32_t end_collseq;
2627:
2628: # ifdef RE_ENABLE_I18N
2629: /* Check the space of the arrays. */
2630: if (*range_alloc == mbcset->nranges)
2631: {
2632: /* There are not enough space, need realloc. */
2633: uint32_t *new_array_start;
2634: uint32_t *new_array_end;
2635: int new_nranges;
2636:
2637: /* +1 in case of mbcset->nranges is 0. */
2638: new_nranges = 2 * mbcset->nranges + 1;
2639: /* Use realloc since mbcset->range_starts and mbcset->range_ends
2640: are NULL if *range_alloc == 0. */
2641: new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2642: new_nranges);
2643: new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2644: new_nranges);
2645:
2646: if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2647: return REG_ESPACE;
2648:
2649: mbcset->range_starts = new_array_start;
2650: mbcset->range_ends = new_array_end;
2651: *range_alloc = new_nranges;
2652: }
2653: # endif /* RE_ENABLE_I18N */
2654:
2655: /* Equivalence Classes and Character Classes can't be a range
2656: start/end. */
2657: if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2658: || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2659: 0))
2660: return REG_ERANGE;
2661:
2662: start_collseq = lookup_collation_sequence_value (start_elem);
2663: end_collseq = lookup_collation_sequence_value (end_elem);
2664: /* Check start/end collation sequence values. */
2665: if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2666: return REG_ECOLLATE;
2667: if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2668: return REG_ERANGE;
2669:
2670: # ifdef RE_ENABLE_I18N
2671: /* Got valid collation sequence values, add them as a new entry. */
2672: mbcset->range_starts[mbcset->nranges] = start_collseq;
2673: mbcset->range_ends[mbcset->nranges++] = end_collseq;
2674: # endif /* RE_ENABLE_I18N */
2675:
2676: /* Build the table for single byte characters. */
2677: for (ch = 0; ch <= SBC_MAX; ch++)
2678: {
2679: uint32_t ch_collseq;
2680: /*
2681: if (MB_CUR_MAX == 1)
2682: */
2683: if (nrules == 0)
2684: ch_collseq = collseqmb[ch];
2685: else
2686: ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2687: if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2688: bitset_set (sbcset, ch);
2689: }
2690: return REG_NOERROR;
2691: }
2692:
2693: /* Local function for parse_bracket_exp used in _LIBC environement.
2694: Build the collating element which is represented by NAME.
2695: The result are written to MBCSET and SBCSET.
2696: COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2697: pointer argument sinse we may update it. */
2698:
2699: static inline reg_errcode_t
2700: # ifdef RE_ENABLE_I18N
2701: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2702: re_charset_t *mbcset;
2703: int *coll_sym_alloc;
2704: # else /* not RE_ENABLE_I18N */
2705: build_collating_symbol (sbcset, name)
2706: # endif /* not RE_ENABLE_I18N */
2707: re_bitset_ptr_t sbcset;
2708: const unsigned char *name;
2709: {
2710: int32_t elem, idx;
2711: size_t name_len = strlen ((const char *) name);
2712: if (nrules != 0)
2713: {
2714: elem = seek_collating_symbol_entry (name, name_len);
2715: if (symb_table[2 * elem] != 0)
2716: {
2717: /* We found the entry. */
2718: idx = symb_table[2 * elem + 1];
2719: /* Skip the name of collating element name. */
2720: idx += 1 + extra[idx];
2721: }
2722: else if (symb_table[2 * elem] == 0 && name_len == 1)
2723: {
2724: /* No valid character, treat it as a normal
2725: character. */
2726: bitset_set (sbcset, name[0]);
2727: return REG_NOERROR;
2728: }
2729: else
2730: return REG_ECOLLATE;
2731:
2732: # ifdef RE_ENABLE_I18N
2733: /* Got valid collation sequence, add it as a new entry. */
2734: /* Check the space of the arrays. */
2735: if (*coll_sym_alloc == mbcset->ncoll_syms)
2736: {
2737: /* Not enough, realloc it. */
2738: /* +1 in case of mbcset->ncoll_syms is 0. */
2739: *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2740: /* Use realloc since mbcset->coll_syms is NULL
2741: if *alloc == 0. */
2742: mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2743: *coll_sym_alloc);
2744: if (BE (mbcset->coll_syms == NULL, 0))
2745: return REG_ESPACE;
2746: }
2747: mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2748: # endif /* RE_ENABLE_I18N */
2749: return REG_NOERROR;
2750: }
2751: else
2752: {
2753: if (BE (name_len != 1, 0))
2754: return REG_ECOLLATE;
2755: else
2756: {
2757: bitset_set (sbcset, name[0]);
2758: return REG_NOERROR;
2759: }
2760: }
2761: }
2762: #endif
2763:
2764: re_token_t br_token;
2765: re_bitset_ptr_t sbcset;
2766: #ifdef RE_ENABLE_I18N
2767: re_charset_t *mbcset;
2768: int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2769: int equiv_class_alloc = 0, char_class_alloc = 0;
2770: #else /* not RE_ENABLE_I18N */
2771: int non_match = 0;
2772: #endif /* not RE_ENABLE_I18N */
2773: bin_tree_t *work_tree;
2774: int token_len, new_idx;
2775: #ifdef _LIBC
2776: collseqmb = (const unsigned char *)
2777: _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2778: nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2779: if (nrules)
2780: {
2781: /*
2782: if (MB_CUR_MAX > 1)
2783: */
2784: collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2785: table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2786: symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2787: _NL_COLLATE_SYMB_TABLEMB);
2788: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2789: _NL_COLLATE_SYMB_EXTRAMB);
2790: }
2791: #endif
2792: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2793: #ifdef RE_ENABLE_I18N
2794: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2795: #endif /* RE_ENABLE_I18N */
2796: #ifdef RE_ENABLE_I18N
2797: if (BE (sbcset == NULL || mbcset == NULL, 0))
2798: #else
2799: if (BE (sbcset == NULL, 0))
2800: #endif /* RE_ENABLE_I18N */
2801: {
2802: *err = REG_ESPACE;
2803: return NULL;
2804: }
2805:
2806: token_len = peek_token_bracket (token, regexp, syntax);
2807: if (BE (token->type == END_OF_RE, 0))
2808: {
2809: *err = REG_BADPAT;
2810: goto parse_bracket_exp_free_return;
2811: }
2812: if (token->type == OP_NON_MATCH_LIST)
2813: {
2814: #ifdef RE_ENABLE_I18N
2815: int i;
2816: mbcset->non_match = 1;
2817: #else /* not RE_ENABLE_I18N */
2818: non_match = 1;
2819: #endif /* not RE_ENABLE_I18N */
2820: if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2821: bitset_set (sbcset, '\0');
2822: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2823: token_len = peek_token_bracket (token, regexp, syntax);
2824: if (BE (token->type == END_OF_RE, 0))
2825: {
2826: *err = REG_BADPAT;
2827: goto parse_bracket_exp_free_return;
2828: }
2829: #ifdef RE_ENABLE_I18N
2830: if (MB_CUR_MAX > 1)
2831: for (i = 0; i < SBC_MAX; ++i)
2832: if (__btowc (i) == WEOF)
2833: bitset_set (sbcset, i);
2834: #endif /* RE_ENABLE_I18N */
2835: }
2836:
2837: /* We treat the first ']' as a normal character. */
2838: if (token->type == OP_CLOSE_BRACKET)
2839: token->type = CHARACTER;
2840:
2841: while (1)
2842: {
2843: bracket_elem_t start_elem, end_elem;
2844: unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2845: unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2846: reg_errcode_t ret;
2847: int token_len2 = 0, is_range_exp = 0;
2848: re_token_t token2;
2849:
2850: start_elem.opr.name = start_name_buf;
2851: ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2852: syntax);
2853: if (BE (ret != REG_NOERROR, 0))
2854: {
2855: *err = ret;
2856: goto parse_bracket_exp_free_return;
2857: }
2858:
2859: token_len = peek_token_bracket (token, regexp, syntax);
2860: if (BE (token->type == END_OF_RE, 0))
2861: {
2862: *err = REG_BADPAT;
2863: goto parse_bracket_exp_free_return;
2864: }
2865: if (token->type == OP_CHARSET_RANGE)
2866: {
2867: re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2868: token_len2 = peek_token_bracket (&token2, regexp, syntax);
2869: if (BE (token->type == END_OF_RE, 0))
2870: {
2871: *err = REG_BADPAT;
2872: goto parse_bracket_exp_free_return;
2873: }
2874: if (token2.type == OP_CLOSE_BRACKET)
2875: {
2876: /* We treat the last '-' as a normal character. */
2877: re_string_skip_bytes (regexp, -token_len);
2878: token->type = CHARACTER;
2879: }
2880: else
2881: is_range_exp = 1;
2882: }
2883:
2884: if (is_range_exp == 1)
2885: {
2886: end_elem.opr.name = end_name_buf;
2887: ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2888: dfa, syntax);
2889: if (BE (ret != REG_NOERROR, 0))
2890: {
2891: *err = ret;
2892: goto parse_bracket_exp_free_return;
2893: }
2894:
2895: token_len = peek_token_bracket (token, regexp, syntax);
2896: if (BE (token->type == END_OF_RE, 0))
2897: {
2898: *err = REG_BADPAT;
2899: goto parse_bracket_exp_free_return;
2900: }
2901: *err = build_range_exp (sbcset,
2902: #ifdef RE_ENABLE_I18N
2903: mbcset, &range_alloc,
2904: #endif /* RE_ENABLE_I18N */
2905: &start_elem, &end_elem);
2906: if (BE (*err != REG_NOERROR, 0))
2907: goto parse_bracket_exp_free_return;
2908: }
2909: else
2910: {
2911: switch (start_elem.type)
2912: {
2913: case SB_CHAR:
2914: bitset_set (sbcset, start_elem.opr.ch);
2915: break;
2916: #ifdef RE_ENABLE_I18N
2917: case MB_CHAR:
2918: /* Check whether the array has enough space. */
2919: if (mbchar_alloc == mbcset->nmbchars)
2920: {
2921: /* Not enough, realloc it. */
2922: /* +1 in case of mbcset->nmbchars is 0. */
2923: mbchar_alloc = 2 * mbcset->nmbchars + 1;
2924: /* Use realloc since array is NULL if *alloc == 0. */
2925: mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2926: mbchar_alloc);
2927: if (BE (mbcset->mbchars == NULL, 0))
2928: goto parse_bracket_exp_espace;
2929: }
2930: mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2931: break;
2932: #endif /* RE_ENABLE_I18N */
2933: case EQUIV_CLASS:
2934: *err = build_equiv_class (sbcset,
2935: #ifdef RE_ENABLE_I18N
2936: mbcset, &equiv_class_alloc,
2937: #endif /* RE_ENABLE_I18N */
2938: start_elem.opr.name);
2939: if (BE (*err != REG_NOERROR, 0))
2940: goto parse_bracket_exp_free_return;
2941: break;
2942: case COLL_SYM:
2943: *err = build_collating_symbol (sbcset,
2944: #ifdef RE_ENABLE_I18N
2945: mbcset, &coll_sym_alloc,
2946: #endif /* RE_ENABLE_I18N */
2947: start_elem.opr.name);
2948: if (BE (*err != REG_NOERROR, 0))
2949: goto parse_bracket_exp_free_return;
2950: break;
2951: case CHAR_CLASS:
2952: *err = build_charclass (sbcset,
2953: #ifdef RE_ENABLE_I18N
2954: mbcset, &char_class_alloc,
2955: #endif /* RE_ENABLE_I18N */
2956: start_elem.opr.name, syntax);
2957: if (BE (*err != REG_NOERROR, 0))
2958: goto parse_bracket_exp_free_return;
2959: break;
2960: default:
2961: assert (0);
2962: break;
2963: }
2964: }
2965: if (token->type == OP_CLOSE_BRACKET)
2966: break;
2967: }
2968:
2969: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2970:
2971: /* If it is non-matching list. */
2972: #ifdef RE_ENABLE_I18N
2973: if (mbcset->non_match)
2974: #else /* not RE_ENABLE_I18N */
2975: if (non_match)
2976: #endif /* not RE_ENABLE_I18N */
2977: bitset_not (sbcset);
2978:
2979: /* Build a tree for simple bracket. */
2980: br_token.type = SIMPLE_BRACKET;
2981: br_token.opr.sbcset = sbcset;
2982: new_idx = re_dfa_add_node (dfa, br_token, 0);
2983: work_tree = create_tree (NULL, NULL, 0, new_idx);
2984: if (BE (new_idx == -1 || work_tree == NULL, 0))
2985: goto parse_bracket_exp_espace;
2986:
2987: #ifdef RE_ENABLE_I18N
2988: if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2989: || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2990: || mbcset->non_match)))
2991: {
2992: re_token_t alt_token;
2993: bin_tree_t *mbc_tree;
2994: /* Build a tree for complex bracket. */
2995: br_token.type = COMPLEX_BRACKET;
2996: br_token.opr.mbcset = mbcset;
2997: dfa->has_mb_node = 1;
2998: new_idx = re_dfa_add_node (dfa, br_token, 0);
2999: mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3000: if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3001: goto parse_bracket_exp_espace;
3002: /* Then join them by ALT node. */
3003: dfa->has_plural_match = 1;
3004: alt_token.type = OP_ALT;
3005: new_idx = re_dfa_add_node (dfa, alt_token, 0);
3006: work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3007: if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3008: return work_tree;
3009: }
3010: else
3011: {
3012: free_charset (mbcset);
3013: return work_tree;
3014: }
3015: #else /* not RE_ENABLE_I18N */
3016: return work_tree;
3017: #endif /* not RE_ENABLE_I18N */
3018:
3019: parse_bracket_exp_espace:
3020: *err = REG_ESPACE;
3021: parse_bracket_exp_free_return:
3022: re_free (sbcset);
3023: #ifdef RE_ENABLE_I18N
3024: free_charset (mbcset);
3025: #endif /* RE_ENABLE_I18N */
3026: return NULL;
3027: }
3028:
3029: /* Parse an element in the bracket expression. */
3030:
3031: static reg_errcode_t
3032: parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3033: bracket_elem_t *elem;
3034: re_string_t *regexp;
3035: re_token_t *token;
3036: int token_len;
3037: re_dfa_t *dfa;
3038: reg_syntax_t syntax;
3039: {
3040: #ifdef RE_ENABLE_I18N
3041: int cur_char_size;
3042: cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3043: if (cur_char_size > 1)
3044: {
3045: elem->type = MB_CHAR;
3046: elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3047: re_string_skip_bytes (regexp, cur_char_size);
3048: return REG_NOERROR;
3049: }
3050: #endif /* RE_ENABLE_I18N */
3051: re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3052: if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3053: || token->type == OP_OPEN_EQUIV_CLASS)
3054: return parse_bracket_symbol (elem, regexp, token);
3055: elem->type = SB_CHAR;
3056: elem->opr.ch = token->opr.c;
3057: return REG_NOERROR;
3058: }
3059:
3060: /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3061: such as [:<character_class>:], [.<collating_element>.], and
3062: [=<equivalent_class>=]. */
3063:
3064: static reg_errcode_t
3065: parse_bracket_symbol (elem, regexp, token)
3066: bracket_elem_t *elem;
3067: re_string_t *regexp;
3068: re_token_t *token;
3069: {
3070: unsigned char ch, delim = token->opr.c;
3071: int i = 0;
3072: for (;; ++i)
3073: {
3074: if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3075: return REG_EBRACK;
3076: if (token->type == OP_OPEN_CHAR_CLASS)
3077: ch = re_string_fetch_byte_case (regexp);
3078: else
3079: ch = re_string_fetch_byte (regexp);
3080: if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3081: break;
3082: elem->opr.name[i] = ch;
3083: }
3084: re_string_skip_bytes (regexp, 1);
3085: elem->opr.name[i] = '\0';
3086: switch (token->type)
3087: {
3088: case OP_OPEN_COLL_ELEM:
3089: elem->type = COLL_SYM;
3090: break;
3091: case OP_OPEN_EQUIV_CLASS:
3092: elem->type = EQUIV_CLASS;
3093: break;
3094: case OP_OPEN_CHAR_CLASS:
3095: elem->type = CHAR_CLASS;
3096: break;
3097: default:
3098: break;
3099: }
3100: return REG_NOERROR;
3101: }
3102:
3103: /* Helper function for parse_bracket_exp.
3104: Build the equivalence class which is represented by NAME.
3105: The result are written to MBCSET and SBCSET.
3106: EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3107: is a pointer argument sinse we may update it. */
3108:
3109: static reg_errcode_t
3110: #ifdef RE_ENABLE_I18N
3111: build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3112: re_charset_t *mbcset;
3113: int *equiv_class_alloc;
3114: #else /* not RE_ENABLE_I18N */
3115: build_equiv_class (sbcset, name)
3116: #endif /* not RE_ENABLE_I18N */
3117: re_bitset_ptr_t sbcset;
3118: const unsigned char *name;
3119: {
3120: #if defined _LIBC && defined RE_ENABLE_I18N
3121: uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3122: if (nrules != 0)
3123: {
3124: const int32_t *table, *indirect;
3125: const unsigned char *weights, *extra, *cp;
3126: unsigned char char_buf[2];
3127: int32_t idx1, idx2;
3128: unsigned int ch;
3129: size_t len;
3130: /* This #include defines a local function! */
3131: # include <locale/weight.h>
3132: /* Calculate the index for equivalence class. */
3133: cp = name;
3134: table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3135: weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3136: _NL_COLLATE_WEIGHTMB);
3137: extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3138: _NL_COLLATE_EXTRAMB);
3139: indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3140: _NL_COLLATE_INDIRECTMB);
3141: idx1 = findidx (&cp);
3142: if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3143: /* This isn't a valid character. */
3144: return REG_ECOLLATE;
3145:
3146: /* Build single byte matcing table for this equivalence class. */
3147: char_buf[1] = (unsigned char) '\0';
3148: len = weights[idx1];
3149: for (ch = 0; ch < SBC_MAX; ++ch)
3150: {
3151: char_buf[0] = ch;
3152: cp = char_buf;
3153: idx2 = findidx (&cp);
3154: /*
3155: idx2 = table[ch];
3156: */
3157: if (idx2 == 0)
3158: /* This isn't a valid character. */
3159: continue;
3160: if (len == weights[idx2])
3161: {
3162: int cnt = 0;
3163: while (cnt <= len &&
3164: weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3165: ++cnt;
3166:
3167: if (cnt > len)
3168: bitset_set (sbcset, ch);
3169: }
3170: }
3171: /* Check whether the array has enough space. */
3172: if (*equiv_class_alloc == mbcset->nequiv_classes)
3173: {
3174: /* Not enough, realloc it. */
3175: /* +1 in case of mbcset->nequiv_classes is 0. */
3176: *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3177: /* Use realloc since the array is NULL if *alloc == 0. */
3178: mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3179: *equiv_class_alloc);
3180: if (BE (mbcset->equiv_classes == NULL, 0))
3181: return REG_ESPACE;
3182: }
3183: mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3184: }
3185: else
3186: #endif /* _LIBC && RE_ENABLE_I18N */
3187: {
3188: if (BE (strlen ((const char *) name) != 1, 0))
3189: return REG_ECOLLATE;
3190: bitset_set (sbcset, *name);
3191: }
3192: return REG_NOERROR;
3193: }
3194:
3195: /* Helper function for parse_bracket_exp.
3196: Build the character class which is represented by NAME.
3197: The result are written to MBCSET and SBCSET.
3198: CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3199: is a pointer argument sinse we may update it. */
3200:
3201: static reg_errcode_t
3202: #ifdef RE_ENABLE_I18N
3203: build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3204: re_charset_t *mbcset;
3205: int *char_class_alloc;
3206: #else /* not RE_ENABLE_I18N */
3207: build_charclass (sbcset, class_name, syntax)
3208: #endif /* not RE_ENABLE_I18N */
3209: re_bitset_ptr_t sbcset;
3210: const unsigned char *class_name;
3211: reg_syntax_t syntax;
3212: {
3213: int i;
3214: const char *name = (const char *) class_name;
3215:
3216: /* In case of REG_ICASE "upper" and "lower" match the both of
3217: upper and lower cases. */
3218: if ((syntax & RE_ICASE)
3219: && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3220: name = "alpha";
3221:
3222: #ifdef RE_ENABLE_I18N
3223: /* Check the space of the arrays. */
3224: if (*char_class_alloc == mbcset->nchar_classes)
3225: {
3226: /* Not enough, realloc it. */
3227: /* +1 in case of mbcset->nchar_classes is 0. */
3228: *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3229: /* Use realloc since array is NULL if *alloc == 0. */
3230: mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3231: *char_class_alloc);
3232: if (BE (mbcset->char_classes == NULL, 0))
3233: return REG_ESPACE;
3234: }
3235: mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3236: #endif /* RE_ENABLE_I18N */
3237:
3238: #define BUILD_CHARCLASS_LOOP(ctype_func)\
3239: for (i = 0; i < SBC_MAX; ++i) \
3240: { \
3241: if (ctype_func (i)) \
3242: bitset_set (sbcset, i); \
3243: }
3244:
3245: if (strcmp (name, "alnum") == 0)
3246: BUILD_CHARCLASS_LOOP (isalnum)
3247: else if (strcmp (name, "cntrl") == 0)
3248: BUILD_CHARCLASS_LOOP (iscntrl)
3249: else if (strcmp (name, "lower") == 0)
3250: BUILD_CHARCLASS_LOOP (islower)
3251: else if (strcmp (name, "space") == 0)
3252: BUILD_CHARCLASS_LOOP (isspace)
3253: else if (strcmp (name, "alpha") == 0)
3254: BUILD_CHARCLASS_LOOP (isalpha)
3255: else if (strcmp (name, "digit") == 0)
3256: BUILD_CHARCLASS_LOOP (isdigit)
3257: else if (strcmp (name, "print") == 0)
3258: BUILD_CHARCLASS_LOOP (isprint)
3259: else if (strcmp (name, "upper") == 0)
3260: BUILD_CHARCLASS_LOOP (isupper)
3261: else if (strcmp (name, "blank") == 0)
3262: BUILD_CHARCLASS_LOOP (isblank)
3263: else if (strcmp (name, "graph") == 0)
3264: BUILD_CHARCLASS_LOOP (isgraph)
3265: else if (strcmp (name, "punct") == 0)
3266: BUILD_CHARCLASS_LOOP (ispunct)
3267: else if (strcmp (name, "xdigit") == 0)
3268: BUILD_CHARCLASS_LOOP (isxdigit)
3269: else
3270: return REG_ECTYPE;
3271:
3272: return REG_NOERROR;
3273: }
3274:
3275: static bin_tree_t *
3276: build_word_op (dfa, not, err)
3277: re_dfa_t *dfa;
3278: int not;
3279: reg_errcode_t *err;
3280: {
3281: re_bitset_ptr_t sbcset;
3282: #ifdef RE_ENABLE_I18N
3283: re_charset_t *mbcset;
3284: int alloc = 0;
3285: #else /* not RE_ENABLE_I18N */
3286: int non_match = 0;
3287: #endif /* not RE_ENABLE_I18N */
3288: reg_errcode_t ret;
3289: re_token_t br_token;
3290: bin_tree_t *tree;
3291: int new_idx;
3292:
3293: sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3294: #ifdef RE_ENABLE_I18N
3295: mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3296: #endif /* RE_ENABLE_I18N */
3297:
3298: #ifdef RE_ENABLE_I18N
3299: if (BE (sbcset == NULL || mbcset == NULL, 0))
3300: #else /* not RE_ENABLE_I18N */
3301: if (BE (sbcset == NULL, 0))
3302: #endif /* not RE_ENABLE_I18N */
3303: {
3304: *err = REG_ESPACE;
3305: return NULL;
3306: }
3307:
3308: if (not)
3309: {
3310: #ifdef RE_ENABLE_I18N
3311: int i;
3312: /*
3313: if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3314: bitset_set(cset->sbcset, '\0');
3315: */
3316: mbcset->non_match = 1;
3317: if (MB_CUR_MAX > 1)
3318: for (i = 0; i < SBC_MAX; ++i)
3319: if (__btowc (i) == WEOF)
3320: bitset_set (sbcset, i);
3321: #else /* not RE_ENABLE_I18N */
3322: non_match = 1;
3323: #endif /* not RE_ENABLE_I18N */
3324: }
3325:
3326: /* We don't care the syntax in this case. */
3327: ret = build_charclass (sbcset,
3328: #ifdef RE_ENABLE_I18N
3329: mbcset, &alloc,
3330: #endif /* RE_ENABLE_I18N */
3331: (const unsigned char *) "alpha", 0);
3332:
3333: if (BE (ret != REG_NOERROR, 0))
3334: {
3335: re_free (sbcset);
3336: #ifdef RE_ENABLE_I18N
3337: free_charset (mbcset);
3338: #endif /* RE_ENABLE_I18N */
3339: *err = ret;
3340: return NULL;
3341: }
3342: /* \w match '_' also. */
3343: bitset_set (sbcset, '_');
3344:
3345: /* If it is non-matching list. */
3346: #ifdef RE_ENABLE_I18N
3347: if (mbcset->non_match)
3348: #else /* not RE_ENABLE_I18N */
3349: if (non_match)
3350: #endif /* not RE_ENABLE_I18N */
3351: bitset_not (sbcset);
3352:
3353: /* Build a tree for simple bracket. */
3354: br_token.type = SIMPLE_BRACKET;
3355: br_token.opr.sbcset = sbcset;
3356: new_idx = re_dfa_add_node (dfa, br_token, 0);
3357: tree = create_tree (NULL, NULL, 0, new_idx);
3358: if (BE (new_idx == -1 || tree == NULL, 0))
3359: goto build_word_op_espace;
3360:
3361: #ifdef RE_ENABLE_I18N
3362: if (MB_CUR_MAX > 1)
3363: {
3364: re_token_t alt_token;
3365: bin_tree_t *mbc_tree;
3366: /* Build a tree for complex bracket. */
3367: br_token.type = COMPLEX_BRACKET;
3368: br_token.opr.mbcset = mbcset;
3369: dfa->has_mb_node = 1;
3370: new_idx = re_dfa_add_node (dfa, br_token, 0);
3371: mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3372: if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3373: goto build_word_op_espace;
3374: /* Then join them by ALT node. */
3375: alt_token.type = OP_ALT;
3376: new_idx = re_dfa_add_node (dfa, alt_token, 0);
3377: tree = create_tree (tree, mbc_tree, 0, new_idx);
3378: if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3379: return tree;
3380: }
3381: else
3382: {
3383: free_charset (mbcset);
3384: return tree;
3385: }
3386: #else /* not RE_ENABLE_I18N */
3387: return tree;
3388: #endif /* not RE_ENABLE_I18N */
3389:
3390: build_word_op_espace:
3391: re_free (sbcset);
3392: #ifdef RE_ENABLE_I18N
3393: free_charset (mbcset);
3394: #endif /* RE_ENABLE_I18N */
3395: *err = REG_ESPACE;
3396: return NULL;
3397: }
3398:
3399: /* This is intended for the expressions like "a{1,3}".
3400: Fetch a number from `input', and return the number.
3401: Return -1, if the number field is empty like "{,1}".
3402: Return -2, If an error is occured. */
3403:
3404: static int
3405: fetch_number (input, token, syntax)
3406: re_string_t *input;
3407: re_token_t *token;
3408: reg_syntax_t syntax;
3409: {
3410: int num = -1;
3411: unsigned char c;
3412: while (1)
3413: {
3414: *token = fetch_token (input, syntax);
3415: c = token->opr.c;
3416: if (BE (token->type == END_OF_RE, 0))
3417: return -2;
3418: if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3419: break;
3420: num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3421: ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3422: num = (num > RE_DUP_MAX) ? -2 : num;
3423: }
3424: return num;
3425: }
3426:
3427: #ifdef RE_ENABLE_I18N
3428: static void
3429: free_charset (re_charset_t *cset)
3430: {
3431: re_free (cset->mbchars);
3432: # ifdef _LIBC
3433: re_free (cset->coll_syms);
3434: re_free (cset->equiv_classes);
3435: re_free (cset->range_starts);
3436: re_free (cset->range_ends);
3437: # endif
3438: re_free (cset->char_classes);
3439: re_free (cset);
3440: }
3441: #endif /* RE_ENABLE_I18N */
3442:
3443: /* Functions for binary tree operation. */
3444:
3445: /* Create a node of tree.
3446: Note: This function automatically free left and right if malloc fails. */
3447:
3448: static bin_tree_t *
3449: create_tree (left, right, type, index)
3450: bin_tree_t *left;
3451: bin_tree_t *right;
3452: re_token_type_t type;
3453: int index;
3454: {
3455: bin_tree_t *tree;
3456: tree = re_malloc (bin_tree_t, 1);
3457: if (BE (tree == NULL, 0))
3458: {
3459: free_bin_tree (left);
3460: free_bin_tree (right);
3461: return NULL;
3462: }
3463: tree->parent = NULL;
3464: tree->left = left;
3465: tree->right = right;
3466: tree->type = type;
3467: tree->node_idx = index;
3468: tree->first = -1;
3469: tree->next = -1;
3470: re_node_set_init_empty (&tree->eclosure);
3471:
3472: if (left != NULL)
3473: left->parent = tree;
3474: if (right != NULL)
3475: right->parent = tree;
3476: return tree;
3477: }
3478:
3479: /* Free the sub tree pointed by TREE. */
3480:
3481: static void
3482: free_bin_tree (tree)
3483: bin_tree_t *tree;
3484: {
3485: if (tree == NULL)
3486: return;
3487: /*re_node_set_free (&tree->eclosure);*/
3488: free_bin_tree (tree->left);
3489: free_bin_tree (tree->right);
3490: re_free (tree);
3491: }
3492:
3493: /* Duplicate the node SRC, and return new node. */
3494:
3495: static bin_tree_t *
3496: duplicate_tree (src, dfa)
3497: const bin_tree_t *src;
3498: re_dfa_t *dfa;
3499: {
3500: bin_tree_t *left = NULL, *right = NULL, *new_tree;
3501: int new_node_idx;
3502: /* Since node indies must be according to Post-order of the tree,
3503: we must duplicate the left at first. */
3504: if (src->left != NULL)
3505: {
3506: left = duplicate_tree (src->left, dfa);
3507: if (left == NULL)
3508: return NULL;
3509: }
3510:
3511: /* Secondaly, duplicate the right. */
3512: if (src->right != NULL)
3513: {
3514: right = duplicate_tree (src->right, dfa);
3515: if (right == NULL)
3516: {
3517: free_bin_tree (left);
3518: return NULL;
3519: }
3520: }
3521:
3522: /* At last, duplicate itself. */
3523: if (src->type == NON_TYPE)
3524: {
3525: new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3526: dfa->nodes[new_node_idx].duplicated = 1;
3527: if (BE (new_node_idx == -1, 0))
3528: {
3529: free_bin_tree (left);
3530: free_bin_tree (right);
3531: return NULL;
3532: }
3533: }
3534: else
3535: new_node_idx = src->type;
3536:
3537: new_tree = create_tree (left, right, src->type, new_node_idx);
3538: if (BE (new_tree == NULL, 0))
3539: {
3540: free_bin_tree (left);
3541: free_bin_tree (right);
3542: }
3543: return new_tree;
3544: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>