File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / smartmontools / regex / regcomp.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 01:17:36 2013 UTC (10 years, 11 months ago) by misho
Branches: smartmontools, elwix, MAIN
CVS tags: v6_2, v6_1p0, v6_1, HEAD
6.1

    1: /* Extended regular expression matching and search library.
    2:    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
    3:    This file is part of the GNU C Library.
    4:    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
    5: 
    6:    The GNU C Library is free software; you can redistribute it and/or
    7:    modify it under the terms of the GNU Lesser General Public
    8:    License as published by the Free Software Foundation; either
    9:    version 2.1 of the License, or (at your option) any later version.
   10: 
   11:    The GNU C Library is distributed in the hope that it will be useful,
   12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14:    Lesser General Public License for more details.
   15: 
   16:    You should have received a copy of the GNU Lesser General Public
   17:    License along with the GNU C Library; if not, write to the Free
   18:    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   19:    MA 02110-1301 USA.  */
   20: 
   21: static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
   22: 					  int length, reg_syntax_t syntax);
   23: static void re_compile_fastmap_iter (regex_t *bufp,
   24: 				     const re_dfastate_t *init_state,
   25: 				     char *fastmap);
   26: static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
   27: static reg_errcode_t init_word_char (re_dfa_t *dfa);
   28: #ifdef RE_ENABLE_I18N
   29: static void free_charset (re_charset_t *cset);
   30: #endif /* RE_ENABLE_I18N */
   31: static void free_workarea_compile (regex_t *preg);
   32: static reg_errcode_t create_initial_state (re_dfa_t *dfa);
   33: static reg_errcode_t analyze (re_dfa_t *dfa);
   34: static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
   35: static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
   36: static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
   37: static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
   38: static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
   39: 					     int top_clone_node, int root_node,
   40: 					     unsigned int constraint);
   41: static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
   42: 				     unsigned int constraint);
   43: static int search_duplicated_node (re_dfa_t *dfa, int org_node,
   44: 				   unsigned int constraint);
   45: static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
   46: static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
   47: 					 int node, int root);
   48: static void calc_inveclosure (re_dfa_t *dfa);
   49: static int fetch_number (re_string_t *input, re_token_t *token,
   50: 			 reg_syntax_t syntax);
   51: static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
   52: static int peek_token (re_token_t *token, re_string_t *input,
   53: 			reg_syntax_t syntax);
   54: static int peek_token_bracket (re_token_t *token, re_string_t *input,
   55: 			       reg_syntax_t syntax);
   56: static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
   57: 			  reg_syntax_t syntax, reg_errcode_t *err);
   58: static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
   59: 				  re_token_t *token, reg_syntax_t syntax,
   60: 				  int nest, reg_errcode_t *err);
   61: static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
   62: 				 re_token_t *token, reg_syntax_t syntax,
   63: 				 int nest, reg_errcode_t *err);
   64: static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
   65: 				     re_token_t *token, reg_syntax_t syntax,
   66: 				     int nest, reg_errcode_t *err);
   67: static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
   68: 				  re_token_t *token, reg_syntax_t syntax,
   69: 				  int nest, reg_errcode_t *err);
   70: static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
   71: 				 re_dfa_t *dfa, re_token_t *token,
   72: 				 reg_syntax_t syntax, reg_errcode_t *err);
   73: static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
   74: 				      re_token_t *token, reg_syntax_t syntax,
   75: 				      reg_errcode_t *err);
   76: static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
   77: 					    re_string_t *regexp,
   78: 					    re_token_t *token, int token_len,
   79: 					    re_dfa_t *dfa,
   80: 					    reg_syntax_t syntax);
   81: static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
   82: 					  re_string_t *regexp,
   83: 					  re_token_t *token);
   84: #ifndef _LIBC
   85: # ifdef RE_ENABLE_I18N
   86: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
   87: 				      re_charset_t *mbcset, int *range_alloc,
   88: 				      bracket_elem_t *start_elem,
   89: 				      bracket_elem_t *end_elem);
   90: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
   91: 					     re_charset_t *mbcset,
   92: 					     int *coll_sym_alloc,
   93: 					     const unsigned char *name);
   94: # else /* not RE_ENABLE_I18N */
   95: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
   96: 				      bracket_elem_t *start_elem,
   97: 				      bracket_elem_t *end_elem);
   98: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
   99: 					     const unsigned char *name);
  100: # endif /* not RE_ENABLE_I18N */
  101: #endif /* not _LIBC */
  102: #ifdef RE_ENABLE_I18N
  103: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  104: 					re_charset_t *mbcset,
  105: 					int *equiv_class_alloc,
  106: 					const unsigned char *name);
  107: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
  108: 				      re_charset_t *mbcset,
  109: 				      int *char_class_alloc,
  110: 				      const unsigned char *class_name,
  111: 				      reg_syntax_t syntax);
  112: #else  /* not RE_ENABLE_I18N */
  113: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
  114: 					const unsigned char *name);
  115: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
  116: 				      const unsigned char *class_name,
  117: 				      reg_syntax_t syntax);
  118: #endif /* not RE_ENABLE_I18N */
  119: static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
  120: static void free_bin_tree (bin_tree_t *tree);
  121: static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
  122: 				re_token_type_t type, int index);
  123: static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
  124: 
  125: /* This table gives an error message for each of the error codes listed
  126:    in regex.h.  Obviously the order here has to be same as there.
  127:    POSIX doesn't require that we do anything for REG_NOERROR,
  128:    but why not be nice?  */
  129: 
  130: const char __re_error_msgid[] attribute_hidden =
  131:   {
  132: #define REG_NOERROR_IDX	0
  133:     gettext_noop ("Success")	/* REG_NOERROR */
  134:     "\0"
  135: #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
  136:     gettext_noop ("No match")	/* REG_NOMATCH */
  137:     "\0"
  138: #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
  139:     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
  140:     "\0"
  141: #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
  142:     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
  143:     "\0"
  144: #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
  145:     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
  146:     "\0"
  147: #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
  148:     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
  149:     "\0"
  150: #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
  151:     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
  152:     "\0"
  153: #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
  154:     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
  155:     "\0"
  156: #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
  157:     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
  158:     "\0"
  159: #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
  160:     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
  161:     "\0"
  162: #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
  163:     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
  164:     "\0"
  165: #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
  166:     gettext_noop ("Invalid range end")	/* REG_ERANGE */
  167:     "\0"
  168: #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
  169:     gettext_noop ("Memory exhausted") /* REG_ESPACE */
  170:     "\0"
  171: #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
  172:     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
  173:     "\0"
  174: #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
  175:     gettext_noop ("Premature end of regular expression") /* REG_EEND */
  176:     "\0"
  177: #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
  178:     gettext_noop ("Regular expression too big") /* REG_ESIZE */
  179:     "\0"
  180: #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
  181:     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  182:   };
  183: 
  184: const size_t __re_error_msgid_idx[] attribute_hidden =
  185:   {
  186:     REG_NOERROR_IDX,
  187:     REG_NOMATCH_IDX,
  188:     REG_BADPAT_IDX,
  189:     REG_ECOLLATE_IDX,
  190:     REG_ECTYPE_IDX,
  191:     REG_EESCAPE_IDX,
  192:     REG_ESUBREG_IDX,
  193:     REG_EBRACK_IDX,
  194:     REG_EPAREN_IDX,
  195:     REG_EBRACE_IDX,
  196:     REG_BADBR_IDX,
  197:     REG_ERANGE_IDX,
  198:     REG_ESPACE_IDX,
  199:     REG_BADRPT_IDX,
  200:     REG_EEND_IDX,
  201:     REG_ESIZE_IDX,
  202:     REG_ERPAREN_IDX
  203:   };
  204: 
  205: /* Entry points for GNU code.  */
  206: 
  207: /* re_compile_pattern is the GNU regular expression compiler: it
  208:    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
  209:    Returns 0 if the pattern was valid, otherwise an error string.
  210: 
  211:    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
  212:    are set in BUFP on entry.  */
  213: 
  214: const char *
  215: re_compile_pattern (pattern, length, bufp)
  216:     const char *pattern;
  217:     size_t length;
  218:     struct re_pattern_buffer *bufp;
  219: {
  220:   reg_errcode_t ret;
  221: 
  222:   /* And GNU code determines whether or not to get register information
  223:      by passing null for the REGS argument to re_match, etc., not by
  224:      setting no_sub.  */
  225:   bufp->no_sub = 0;
  226: 
  227:   /* Match anchors at newline.  */
  228:   bufp->newline_anchor = 1;
  229: 
  230:   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
  231: 
  232:   if (!ret)
  233:     return NULL;
  234:   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
  235: }
  236: #ifdef _LIBC
  237: weak_alias (__re_compile_pattern, re_compile_pattern)
  238: #endif
  239: 
  240: /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
  241:    also be assigned to arbitrarily: each pattern buffer stores its own
  242:    syntax, so it can be changed between regex compilations.  */
  243: /* This has no initializer because initialized variables in Emacs
  244:    become read-only after dumping.  */
  245: reg_syntax_t re_syntax_options;
  246: 
  247: 
  248: /* Specify the precise syntax of regexps for compilation.  This provides
  249:    for compatibility for various utilities which historically have
  250:    different, incompatible syntaxes.
  251: 
  252:    The argument SYNTAX is a bit mask comprised of the various bits
  253:    defined in regex.h.  We return the old syntax.  */
  254: 
  255: reg_syntax_t
  256: re_set_syntax (syntax)
  257:     reg_syntax_t syntax;
  258: {
  259:   reg_syntax_t ret = re_syntax_options;
  260: 
  261:   re_syntax_options = syntax;
  262:   return ret;
  263: }
  264: #ifdef _LIBC
  265: weak_alias (__re_set_syntax, re_set_syntax)
  266: #endif
  267: 
  268: int
  269: re_compile_fastmap (bufp)
  270:     struct re_pattern_buffer *bufp;
  271: {
  272:   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
  273:   char *fastmap = bufp->fastmap;
  274: 
  275:   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
  276:   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
  277:   if (dfa->init_state != dfa->init_state_word)
  278:     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
  279:   if (dfa->init_state != dfa->init_state_nl)
  280:     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
  281:   if (dfa->init_state != dfa->init_state_begbuf)
  282:     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
  283:   bufp->fastmap_accurate = 1;
  284:   return 0;
  285: }
  286: #ifdef _LIBC
  287: weak_alias (__re_compile_fastmap, re_compile_fastmap)
  288: #endif
  289: 
  290: static inline void
  291: re_set_fastmap (char *fastmap, int icase, int ch)
  292: {
  293:   fastmap[ch] = 1;
  294:   if (icase)
  295:     fastmap[tolower (ch)] = 1;
  296: }
  297: 
  298: /* Helper function for re_compile_fastmap.
  299:    Compile fastmap for the initial_state INIT_STATE.  */
  300: 
  301: static void
  302: re_compile_fastmap_iter (bufp, init_state, fastmap)
  303:      regex_t *bufp;
  304:      const re_dfastate_t *init_state;
  305:      char *fastmap;
  306: {
  307:   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
  308:   int node_cnt;
  309:   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
  310:   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
  311:     {
  312:       int node = init_state->nodes.elems[node_cnt];
  313:       re_token_type_t type = dfa->nodes[node].type;
  314: 
  315:       if (type == CHARACTER)
  316: 	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
  317:       else if (type == SIMPLE_BRACKET)
  318: 	{
  319: 	  int i, j, ch;
  320: 	  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
  321: 	    for (j = 0; j < UINT_BITS; ++j, ++ch)
  322: 	      if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
  323: 		re_set_fastmap (fastmap, icase, ch);
  324: 	}
  325: #ifdef RE_ENABLE_I18N
  326:       else if (type == COMPLEX_BRACKET)
  327: 	{
  328: 	  int i;
  329: 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
  330: 	  if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
  331: 	      || cset->nranges || cset->nchar_classes)
  332: 	    {
  333: # ifdef _LIBC
  334: 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
  335: 		{
  336: 		  /* In this case we want to catch the bytes which are
  337: 		     the first byte of any collation elements.
  338: 		     e.g. In da_DK, we want to catch 'a' since "aa"
  339: 			  is a valid collation element, and don't catch
  340: 			  'b' since 'b' is the only collation element
  341: 			  which starts from 'b'.  */
  342: 		  int j, ch;
  343: 		  const int32_t *table = (const int32_t *)
  344: 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
  345: 		  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
  346: 		    for (j = 0; j < UINT_BITS; ++j, ++ch)
  347: 		      if (table[ch] < 0)
  348: 			re_set_fastmap (fastmap, icase, ch);
  349: 		}
  350: # else
  351: 	      if (MB_CUR_MAX > 1)
  352: 		for (i = 0; i < SBC_MAX; ++i)
  353: 		  if (__btowc (i) == WEOF)
  354: 		    re_set_fastmap (fastmap, icase, i);
  355: # endif /* not _LIBC */
  356: 	    }
  357: 	  for (i = 0; i < cset->nmbchars; ++i)
  358: 	    {
  359: 	      char buf[256];
  360: 	      mbstate_t state;
  361: 	      memset (&state, '\0', sizeof (state));
  362: 	      __wcrtomb (buf, cset->mbchars[i], &state);
  363: 	      re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
  364: 	    }
  365: 	}
  366: #endif /* RE_ENABLE_I18N */
  367:       else if (type == END_OF_RE || type == OP_PERIOD)
  368: 	{
  369: 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
  370: 	  if (type == END_OF_RE)
  371: 	    bufp->can_be_null = 1;
  372: 	  return;
  373: 	}
  374:     }
  375: }
  376: 
  377: /* Entry point for POSIX code.  */
  378: /* regcomp takes a regular expression as a string and compiles it.
  379: 
  380:    PREG is a regex_t *.  We do not expect any fields to be initialized,
  381:    since POSIX says we shouldn't.  Thus, we set
  382: 
  383:      `buffer' to the compiled pattern;
  384:      `used' to the length of the compiled pattern;
  385:      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
  386:        REG_EXTENDED bit in CFLAGS is set; otherwise, to
  387:        RE_SYNTAX_POSIX_BASIC;
  388:      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
  389:      `fastmap' to an allocated space for the fastmap;
  390:      `fastmap_accurate' to zero;
  391:      `re_nsub' to the number of subexpressions in PATTERN.
  392: 
  393:    PATTERN is the address of the pattern string.
  394: 
  395:    CFLAGS is a series of bits which affect compilation.
  396: 
  397:      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
  398:      use POSIX basic syntax.
  399: 
  400:      If REG_NEWLINE is set, then . and [^...] don't match newline.
  401:      Also, regexec will try a match beginning after every newline.
  402: 
  403:      If REG_ICASE is set, then we considers upper- and lowercase
  404:      versions of letters to be equivalent when matching.
  405: 
  406:      If REG_NOSUB is set, then when PREG is passed to regexec, that
  407:      routine will report only success or failure, and nothing about the
  408:      registers.
  409: 
  410:    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
  411:    the return codes and their meanings.)  */
  412: 
  413: int
  414: regcomp (preg, pattern, cflags)
  415:     regex_t *__restrict preg;
  416:     const char *__restrict pattern;
  417:     int cflags;
  418: {
  419:   reg_errcode_t ret;
  420:   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
  421: 			 : RE_SYNTAX_POSIX_BASIC);
  422: 
  423:   preg->buffer = NULL;
  424:   preg->allocated = 0;
  425:   preg->used = 0;
  426: 
  427:   /* Try to allocate space for the fastmap.  */
  428:   preg->fastmap = re_malloc (char, SBC_MAX);
  429:   if (BE (preg->fastmap == NULL, 0))
  430:     return REG_ESPACE;
  431: 
  432:   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
  433: 
  434:   /* If REG_NEWLINE is set, newlines are treated differently.  */
  435:   if (cflags & REG_NEWLINE)
  436:     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
  437:       syntax &= ~RE_DOT_NEWLINE;
  438:       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
  439:       /* It also changes the matching behavior.  */
  440:       preg->newline_anchor = 1;
  441:     }
  442:   else
  443:     preg->newline_anchor = 0;
  444:   preg->no_sub = !!(cflags & REG_NOSUB);
  445:   preg->translate = NULL;
  446: 
  447:   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
  448: 
  449:   /* POSIX doesn't distinguish between an unmatched open-group and an
  450:      unmatched close-group: both are REG_EPAREN.  */
  451:   if (ret == REG_ERPAREN)
  452:     ret = REG_EPAREN;
  453: 
  454:   /* We have already checked preg->fastmap != NULL.  */
  455:   if (BE (ret == REG_NOERROR, 1))
  456:     /* Compute the fastmap now, since regexec cannot modify the pattern
  457:        buffer.  This function nevers fails in this implementation.  */
  458:     (void) re_compile_fastmap (preg);
  459:   else
  460:     {
  461:       /* Some error occurred while compiling the expression.  */
  462:       re_free (preg->fastmap);
  463:       preg->fastmap = NULL;
  464:     }
  465: 
  466:   return (int) ret;
  467: }
  468: #ifdef _LIBC
  469: weak_alias (__regcomp, regcomp)
  470: #endif
  471: 
  472: /* Returns a message corresponding to an error code, ERRCODE, returned
  473:    from either regcomp or regexec.   We don't use PREG here.  */
  474: 
  475: size_t
  476: regerror (errcode, preg, errbuf, errbuf_size)
  477:     int errcode;
  478:     const regex_t *preg;
  479:     char *errbuf;
  480:     size_t errbuf_size;
  481: {
  482:   const char *msg;
  483:   size_t msg_size;
  484: 
  485:   if (BE (errcode < 0
  486: 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
  487: 			       / sizeof (__re_error_msgid_idx[0])), 0))
  488:     /* Only error codes returned by the rest of the code should be passed
  489:        to this routine.  If we are given anything else, or if other regex
  490:        code generates an invalid error code, then the program has a bug.
  491:        Dump core so we can fix it.  */
  492:     abort ();
  493: 
  494:   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
  495: 
  496:   msg_size = strlen (msg) + 1; /* Includes the null.  */
  497: 
  498:   if (BE (errbuf_size != 0, 1))
  499:     {
  500:       if (BE (msg_size > errbuf_size, 0))
  501: 	{
  502: #if defined HAVE_MEMPCPY || defined _LIBC
  503: 	  *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
  504: #else
  505: 	  memcpy (errbuf, msg, errbuf_size - 1);
  506: 	  errbuf[errbuf_size - 1] = 0;
  507: #endif
  508: 	}
  509:       else
  510: 	memcpy (errbuf, msg, msg_size);
  511:     }
  512: 
  513:   return msg_size;
  514: }
  515: #ifdef _LIBC
  516: weak_alias (__regerror, regerror)
  517: #endif
  518: 
  519: 
  520: static void
  521: free_dfa_content (re_dfa_t *dfa)
  522: {
  523:   int i, j;
  524: 
  525:   re_free (dfa->subexps);
  526: 
  527:   for (i = 0; i < dfa->nodes_len; ++i)
  528:     {
  529:       re_token_t *node = dfa->nodes + i;
  530: #ifdef RE_ENABLE_I18N
  531:       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
  532: 	free_charset (node->opr.mbcset);
  533:       else
  534: #endif /* RE_ENABLE_I18N */
  535: 	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
  536: 	  re_free (node->opr.sbcset);
  537:     }
  538:   re_free (dfa->nexts);
  539:   for (i = 0; i < dfa->nodes_len; ++i)
  540:     {
  541:       if (dfa->eclosures != NULL)
  542: 	re_node_set_free (dfa->eclosures + i);
  543:       if (dfa->inveclosures != NULL)
  544: 	re_node_set_free (dfa->inveclosures + i);
  545:       if (dfa->edests != NULL)
  546: 	re_node_set_free (dfa->edests + i);
  547:     }
  548:   re_free (dfa->edests);
  549:   re_free (dfa->eclosures);
  550:   re_free (dfa->inveclosures);
  551:   re_free (dfa->nodes);
  552: 
  553:   for (i = 0; i <= dfa->state_hash_mask; ++i)
  554:     {
  555:       struct re_state_table_entry *entry = dfa->state_table + i;
  556:       for (j = 0; j < entry->num; ++j)
  557: 	{
  558: 	  re_dfastate_t *state = entry->array[j];
  559: 	  free_state (state);
  560: 	}
  561:       re_free (entry->array);
  562:     }
  563:   re_free (dfa->state_table);
  564: 
  565:   if (dfa->word_char != NULL)
  566:     re_free (dfa->word_char);
  567: #ifdef DEBUG
  568:   re_free (dfa->re_str);
  569: #endif
  570: 
  571:   re_free (dfa);
  572: }
  573: 
  574: 
  575: /* Free dynamically allocated space used by PREG.  */
  576: 
  577: void
  578: regfree (preg)
  579:     regex_t *preg;
  580: {
  581:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
  582:   if (BE (dfa != NULL, 1))
  583:     free_dfa_content (dfa);
  584: 
  585:   re_free (preg->fastmap);
  586: }
  587: #ifdef _LIBC
  588: weak_alias (__regfree, regfree)
  589: #endif
  590: 
  591: /* Entry points compatible with 4.2 BSD regex library.  We don't define
  592:    them unless specifically requested.  */
  593: 
  594: #if defined _REGEX_RE_COMP || defined _LIBC
  595: 
  596: /* BSD has one and only one pattern buffer.  */
  597: static struct re_pattern_buffer re_comp_buf;
  598: 
  599: char *
  600: # ifdef _LIBC
  601: /* Make these definitions weak in libc, so POSIX programs can redefine
  602:    these names if they don't use our functions, and still use
  603:    regcomp/regexec above without link errors.  */
  604: weak_function
  605: # endif
  606: re_comp (s)
  607:      const char *s;
  608: {
  609:   reg_errcode_t ret;
  610:   char *fastmap;
  611: 
  612:   if (!s)
  613:     {
  614:       if (!re_comp_buf.buffer)
  615: 	return gettext ("No previous regular expression");
  616:       return 0;
  617:     }
  618: 
  619:   if (re_comp_buf.buffer)
  620:     {
  621:       fastmap = re_comp_buf.fastmap;
  622:       re_comp_buf.fastmap = NULL;
  623:       __regfree (&re_comp_buf);
  624:       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
  625:       re_comp_buf.fastmap = fastmap;
  626:     }
  627: 
  628:   if (re_comp_buf.fastmap == NULL)
  629:     {
  630:       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
  631:       if (re_comp_buf.fastmap == NULL)
  632: 	return (char *) gettext (__re_error_msgid
  633: 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
  634:     }
  635: 
  636:   /* Since `re_exec' always passes NULL for the `regs' argument, we
  637:      don't need to initialize the pattern buffer fields which affect it.  */
  638: 
  639:   /* Match anchors at newlines.  */
  640:   re_comp_buf.newline_anchor = 1;
  641: 
  642:   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
  643: 
  644:   if (!ret)
  645:     return NULL;
  646: 
  647:   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
  648:   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
  649: }
  650: 
  651: #ifdef _LIBC
  652: libc_freeres_fn (free_mem)
  653: {
  654:   __regfree (&re_comp_buf);
  655: }
  656: #endif
  657: 
  658: #endif /* _REGEX_RE_COMP */
  659: 
  660: /* Internal entry point.
  661:    Compile the regular expression PATTERN, whose length is LENGTH.
  662:    SYNTAX indicate regular expression's syntax.  */
  663: 
  664: static reg_errcode_t
  665: re_compile_internal (preg, pattern, length, syntax)
  666:      regex_t *preg;
  667:      const char * pattern;
  668:      int length;
  669:      reg_syntax_t syntax;
  670: {
  671:   reg_errcode_t err = REG_NOERROR;
  672:   re_dfa_t *dfa;
  673:   re_string_t regexp;
  674: 
  675:   /* Initialize the pattern buffer.  */
  676:   preg->fastmap_accurate = 0;
  677:   preg->syntax = syntax;
  678:   preg->not_bol = preg->not_eol = 0;
  679:   preg->used = 0;
  680:   preg->re_nsub = 0;
  681:   preg->can_be_null = 0;
  682:   preg->regs_allocated = REGS_UNALLOCATED;
  683: 
  684:   /* Initialize the dfa.  */
  685:   dfa = (re_dfa_t *) preg->buffer;
  686:   if (preg->allocated < sizeof (re_dfa_t))
  687:     {
  688:       /* If zero allocated, but buffer is non-null, try to realloc
  689: 	 enough space.  This loses if buffer's address is bogus, but
  690: 	 that is the user's responsibility.  If ->buffer is NULL this
  691: 	 is a simple allocation.  */
  692:       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
  693:       if (dfa == NULL)
  694: 	return REG_ESPACE;
  695:       preg->allocated = sizeof (re_dfa_t);
  696:     }
  697:   preg->buffer = (unsigned char *) dfa;
  698:   preg->used = sizeof (re_dfa_t);
  699: 
  700:   err = init_dfa (dfa, length);
  701:   if (BE (err != REG_NOERROR, 0))
  702:     {
  703:       re_free (dfa);
  704:       preg->buffer = NULL;
  705:       preg->allocated = 0;
  706:       return err;
  707:     }
  708: #ifdef DEBUG
  709:   dfa->re_str = re_malloc (char, length + 1);
  710:   strncpy (dfa->re_str, pattern, length + 1);
  711: #endif
  712: 
  713:   err = re_string_construct (&regexp, pattern, length, preg->translate,
  714: 			     syntax & RE_ICASE);
  715:   if (BE (err != REG_NOERROR, 0))
  716:     {
  717:       re_free (dfa);
  718:       preg->buffer = NULL;
  719:       preg->allocated = 0;
  720:       return err;
  721:     }
  722: 
  723:   /* Parse the regular expression, and build a structure tree.  */
  724:   preg->re_nsub = 0;
  725:   dfa->str_tree = parse (&regexp, preg, syntax, &err);
  726:   if (BE (dfa->str_tree == NULL, 0))
  727:     goto re_compile_internal_free_return;
  728: 
  729:   /* Analyze the tree and collect information which is necessary to
  730:      create the dfa.  */
  731:   err = analyze (dfa);
  732:   if (BE (err != REG_NOERROR, 0))
  733:     goto re_compile_internal_free_return;
  734: 
  735:   /* Then create the initial state of the dfa.  */
  736:   err = create_initial_state (dfa);
  737: 
  738:   /* Release work areas.  */
  739:   free_workarea_compile (preg);
  740:   re_string_destruct (&regexp);
  741: 
  742:   if (BE (err != REG_NOERROR, 0))
  743:     {
  744:     re_compile_internal_free_return:
  745:       free_dfa_content (dfa);
  746:       preg->buffer = NULL;
  747:       preg->allocated = 0;
  748:     }
  749: 
  750:   return err;
  751: }
  752: 
  753: /* Initialize DFA.  We use the length of the regular expression PAT_LEN
  754:    as the initial length of some arrays.  */
  755: 
  756: static reg_errcode_t
  757: init_dfa (dfa, pat_len)
  758:      re_dfa_t *dfa;
  759:      int pat_len;
  760: {
  761:   int table_size;
  762: 
  763:   memset (dfa, '\0', sizeof (re_dfa_t));
  764: 
  765:   dfa->nodes_alloc = pat_len + 1;
  766:   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
  767: 
  768:   dfa->states_alloc = pat_len + 1;
  769: 
  770:   /*  table_size = 2 ^ ceil(log pat_len) */
  771:   for (table_size = 1; table_size > 0; table_size <<= 1)
  772:     if (table_size > pat_len)
  773:       break;
  774: 
  775:   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
  776:   dfa->state_hash_mask = table_size - 1;
  777: 
  778:   dfa->subexps_alloc = 1;
  779:   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
  780:   dfa->word_char = NULL;
  781: 
  782:   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
  783: 	  || dfa->subexps == NULL, 0))
  784:     {
  785:       /* We don't bother to free anything which was allocated.  Very
  786: 	 soon the process will go down anyway.  */
  787:       dfa->subexps = NULL;
  788:       dfa->state_table = NULL;
  789:       dfa->nodes = NULL;
  790:       return REG_ESPACE;
  791:     }
  792:   return REG_NOERROR;
  793: }
  794: 
  795: /* Initialize WORD_CHAR table, which indicate which character is
  796:    "word".  In this case "word" means that it is the word construction
  797:    character used by some operators like "\<", "\>", etc.  */
  798: 
  799: static reg_errcode_t
  800: init_word_char (dfa)
  801:      re_dfa_t *dfa;
  802: {
  803:   int i, j, ch;
  804:   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
  805:   if (BE (dfa->word_char == NULL, 0))
  806:     return REG_ESPACE;
  807:   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
  808:     for (j = 0; j < UINT_BITS; ++j, ++ch)
  809:       if (isalnum (ch) || ch == '_')
  810: 	dfa->word_char[i] |= 1 << j;
  811:   return REG_NOERROR;
  812: }
  813: 
  814: /* Free the work area which are only used while compiling.  */
  815: 
  816: static void
  817: free_workarea_compile (preg)
  818:      regex_t *preg;
  819: {
  820:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
  821:   free_bin_tree (dfa->str_tree);
  822:   dfa->str_tree = NULL;
  823:   re_free (dfa->org_indices);
  824:   dfa->org_indices = NULL;
  825: }
  826: 
  827: /* Create initial states for all contexts.  */
  828: 
  829: static reg_errcode_t
  830: create_initial_state (dfa)
  831:      re_dfa_t *dfa;
  832: {
  833:   int first, i;
  834:   reg_errcode_t err;
  835:   re_node_set init_nodes;
  836: 
  837:   /* Initial states have the epsilon closure of the node which is
  838:      the first node of the regular expression.  */
  839:   first = dfa->str_tree->first;
  840:   dfa->init_node = first;
  841:   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
  842:   if (BE (err != REG_NOERROR, 0))
  843:     return err;
  844: 
  845:   /* The back-references which are in initial states can epsilon transit,
  846:      since in this case all of the subexpressions can be null.
  847:      Then we add epsilon closures of the nodes which are the next nodes of
  848:      the back-references.  */
  849:   if (dfa->nbackref > 0)
  850:     for (i = 0; i < init_nodes.nelem; ++i)
  851:       {
  852: 	int node_idx = init_nodes.elems[i];
  853: 	re_token_type_t type = dfa->nodes[node_idx].type;
  854: 
  855: 	int clexp_idx;
  856: 	if (type != OP_BACK_REF)
  857: 	  continue;
  858: 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
  859: 	  {
  860: 	    re_token_t *clexp_node;
  861: 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
  862: 	    if (clexp_node->type == OP_CLOSE_SUBEXP
  863: 		&& clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
  864: 	      break;
  865: 	  }
  866: 	if (clexp_idx == init_nodes.nelem)
  867: 	  continue;
  868: 
  869: 	if (type == OP_BACK_REF)
  870: 	  {
  871: 	    int dest_idx = dfa->edests[node_idx].elems[0];
  872: 	    if (!re_node_set_contains (&init_nodes, dest_idx))
  873: 	      {
  874: 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
  875: 		i = 0;
  876: 	      }
  877: 	  }
  878:       }
  879: 
  880:   /* It must be the first time to invoke acquire_state.  */
  881:   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
  882:   /* We don't check ERR here, since the initial state must not be NULL.  */
  883:   if (BE (dfa->init_state == NULL, 0))
  884:     return err;
  885:   if (dfa->init_state->has_constraint)
  886:     {
  887:       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
  888: 						       CONTEXT_WORD);
  889:       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
  890: 						     CONTEXT_NEWLINE);
  891:       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
  892: 							 &init_nodes,
  893: 							 CONTEXT_NEWLINE
  894: 							 | CONTEXT_BEGBUF);
  895:       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
  896: 	      || dfa->init_state_begbuf == NULL, 0))
  897: 	return err;
  898:     }
  899:   else
  900:     dfa->init_state_word = dfa->init_state_nl
  901:       = dfa->init_state_begbuf = dfa->init_state;
  902: 
  903:   re_node_set_free (&init_nodes);
  904:   return REG_NOERROR;
  905: }
  906: 
  907: /* Analyze the structure tree, and calculate "first", "next", "edest",
  908:    "eclosure", and "inveclosure".  */
  909: 
  910: static reg_errcode_t
  911: analyze (dfa)
  912:      re_dfa_t *dfa;
  913: {
  914:   int i;
  915:   reg_errcode_t ret;
  916: 
  917:   /* Allocate arrays.  */
  918:   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
  919:   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
  920:   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
  921:   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
  922:   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
  923:   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
  924: 	  || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
  925:     return REG_ESPACE;
  926:   /* Initialize them.  */
  927:   for (i = 0; i < dfa->nodes_len; ++i)
  928:     {
  929:       dfa->nexts[i] = -1;
  930:       re_node_set_init_empty (dfa->edests + i);
  931:       re_node_set_init_empty (dfa->eclosures + i);
  932:       re_node_set_init_empty (dfa->inveclosures + i);
  933:     }
  934: 
  935:   ret = analyze_tree (dfa, dfa->str_tree);
  936:   if (BE (ret == REG_NOERROR, 1))
  937:     {
  938:       ret = calc_eclosure (dfa);
  939:       if (ret == REG_NOERROR)
  940: 	calc_inveclosure (dfa);
  941:     }
  942:   return ret;
  943: }
  944: 
  945: /* Helper functions for analyze.
  946:    This function calculate "first", "next", and "edest" for the subtree
  947:    whose root is NODE.  */
  948: 
  949: static reg_errcode_t
  950: analyze_tree (dfa, node)
  951:      re_dfa_t *dfa;
  952:      bin_tree_t *node;
  953: {
  954:   reg_errcode_t ret;
  955:   if (node->first == -1)
  956:     calc_first (dfa, node);
  957:   if (node->next == -1)
  958:     calc_next (dfa, node);
  959:   if (node->eclosure.nelem == 0)
  960:     calc_epsdest (dfa, node);
  961:   /* Calculate "first" etc. for the left child.  */
  962:   if (node->left != NULL)
  963:     {
  964:       ret = analyze_tree (dfa, node->left);
  965:       if (BE (ret != REG_NOERROR, 0))
  966: 	return ret;
  967:     }
  968:   /* Calculate "first" etc. for the right child.  */
  969:   if (node->right != NULL)
  970:     {
  971:       ret = analyze_tree (dfa, node->right);
  972:       if (BE (ret != REG_NOERROR, 0))
  973: 	return ret;
  974:     }
  975:   return REG_NOERROR;
  976: }
  977: 
  978: /* Calculate "first" for the node NODE.  */
  979: static void
  980: calc_first (dfa, node)
  981:      re_dfa_t *dfa;
  982:      bin_tree_t *node;
  983: {
  984:   int idx, type;
  985:   idx = node->node_idx;
  986:   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
  987: 
  988:   switch (type)
  989:     {
  990: #ifdef DEBUG
  991:     case OP_OPEN_BRACKET:
  992:     case OP_CLOSE_BRACKET:
  993:     case OP_OPEN_DUP_NUM:
  994:     case OP_CLOSE_DUP_NUM:
  995:     case OP_NON_MATCH_LIST:
  996:     case OP_OPEN_COLL_ELEM:
  997:     case OP_CLOSE_COLL_ELEM:
  998:     case OP_OPEN_EQUIV_CLASS:
  999:     case OP_CLOSE_EQUIV_CLASS:
 1000:     case OP_OPEN_CHAR_CLASS:
 1001:     case OP_CLOSE_CHAR_CLASS:
 1002:       /* These must not be appeared here.  */
 1003:       assert (0);
 1004: #endif
 1005:     case END_OF_RE:
 1006:     case CHARACTER:
 1007:     case OP_PERIOD:
 1008:     case OP_DUP_ASTERISK:
 1009:     case OP_DUP_QUESTION:
 1010: #ifdef RE_ENABLE_I18N
 1011:     case COMPLEX_BRACKET:
 1012: #endif /* RE_ENABLE_I18N */
 1013:     case SIMPLE_BRACKET:
 1014:     case OP_BACK_REF:
 1015:     case ANCHOR:
 1016:     case OP_OPEN_SUBEXP:
 1017:     case OP_CLOSE_SUBEXP:
 1018:       node->first = idx;
 1019:       break;
 1020:     case OP_DUP_PLUS:
 1021: #ifdef DEBUG
 1022:       assert (node->left != NULL);
 1023: #endif
 1024:       if (node->left->first == -1)
 1025: 	calc_first (dfa, node->left);
 1026:       node->first = node->left->first;
 1027:       break;
 1028:     case OP_ALT:
 1029:       node->first = idx;
 1030:       break;
 1031:       /* else fall through */
 1032:     default:
 1033: #ifdef DEBUG
 1034:       assert (node->left != NULL);
 1035: #endif
 1036:       if (node->left->first == -1)
 1037: 	calc_first (dfa, node->left);
 1038:       node->first = node->left->first;
 1039:       break;
 1040:     }
 1041: }
 1042: 
 1043: /* Calculate "next" for the node NODE.  */
 1044: 
 1045: static void
 1046: calc_next (dfa, node)
 1047:      re_dfa_t *dfa;
 1048:      bin_tree_t *node;
 1049: {
 1050:   int idx, type;
 1051:   bin_tree_t *parent = node->parent;
 1052:   if (parent == NULL)
 1053:     {
 1054:       node->next = -1;
 1055:       idx = node->node_idx;
 1056:       if (node->type == 0)
 1057: 	dfa->nexts[idx] = node->next;
 1058:       return;
 1059:     }
 1060: 
 1061:   idx = parent->node_idx;
 1062:   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
 1063: 
 1064:   switch (type)
 1065:     {
 1066:     case OP_DUP_ASTERISK:
 1067:     case OP_DUP_PLUS:
 1068:       node->next = idx;
 1069:       break;
 1070:     case CONCAT:
 1071:       if (parent->left == node)
 1072: 	{
 1073: 	  if (parent->right->first == -1)
 1074: 	    calc_first (dfa, parent->right);
 1075: 	  node->next = parent->right->first;
 1076: 	  break;
 1077: 	}
 1078:       /* else fall through */
 1079:     default:
 1080:       if (parent->next == -1)
 1081: 	calc_next (dfa, parent);
 1082:       node->next = parent->next;
 1083:       break;
 1084:     }
 1085:   idx = node->node_idx;
 1086:   if (node->type == 0)
 1087:     dfa->nexts[idx] = node->next;
 1088: }
 1089: 
 1090: /* Calculate "edest" for the node NODE.  */
 1091: 
 1092: static void
 1093: calc_epsdest (dfa, node)
 1094:      re_dfa_t *dfa;
 1095:      bin_tree_t *node;
 1096: {
 1097:   int idx;
 1098:   idx = node->node_idx;
 1099:   if (node->type == 0)
 1100:     {
 1101:       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
 1102: 	  || dfa->nodes[idx].type == OP_DUP_PLUS
 1103: 	  || dfa->nodes[idx].type == OP_DUP_QUESTION)
 1104: 	{
 1105: 	  if (node->left->first == -1)
 1106: 	    calc_first (dfa, node->left);
 1107: 	  if (node->next == -1)
 1108: 	    calc_next (dfa, node);
 1109: 	  re_node_set_init_2 (dfa->edests + idx, node->left->first,
 1110: 			      node->next);
 1111: 	}
 1112:       else if (dfa->nodes[idx].type == OP_ALT)
 1113: 	{
 1114: 	  int left, right;
 1115: 	  if (node->left != NULL)
 1116: 	    {
 1117: 	      if (node->left->first == -1)
 1118: 		calc_first (dfa, node->left);
 1119: 	      left = node->left->first;
 1120: 	    }
 1121: 	  else
 1122: 	    {
 1123: 	      if (node->next == -1)
 1124: 		calc_next (dfa, node);
 1125: 	      left = node->next;
 1126: 	    }
 1127: 	  if (node->right != NULL)
 1128: 	    {
 1129: 	      if (node->right->first == -1)
 1130: 		calc_first (dfa, node->right);
 1131: 	      right = node->right->first;
 1132: 	    }
 1133: 	  else
 1134: 	    {
 1135: 	      if (node->next == -1)
 1136: 		calc_next (dfa, node);
 1137: 	      right = node->next;
 1138: 	    }
 1139: 	  re_node_set_init_2 (dfa->edests + idx, left, right);
 1140: 	}
 1141:       else if (dfa->nodes[idx].type == ANCHOR
 1142: 	       || dfa->nodes[idx].type == OP_OPEN_SUBEXP
 1143: 	       || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
 1144: 	       || dfa->nodes[idx].type == OP_BACK_REF)
 1145: 	re_node_set_init_1 (dfa->edests + idx, node->next);
 1146:     }
 1147: }
 1148: 
 1149: /* Duplicate the epsilon closure of the node ROOT_NODE.
 1150:    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
 1151:    to their own constraint.  */
 1152: 
 1153: static reg_errcode_t
 1154: duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
 1155: 			init_constraint)
 1156:      re_dfa_t *dfa;
 1157:      int top_org_node, top_clone_node, root_node;
 1158:      unsigned int init_constraint;
 1159: {
 1160:   reg_errcode_t err;
 1161:   int org_node, clone_node, ret;
 1162:   unsigned int constraint = init_constraint;
 1163:   for (org_node = top_org_node, clone_node = top_clone_node;;)
 1164:     {
 1165:       int org_dest, clone_dest;
 1166:       if (dfa->nodes[org_node].type == OP_BACK_REF)
 1167: 	{
 1168: 	  /* If the back reference epsilon-transit, its destination must
 1169: 	     also have the constraint.  Then duplicate the epsilon closure
 1170: 	     of the destination of the back reference, and store it in
 1171: 	     edests of the back reference.  */
 1172: 	  org_dest = dfa->nexts[org_node];
 1173: 	  re_node_set_empty (dfa->edests + clone_node);
 1174: 	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
 1175: 	  if (BE (err != REG_NOERROR, 0))
 1176: 	    return err;
 1177: 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
 1178: 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1179: 	  if (BE (ret < 0, 0))
 1180: 	    return REG_ESPACE;
 1181: 	}
 1182:       else if (dfa->edests[org_node].nelem == 0)
 1183: 	{
 1184: 	  /* In case of the node can't epsilon-transit, don't duplicate the
 1185: 	     destination and store the original destination as the
 1186: 	     destination of the node.  */
 1187: 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
 1188: 	  break;
 1189: 	}
 1190:       else if (dfa->edests[org_node].nelem == 1)
 1191: 	{
 1192: 	  /* In case of the node can epsilon-transit, and it has only one
 1193: 	     destination.  */
 1194: 	  org_dest = dfa->edests[org_node].elems[0];
 1195: 	  re_node_set_empty (dfa->edests + clone_node);
 1196: 	  if (dfa->nodes[org_node].type == ANCHOR)
 1197: 	    {
 1198: 	      /* In case of the node has another constraint, append it.  */
 1199: 	      if (org_node == root_node && clone_node != org_node)
 1200: 		{
 1201: 		  /* ...but if the node is root_node itself, it means the
 1202: 		     epsilon closure have a loop, then tie it to the
 1203: 		     destination of the root_node.  */
 1204: 		  ret = re_node_set_insert (dfa->edests + clone_node,
 1205: 					    org_dest);
 1206: 		  if (BE (ret < 0, 0))
 1207: 		    return REG_ESPACE;
 1208: 		  break;
 1209: 		}
 1210: 	      constraint |= dfa->nodes[org_node].opr.ctx_type;
 1211: 	    }
 1212: 	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
 1213: 	  if (BE (err != REG_NOERROR, 0))
 1214: 	    return err;
 1215: 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1216: 	  if (BE (ret < 0, 0))
 1217: 	    return REG_ESPACE;
 1218: 	}
 1219:       else /* dfa->edests[org_node].nelem == 2 */
 1220: 	{
 1221: 	  /* In case of the node can epsilon-transit, and it has two
 1222: 	     destinations. E.g. '|', '*', '+', '?'.   */
 1223: 	  org_dest = dfa->edests[org_node].elems[0];
 1224: 	  re_node_set_empty (dfa->edests + clone_node);
 1225: 	  /* Search for a duplicated node which satisfies the constraint.  */
 1226: 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
 1227: 	  if (clone_dest == -1)
 1228: 	    {
 1229: 	      /* There are no such a duplicated node, create a new one.  */
 1230: 	      err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
 1231: 	      if (BE (err != REG_NOERROR, 0))
 1232: 		return err;
 1233: 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1234: 	      if (BE (ret < 0, 0))
 1235: 		return REG_ESPACE;
 1236: 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
 1237: 					    root_node, constraint);
 1238: 	      if (BE (err != REG_NOERROR, 0))
 1239: 		return err;
 1240: 	    }
 1241: 	  else
 1242: 	    {
 1243: 	      /* There are a duplicated node which satisfy the constraint,
 1244: 		 use it to avoid infinite loop.  */
 1245: 	      ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1246: 	      if (BE (ret < 0, 0))
 1247: 		return REG_ESPACE;
 1248: 	    }
 1249: 
 1250: 	  org_dest = dfa->edests[org_node].elems[1];
 1251: 	  err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
 1252: 	  if (BE (err != REG_NOERROR, 0))
 1253: 	    return err;
 1254: 	  ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
 1255: 	  if (BE (ret < 0, 0))
 1256: 	    return REG_ESPACE;
 1257: 	}
 1258:       org_node = org_dest;
 1259:       clone_node = clone_dest;
 1260:     }
 1261:   return REG_NOERROR;
 1262: }
 1263: 
 1264: /* Search for a node which is duplicated from the node ORG_NODE, and
 1265:    satisfies the constraint CONSTRAINT.  */
 1266: 
 1267: static int
 1268: search_duplicated_node (dfa, org_node, constraint)
 1269:      re_dfa_t *dfa;
 1270:      int org_node;
 1271:      unsigned int constraint;
 1272: {
 1273:   int idx;
 1274:   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
 1275:     {
 1276:       if (org_node == dfa->org_indices[idx]
 1277: 	  && constraint == dfa->nodes[idx].constraint)
 1278: 	return idx; /* Found.  */
 1279:     }
 1280:   return -1; /* Not found.  */
 1281: }
 1282: 
 1283: /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
 1284:    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
 1285:    otherwise return the error code.  */
 1286: 
 1287: static reg_errcode_t
 1288: duplicate_node (new_idx, dfa, org_idx, constraint)
 1289:      re_dfa_t *dfa;
 1290:      int *new_idx, org_idx;
 1291:      unsigned int constraint;
 1292: {
 1293:   re_token_t dup;
 1294:   int dup_idx;
 1295: 
 1296:   dup = dfa->nodes[org_idx];
 1297:   dup_idx = re_dfa_add_node (dfa, dup, 1);
 1298:   if (BE (dup_idx == -1, 0))
 1299:     return REG_ESPACE;
 1300:   dfa->nodes[dup_idx].constraint = constraint;
 1301:   if (dfa->nodes[org_idx].type == ANCHOR)
 1302:     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
 1303:   dfa->nodes[dup_idx].duplicated = 1;
 1304:   re_node_set_init_empty (dfa->edests + dup_idx);
 1305:   re_node_set_init_empty (dfa->eclosures + dup_idx);
 1306:   re_node_set_init_empty (dfa->inveclosures + dup_idx);
 1307: 
 1308:   /* Store the index of the original node.  */
 1309:   dfa->org_indices[dup_idx] = org_idx;
 1310:   *new_idx = dup_idx;
 1311:   return REG_NOERROR;
 1312: }
 1313: 
 1314: static void
 1315: calc_inveclosure (dfa)
 1316:      re_dfa_t *dfa;
 1317: {
 1318:   int src, idx, dest;
 1319:   for (src = 0; src < dfa->nodes_len; ++src)
 1320:     {
 1321:       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
 1322: 	{
 1323: 	  dest = dfa->eclosures[src].elems[idx];
 1324: 	  re_node_set_insert (dfa->inveclosures + dest, src);
 1325: 	}
 1326:     }
 1327: }
 1328: 
 1329: /* Calculate "eclosure" for all the node in DFA.  */
 1330: 
 1331: static reg_errcode_t
 1332: calc_eclosure (dfa)
 1333:      re_dfa_t *dfa;
 1334: {
 1335:   int node_idx, incomplete;
 1336: #ifdef DEBUG
 1337:   assert (dfa->nodes_len > 0);
 1338: #endif
 1339:   incomplete = 0;
 1340:   /* For each nodes, calculate epsilon closure.  */
 1341:   for (node_idx = 0; ; ++node_idx)
 1342:     {
 1343:       reg_errcode_t err;
 1344:       re_node_set eclosure_elem;
 1345:       if (node_idx == dfa->nodes_len)
 1346: 	{
 1347: 	  if (!incomplete)
 1348: 	    break;
 1349: 	  incomplete = 0;
 1350: 	  node_idx = 0;
 1351: 	}
 1352: 
 1353: #ifdef DEBUG
 1354:       assert (dfa->eclosures[node_idx].nelem != -1);
 1355: #endif
 1356:       /* If we have already calculated, skip it.  */
 1357:       if (dfa->eclosures[node_idx].nelem != 0)
 1358: 	continue;
 1359:       /* Calculate epsilon closure of `node_idx'.  */
 1360:       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
 1361:       if (BE (err != REG_NOERROR, 0))
 1362: 	return err;
 1363: 
 1364:       if (dfa->eclosures[node_idx].nelem == 0)
 1365: 	{
 1366: 	  incomplete = 1;
 1367: 	  re_node_set_free (&eclosure_elem);
 1368: 	}
 1369:     }
 1370:   return REG_NOERROR;
 1371: }
 1372: 
 1373: /* Calculate epsilon closure of NODE.  */
 1374: 
 1375: static reg_errcode_t
 1376: calc_eclosure_iter (new_set, dfa, node, root)
 1377:      re_node_set *new_set;
 1378:      re_dfa_t *dfa;
 1379:      int node, root;
 1380: {
 1381:   reg_errcode_t err;
 1382:   unsigned int constraint;
 1383:   int i, incomplete;
 1384:   re_node_set eclosure;
 1385:   incomplete = 0;
 1386:   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
 1387:   if (BE (err != REG_NOERROR, 0))
 1388:     return err;
 1389: 
 1390:   /* This indicates that we are calculating this node now.
 1391:      We reference this value to avoid infinite loop.  */
 1392:   dfa->eclosures[node].nelem = -1;
 1393: 
 1394:   constraint = ((dfa->nodes[node].type == ANCHOR)
 1395: 		? dfa->nodes[node].opr.ctx_type : 0);
 1396:   /* If the current node has constraints, duplicate all nodes.
 1397:      Since they must inherit the constraints.  */
 1398:   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
 1399:     {
 1400:       int org_node, cur_node;
 1401:       org_node = cur_node = node;
 1402:       err = duplicate_node_closure (dfa, node, node, node, constraint);
 1403:       if (BE (err != REG_NOERROR, 0))
 1404: 	return err;
 1405:     }
 1406: 
 1407:   /* Expand each epsilon destination nodes.  */
 1408:   if (IS_EPSILON_NODE(dfa->nodes[node].type))
 1409:     for (i = 0; i < dfa->edests[node].nelem; ++i)
 1410:       {
 1411: 	re_node_set eclosure_elem;
 1412: 	int edest = dfa->edests[node].elems[i];
 1413: 	/* If calculating the epsilon closure of `edest' is in progress,
 1414: 	   return intermediate result.  */
 1415: 	if (dfa->eclosures[edest].nelem == -1)
 1416: 	  {
 1417: 	    incomplete = 1;
 1418: 	    continue;
 1419: 	  }
 1420: 	/* If we haven't calculated the epsilon closure of `edest' yet,
 1421: 	   calculate now. Otherwise use calculated epsilon closure.  */
 1422: 	if (dfa->eclosures[edest].nelem == 0)
 1423: 	  {
 1424: 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
 1425: 	    if (BE (err != REG_NOERROR, 0))
 1426: 	      return err;
 1427: 	  }
 1428: 	else
 1429: 	  eclosure_elem = dfa->eclosures[edest];
 1430: 	/* Merge the epsilon closure of `edest'.  */
 1431: 	re_node_set_merge (&eclosure, &eclosure_elem);
 1432: 	/* If the epsilon closure of `edest' is incomplete,
 1433: 	   the epsilon closure of this node is also incomplete.  */
 1434: 	if (dfa->eclosures[edest].nelem == 0)
 1435: 	  {
 1436: 	    incomplete = 1;
 1437: 	    re_node_set_free (&eclosure_elem);
 1438: 	  }
 1439:       }
 1440: 
 1441:   /* Epsilon closures include itself.  */
 1442:   re_node_set_insert (&eclosure, node);
 1443:   if (incomplete && !root)
 1444:     dfa->eclosures[node].nelem = 0;
 1445:   else
 1446:     dfa->eclosures[node] = eclosure;
 1447:   *new_set = eclosure;
 1448:   return REG_NOERROR;
 1449: }
 1450: 
 1451: /* Functions for token which are used in the parser.  */
 1452: 
 1453: /* Fetch a token from INPUT.
 1454:    We must not use this function inside bracket expressions.  */
 1455: 
 1456: static re_token_t
 1457: fetch_token (input, syntax)
 1458:      re_string_t *input;
 1459:      reg_syntax_t syntax;
 1460: {
 1461:   re_token_t token;
 1462:   int consumed_byte;
 1463:   consumed_byte = peek_token (&token, input, syntax);
 1464:   re_string_skip_bytes (input, consumed_byte);
 1465:   return token;
 1466: }
 1467: 
 1468: /* Peek a token from INPUT, and return the length of the token.
 1469:    We must not use this function inside bracket expressions.  */
 1470: 
 1471: static int
 1472: peek_token (token, input, syntax)
 1473:      re_token_t *token;
 1474:      re_string_t *input;
 1475:      reg_syntax_t syntax;
 1476: {
 1477:   unsigned char c;
 1478: 
 1479:   if (re_string_eoi (input))
 1480:     {
 1481:       token->type = END_OF_RE;
 1482:       return 0;
 1483:     }
 1484: 
 1485:   c = re_string_peek_byte (input, 0);
 1486:   token->opr.c = c;
 1487: 
 1488: #ifdef RE_ENABLE_I18N
 1489:   token->mb_partial = 0;
 1490:   if (MB_CUR_MAX > 1 &&
 1491:       !re_string_first_byte (input, re_string_cur_idx (input)))
 1492:     {
 1493:       token->type = CHARACTER;
 1494:       token->mb_partial = 1;
 1495:       return 1;
 1496:     }
 1497: #endif
 1498:   if (c == '\\')
 1499:     {
 1500:       unsigned char c2;
 1501:       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
 1502: 	{
 1503: 	  token->type = BACK_SLASH;
 1504: 	  return 1;
 1505: 	}
 1506: 
 1507:       c2 = re_string_peek_byte_case (input, 1);
 1508:       token->opr.c = c2;
 1509:       token->type = CHARACTER;
 1510:       switch (c2)
 1511: 	{
 1512: 	case '|':
 1513: 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
 1514: 	    token->type = OP_ALT;
 1515: 	  break;
 1516: 	case '1': case '2': case '3': case '4': case '5':
 1517: 	case '6': case '7': case '8': case '9':
 1518: 	  if (!(syntax & RE_NO_BK_REFS))
 1519: 	    {
 1520: 	      token->type = OP_BACK_REF;
 1521: 	      token->opr.idx = c2 - '0';
 1522: 	    }
 1523: 	  break;
 1524: 	case '<':
 1525: 	  if (!(syntax & RE_NO_GNU_OPS))
 1526: 	    {
 1527: 	      token->type = ANCHOR;
 1528: 	      token->opr.idx = WORD_FIRST;
 1529: 	    }
 1530: 	  break;
 1531: 	case '>':
 1532: 	  if (!(syntax & RE_NO_GNU_OPS))
 1533: 	    {
 1534: 	      token->type = ANCHOR;
 1535: 	      token->opr.idx = WORD_LAST;
 1536: 	    }
 1537: 	  break;
 1538: 	case 'b':
 1539: 	  if (!(syntax & RE_NO_GNU_OPS))
 1540: 	    {
 1541: 	      token->type = ANCHOR;
 1542: 	      token->opr.idx = WORD_DELIM;
 1543: 	    }
 1544: 	  break;
 1545: 	case 'B':
 1546: 	  if (!(syntax & RE_NO_GNU_OPS))
 1547: 	    {
 1548: 	      token->type = ANCHOR;
 1549: 	      token->opr.idx = INSIDE_WORD;
 1550: 	    }
 1551: 	  break;
 1552: 	case 'w':
 1553: 	  if (!(syntax & RE_NO_GNU_OPS))
 1554: 	    token->type = OP_WORD;
 1555: 	  break;
 1556: 	case 'W':
 1557: 	  if (!(syntax & RE_NO_GNU_OPS))
 1558: 	    token->type = OP_NOTWORD;
 1559: 	  break;
 1560: 	case '`':
 1561: 	  if (!(syntax & RE_NO_GNU_OPS))
 1562: 	    {
 1563: 	      token->type = ANCHOR;
 1564: 	      token->opr.idx = BUF_FIRST;
 1565: 	    }
 1566: 	  break;
 1567: 	case '\'':
 1568: 	  if (!(syntax & RE_NO_GNU_OPS))
 1569: 	    {
 1570: 	      token->type = ANCHOR;
 1571: 	      token->opr.idx = BUF_LAST;
 1572: 	    }
 1573: 	  break;
 1574: 	case '(':
 1575: 	  if (!(syntax & RE_NO_BK_PARENS))
 1576: 	    token->type = OP_OPEN_SUBEXP;
 1577: 	  break;
 1578: 	case ')':
 1579: 	  if (!(syntax & RE_NO_BK_PARENS))
 1580: 	    token->type = OP_CLOSE_SUBEXP;
 1581: 	  break;
 1582: 	case '+':
 1583: 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
 1584: 	    token->type = OP_DUP_PLUS;
 1585: 	  break;
 1586: 	case '?':
 1587: 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
 1588: 	    token->type = OP_DUP_QUESTION;
 1589: 	  break;
 1590: 	case '{':
 1591: 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
 1592: 	    token->type = OP_OPEN_DUP_NUM;
 1593: 	  break;
 1594: 	case '}':
 1595: 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
 1596: 	    token->type = OP_CLOSE_DUP_NUM;
 1597: 	  break;
 1598: 	default:
 1599: 	  break;
 1600: 	}
 1601:       return 2;
 1602:     }
 1603: 
 1604:   token->type = CHARACTER;
 1605:   switch (c)
 1606:     {
 1607:     case '\n':
 1608:       if (syntax & RE_NEWLINE_ALT)
 1609: 	token->type = OP_ALT;
 1610:       break;
 1611:     case '|':
 1612:       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
 1613: 	token->type = OP_ALT;
 1614:       break;
 1615:     case '*':
 1616:       token->type = OP_DUP_ASTERISK;
 1617:       break;
 1618:     case '+':
 1619:       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
 1620: 	token->type = OP_DUP_PLUS;
 1621:       break;
 1622:     case '?':
 1623:       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
 1624: 	token->type = OP_DUP_QUESTION;
 1625:       break;
 1626:     case '{':
 1627:       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
 1628: 	token->type = OP_OPEN_DUP_NUM;
 1629:       break;
 1630:     case '}':
 1631:       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
 1632: 	token->type = OP_CLOSE_DUP_NUM;
 1633:       break;
 1634:     case '(':
 1635:       if (syntax & RE_NO_BK_PARENS)
 1636: 	token->type = OP_OPEN_SUBEXP;
 1637:       break;
 1638:     case ')':
 1639:       if (syntax & RE_NO_BK_PARENS)
 1640: 	token->type = OP_CLOSE_SUBEXP;
 1641:       break;
 1642:     case '[':
 1643:       token->type = OP_OPEN_BRACKET;
 1644:       break;
 1645:     case '.':
 1646:       token->type = OP_PERIOD;
 1647:       break;
 1648:     case '^':
 1649:       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
 1650: 	  re_string_cur_idx (input) != 0)
 1651: 	{
 1652: 	  char prev = re_string_peek_byte (input, -1);
 1653: 	  if (prev != '|' && prev != '(' &&
 1654: 	      (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
 1655: 	    break;
 1656: 	}
 1657:       token->type = ANCHOR;
 1658:       token->opr.idx = LINE_FIRST;
 1659:       break;
 1660:     case '$':
 1661:       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
 1662: 	  re_string_cur_idx (input) + 1 != re_string_length (input))
 1663: 	{
 1664: 	  re_token_t next;
 1665: 	  re_string_skip_bytes (input, 1);
 1666: 	  peek_token (&next, input, syntax);
 1667: 	  re_string_skip_bytes (input, -1);
 1668: 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
 1669: 	    break;
 1670: 	}
 1671:       token->type = ANCHOR;
 1672:       token->opr.idx = LINE_LAST;
 1673:       break;
 1674:     default:
 1675:       break;
 1676:     }
 1677:   return 1;
 1678: }
 1679: 
 1680: /* Peek a token from INPUT, and return the length of the token.
 1681:    We must not use this function out of bracket expressions.  */
 1682: 
 1683: static int
 1684: peek_token_bracket (token, input, syntax)
 1685:      re_token_t *token;
 1686:      re_string_t *input;
 1687:      reg_syntax_t syntax;
 1688: {
 1689:   unsigned char c;
 1690:   if (re_string_eoi (input))
 1691:     {
 1692:       token->type = END_OF_RE;
 1693:       return 0;
 1694:     }
 1695:   c = re_string_peek_byte (input, 0);
 1696:   token->opr.c = c;
 1697: 
 1698: #ifdef RE_ENABLE_I18N
 1699:   if (MB_CUR_MAX > 1 &&
 1700:       !re_string_first_byte (input, re_string_cur_idx (input)))
 1701:     {
 1702:       token->type = CHARACTER;
 1703:       return 1;
 1704:     }
 1705: #endif /* RE_ENABLE_I18N */
 1706: 
 1707:   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
 1708:     {
 1709:       /* In this case, '\' escape a character.  */
 1710:       unsigned char c2;
 1711:       re_string_skip_bytes (input, 1);
 1712:       c2 = re_string_peek_byte (input, 0);
 1713:       token->opr.c = c2;
 1714:       token->type = CHARACTER;
 1715:       return 1;
 1716:     }
 1717:   if (c == '[') /* '[' is a special char in a bracket exps.  */
 1718:     {
 1719:       unsigned char c2;
 1720:       int token_len;
 1721:       c2 = re_string_peek_byte (input, 1);
 1722:       token->opr.c = c2;
 1723:       token_len = 2;
 1724:       switch (c2)
 1725: 	{
 1726: 	case '.':
 1727: 	  token->type = OP_OPEN_COLL_ELEM;
 1728: 	  break;
 1729: 	case '=':
 1730: 	  token->type = OP_OPEN_EQUIV_CLASS;
 1731: 	  break;
 1732: 	case ':':
 1733: 	  if (syntax & RE_CHAR_CLASSES)
 1734: 	    {
 1735: 	      token->type = OP_OPEN_CHAR_CLASS;
 1736: 	      break;
 1737: 	    }
 1738: 	  /* else fall through.  */
 1739: 	default:
 1740: 	  token->type = CHARACTER;
 1741: 	  token->opr.c = c;
 1742: 	  token_len = 1;
 1743: 	  break;
 1744: 	}
 1745:       return token_len;
 1746:     }
 1747:   switch (c)
 1748:     {
 1749:     case '-':
 1750:       token->type = OP_CHARSET_RANGE;
 1751:       break;
 1752:     case ']':
 1753:       token->type = OP_CLOSE_BRACKET;
 1754:       break;
 1755:     case '^':
 1756:       token->type = OP_NON_MATCH_LIST;
 1757:       break;
 1758:     default:
 1759:       token->type = CHARACTER;
 1760:     }
 1761:   return 1;
 1762: }
 1763: 
 1764: /* Functions for parser.  */
 1765: 
 1766: /* Entry point of the parser.
 1767:    Parse the regular expression REGEXP and return the structure tree.
 1768:    If an error is occured, ERR is set by error code, and return NULL.
 1769:    This function build the following tree, from regular expression <reg_exp>:
 1770: 	   CAT
 1771: 	   / \
 1772: 	  /   \
 1773:    <reg_exp>  EOR
 1774: 
 1775:    CAT means concatenation.
 1776:    EOR means end of regular expression.  */
 1777: 
 1778: static bin_tree_t *
 1779: parse (regexp, preg, syntax, err)
 1780:      re_string_t *regexp;
 1781:      regex_t *preg;
 1782:      reg_syntax_t syntax;
 1783:      reg_errcode_t *err;
 1784: {
 1785:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 1786:   bin_tree_t *tree, *eor, *root;
 1787:   re_token_t current_token;
 1788:   int new_idx;
 1789:   current_token = fetch_token (regexp, syntax);
 1790:   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
 1791:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
 1792:     return NULL;
 1793:   new_idx = re_dfa_add_node (dfa, current_token, 0);
 1794:   eor = create_tree (NULL, NULL, 0, new_idx);
 1795:   if (tree != NULL)
 1796:     root = create_tree (tree, eor, CONCAT, 0);
 1797:   else
 1798:     root = eor;
 1799:   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
 1800:     {
 1801:       *err = REG_ESPACE;
 1802:       return NULL;
 1803:     }
 1804:   return root;
 1805: }
 1806: 
 1807: /* This function build the following tree, from regular expression
 1808:    <branch1>|<branch2>:
 1809: 	   ALT
 1810: 	   / \
 1811: 	  /   \
 1812:    <branch1> <branch2>
 1813: 
 1814:    ALT means alternative, which represents the operator `|'.  */
 1815: 
 1816: static bin_tree_t *
 1817: parse_reg_exp (regexp, preg, token, syntax, nest, err)
 1818:      re_string_t *regexp;
 1819:      regex_t *preg;
 1820:      re_token_t *token;
 1821:      reg_syntax_t syntax;
 1822:      int nest;
 1823:      reg_errcode_t *err;
 1824: {
 1825:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 1826:   bin_tree_t *tree, *branch = NULL;
 1827:   int new_idx;
 1828:   tree = parse_branch (regexp, preg, token, syntax, nest, err);
 1829:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
 1830:     return NULL;
 1831: 
 1832:   while (token->type == OP_ALT)
 1833:     {
 1834:       re_token_t alt_token = *token;
 1835:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
 1836:       *token = fetch_token (regexp, syntax);
 1837:       if (token->type != OP_ALT && token->type != END_OF_RE
 1838: 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
 1839: 	{
 1840: 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
 1841: 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
 1842: 	    {
 1843: 	      free_bin_tree (tree);
 1844: 	      return NULL;
 1845: 	    }
 1846: 	}
 1847:       else
 1848: 	branch = NULL;
 1849:       tree = create_tree (tree, branch, 0, new_idx);
 1850:       if (BE (new_idx == -1 || tree == NULL, 0))
 1851: 	{
 1852: 	  *err = REG_ESPACE;
 1853: 	  return NULL;
 1854: 	}
 1855:       dfa->has_plural_match = 1;
 1856:     }
 1857:   return tree;
 1858: }
 1859: 
 1860: /* This function build the following tree, from regular expression
 1861:    <exp1><exp2>:
 1862: 	CAT
 1863: 	/ \
 1864:        /   \
 1865:    <exp1> <exp2>
 1866: 
 1867:    CAT means concatenation.  */
 1868: 
 1869: static bin_tree_t *
 1870: parse_branch (regexp, preg, token, syntax, nest, err)
 1871:      re_string_t *regexp;
 1872:      regex_t *preg;
 1873:      re_token_t *token;
 1874:      reg_syntax_t syntax;
 1875:      int nest;
 1876:      reg_errcode_t *err;
 1877: {
 1878:   bin_tree_t *tree, *exp;
 1879:   tree = parse_expression (regexp, preg, token, syntax, nest, err);
 1880:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
 1881:     return NULL;
 1882: 
 1883:   while (token->type != OP_ALT && token->type != END_OF_RE
 1884: 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
 1885:     {
 1886:       exp = parse_expression (regexp, preg, token, syntax, nest, err);
 1887:       if (BE (*err != REG_NOERROR && exp == NULL, 0))
 1888: 	{
 1889: 	  free_bin_tree (tree);
 1890: 	  return NULL;
 1891: 	}
 1892:       if (tree != NULL && exp != NULL)
 1893: 	{
 1894: 	  tree = create_tree (tree, exp, CONCAT, 0);
 1895: 	  if (tree == NULL)
 1896: 	    {
 1897: 	      *err = REG_ESPACE;
 1898: 	      return NULL;
 1899: 	    }
 1900: 	}
 1901:       else if (tree == NULL)
 1902: 	tree = exp;
 1903:       /* Otherwise exp == NULL, we don't need to create new tree.  */
 1904:     }
 1905:   return tree;
 1906: }
 1907: 
 1908: /* This function build the following tree, from regular expression a*:
 1909: 	 *
 1910: 	 |
 1911: 	 a
 1912: */
 1913: 
 1914: static bin_tree_t *
 1915: parse_expression (regexp, preg, token, syntax, nest, err)
 1916:      re_string_t *regexp;
 1917:      regex_t *preg;
 1918:      re_token_t *token;
 1919:      reg_syntax_t syntax;
 1920:      int nest;
 1921:      reg_errcode_t *err;
 1922: {
 1923:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 1924:   bin_tree_t *tree;
 1925:   int new_idx;
 1926:   switch (token->type)
 1927:     {
 1928:     case CHARACTER:
 1929:       new_idx = re_dfa_add_node (dfa, *token, 0);
 1930:       tree = create_tree (NULL, NULL, 0, new_idx);
 1931:       if (BE (new_idx == -1 || tree == NULL, 0))
 1932: 	{
 1933: 	  *err = REG_ESPACE;
 1934: 	  return NULL;
 1935: 	}
 1936: #ifdef RE_ENABLE_I18N
 1937:       if (MB_CUR_MAX > 1)
 1938: 	{
 1939: 	  while (!re_string_eoi (regexp)
 1940: 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
 1941: 	    {
 1942: 	      bin_tree_t *mbc_remain;
 1943: 	      *token = fetch_token (regexp, syntax);
 1944: 	      new_idx = re_dfa_add_node (dfa, *token, 0);
 1945: 	      mbc_remain = create_tree (NULL, NULL, 0, new_idx);
 1946: 	      tree = create_tree (tree, mbc_remain, CONCAT, 0);
 1947: 	      if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
 1948: 		{
 1949: 		  *err = REG_ESPACE;
 1950: 		  return NULL;
 1951: 		}
 1952: 	    }
 1953: 	}
 1954: #endif
 1955:       break;
 1956:     case OP_OPEN_SUBEXP:
 1957:       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
 1958:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 1959: 	return NULL;
 1960:       break;
 1961:     case OP_OPEN_BRACKET:
 1962:       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
 1963:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 1964: 	return NULL;
 1965:       break;
 1966:     case OP_BACK_REF:
 1967:       if (BE (preg->re_nsub < token->opr.idx
 1968: 	      || dfa->subexps[token->opr.idx - 1].end == -1, 0))
 1969: 	{
 1970: 	  *err = REG_ESUBREG;
 1971: 	  return NULL;
 1972: 	}
 1973:       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
 1974:       new_idx = re_dfa_add_node (dfa, *token, 0);
 1975:       tree = create_tree (NULL, NULL, 0, new_idx);
 1976:       if (BE (new_idx == -1 || tree == NULL, 0))
 1977: 	{
 1978: 	  *err = REG_ESPACE;
 1979: 	  return NULL;
 1980: 	}
 1981:       ++dfa->nbackref;
 1982:       dfa->has_mb_node = 1;
 1983:       break;
 1984:     case OP_DUP_ASTERISK:
 1985:     case OP_DUP_PLUS:
 1986:     case OP_DUP_QUESTION:
 1987:     case OP_OPEN_DUP_NUM:
 1988:       if (syntax & RE_CONTEXT_INVALID_OPS)
 1989: 	{
 1990: 	  *err = REG_BADRPT;
 1991: 	  return NULL;
 1992: 	}
 1993:       else if (syntax & RE_CONTEXT_INDEP_OPS)
 1994: 	{
 1995: 	  *token = fetch_token (regexp, syntax);
 1996: 	  return parse_expression (regexp, preg, token, syntax, nest, err);
 1997: 	}
 1998:       /* else fall through  */
 1999:     case OP_CLOSE_SUBEXP:
 2000:       if ((token->type == OP_CLOSE_SUBEXP) &&
 2001: 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
 2002: 	{
 2003: 	  *err = REG_ERPAREN;
 2004: 	  return NULL;
 2005: 	}
 2006:       /* else fall through  */
 2007:     case OP_CLOSE_DUP_NUM:
 2008:       /* We treat it as a normal character.  */
 2009: 
 2010:       /* Then we can these characters as normal characters.  */
 2011:       token->type = CHARACTER;
 2012:       new_idx = re_dfa_add_node (dfa, *token, 0);
 2013:       tree = create_tree (NULL, NULL, 0, new_idx);
 2014:       if (BE (new_idx == -1 || tree == NULL, 0))
 2015: 	{
 2016: 	  *err = REG_ESPACE;
 2017: 	  return NULL;
 2018: 	}
 2019:       break;
 2020:     case ANCHOR:
 2021:       if (dfa->word_char == NULL)
 2022: 	{
 2023: 	  *err = init_word_char (dfa);
 2024: 	  if (BE (*err != REG_NOERROR, 0))
 2025: 	    return NULL;
 2026: 	}
 2027:       if (token->opr.ctx_type == WORD_DELIM)
 2028: 	{
 2029: 	  bin_tree_t *tree_first, *tree_last;
 2030: 	  int idx_first, idx_last;
 2031: 	  token->opr.ctx_type = WORD_FIRST;
 2032: 	  idx_first = re_dfa_add_node (dfa, *token, 0);
 2033: 	  tree_first = create_tree (NULL, NULL, 0, idx_first);
 2034: 	  token->opr.ctx_type = WORD_LAST;
 2035: 	  idx_last = re_dfa_add_node (dfa, *token, 0);
 2036: 	  tree_last = create_tree (NULL, NULL, 0, idx_last);
 2037: 	  token->type = OP_ALT;
 2038: 	  new_idx = re_dfa_add_node (dfa, *token, 0);
 2039: 	  tree = create_tree (tree_first, tree_last, 0, new_idx);
 2040: 	  if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
 2041: 		  || tree_first == NULL || tree_last == NULL
 2042: 		  || tree == NULL, 0))
 2043: 	    {
 2044: 	      *err = REG_ESPACE;
 2045: 	      return NULL;
 2046: 	    }
 2047: 	}
 2048:       else
 2049: 	{
 2050: 	  new_idx = re_dfa_add_node (dfa, *token, 0);
 2051: 	  tree = create_tree (NULL, NULL, 0, new_idx);
 2052: 	  if (BE (new_idx == -1 || tree == NULL, 0))
 2053: 	    {
 2054: 	      *err = REG_ESPACE;
 2055: 	      return NULL;
 2056: 	    }
 2057: 	}
 2058:       /* We must return here, since ANCHORs can't be followed
 2059: 	 by repetition operators.
 2060: 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
 2061: 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
 2062:       *token = fetch_token (regexp, syntax);
 2063:       return tree;
 2064:     case OP_PERIOD:
 2065:       new_idx = re_dfa_add_node (dfa, *token, 0);
 2066:       tree = create_tree (NULL, NULL, 0, new_idx);
 2067:       if (BE (new_idx == -1 || tree == NULL, 0))
 2068: 	{
 2069: 	  *err = REG_ESPACE;
 2070: 	  return NULL;
 2071: 	}
 2072:       if (MB_CUR_MAX > 1)
 2073: 	dfa->has_mb_node = 1;
 2074:       break;
 2075:     case OP_WORD:
 2076:       tree = build_word_op (dfa, 0, err);
 2077:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 2078: 	return NULL;
 2079:       break;
 2080:     case OP_NOTWORD:
 2081:       tree = build_word_op (dfa, 1, err);
 2082:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 2083: 	return NULL;
 2084:       break;
 2085:     case OP_ALT:
 2086:     case END_OF_RE:
 2087:       return NULL;
 2088:     case BACK_SLASH:
 2089:       *err = REG_EESCAPE;
 2090:       return NULL;
 2091:     default:
 2092:       /* Must not happen?  */
 2093: #ifdef DEBUG
 2094:       assert (0);
 2095: #endif
 2096:       return NULL;
 2097:     }
 2098:   *token = fetch_token (regexp, syntax);
 2099: 
 2100:   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
 2101: 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
 2102:     {
 2103:       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
 2104:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 2105: 	return NULL;
 2106:       dfa->has_plural_match = 1;
 2107:     }
 2108: 
 2109:   return tree;
 2110: }
 2111: 
 2112: /* This function build the following tree, from regular expression
 2113:    (<reg_exp>):
 2114: 	 SUBEXP
 2115: 	    |
 2116: 	<reg_exp>
 2117: */
 2118: 
 2119: static bin_tree_t *
 2120: parse_sub_exp (regexp, preg, token, syntax, nest, err)
 2121:      re_string_t *regexp;
 2122:      regex_t *preg;
 2123:      re_token_t *token;
 2124:      reg_syntax_t syntax;
 2125:      int nest;
 2126:      reg_errcode_t *err;
 2127: {
 2128:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 2129:   bin_tree_t *tree, *left_par, *right_par;
 2130:   size_t cur_nsub;
 2131:   int new_idx;
 2132:   cur_nsub = preg->re_nsub++;
 2133:   if (dfa->subexps_alloc < preg->re_nsub)
 2134:     {
 2135:       re_subexp_t *new_array;
 2136:       dfa->subexps_alloc *= 2;
 2137:       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
 2138:       if (BE (new_array == NULL, 0))
 2139: 	{
 2140: 	  dfa->subexps_alloc /= 2;
 2141: 	  *err = REG_ESPACE;
 2142: 	  return NULL;
 2143: 	}
 2144:       dfa->subexps = new_array;
 2145:     }
 2146:   dfa->subexps[cur_nsub].start = dfa->nodes_len;
 2147:   dfa->subexps[cur_nsub].end = -1;
 2148: 
 2149:   new_idx = re_dfa_add_node (dfa, *token, 0);
 2150:   left_par = create_tree (NULL, NULL, 0, new_idx);
 2151:   if (BE (new_idx == -1 || left_par == NULL, 0))
 2152:     {
 2153:       *err = REG_ESPACE;
 2154:       return NULL;
 2155:     }
 2156:   dfa->nodes[new_idx].opr.idx = cur_nsub;
 2157:   *token = fetch_token (regexp, syntax);
 2158: 
 2159:   /* The subexpression may be a null string.  */
 2160:   if (token->type == OP_CLOSE_SUBEXP)
 2161:     tree = NULL;
 2162:   else
 2163:     {
 2164:       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
 2165:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
 2166: 	return NULL;
 2167:     }
 2168:   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
 2169:     {
 2170:       free_bin_tree (tree);
 2171:       *err = REG_BADPAT;
 2172:       return NULL;
 2173:     }
 2174:   new_idx = re_dfa_add_node (dfa, *token, 0);
 2175:   dfa->subexps[cur_nsub].end = dfa->nodes_len;
 2176:   right_par = create_tree (NULL, NULL, 0, new_idx);
 2177:   tree = ((tree == NULL) ? right_par
 2178: 	  : create_tree (tree, right_par, CONCAT, 0));
 2179:   tree = create_tree (left_par, tree, CONCAT, 0);
 2180:   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
 2181:     {
 2182:       *err = REG_ESPACE;
 2183:       return NULL;
 2184:     }
 2185:   dfa->nodes[new_idx].opr.idx = cur_nsub;
 2186: 
 2187:   return tree;
 2188: }
 2189: 
 2190: /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
 2191: 
 2192: static bin_tree_t *
 2193: parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
 2194:      bin_tree_t *dup_elem;
 2195:      re_string_t *regexp;
 2196:      re_dfa_t *dfa;
 2197:      re_token_t *token;
 2198:      reg_syntax_t syntax;
 2199:      reg_errcode_t *err;
 2200: {
 2201:   re_token_t dup_token;
 2202:   bin_tree_t *tree = dup_elem, *work_tree;
 2203:   int new_idx, start_idx = re_string_cur_idx (regexp);
 2204:   re_token_t start_token = *token;
 2205:   if (token->type == OP_OPEN_DUP_NUM)
 2206:     {
 2207:       int i;
 2208:       int end = 0;
 2209:       int start = fetch_number (regexp, token, syntax);
 2210:       bin_tree_t *elem;
 2211:       if (start == -1)
 2212: 	{
 2213: 	  if (token->type == CHARACTER && token->opr.c == ',')
 2214: 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
 2215: 	  else
 2216: 	    {
 2217: 	      *err = REG_BADBR; /* <re>{} is invalid.  */
 2218: 	      return NULL;
 2219: 	    }
 2220: 	}
 2221:       if (BE (start != -2, 1))
 2222: 	{
 2223: 	  /* We treat "{n}" as "{n,n}".  */
 2224: 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
 2225: 		 : ((token->type == CHARACTER && token->opr.c == ',')
 2226: 		    ? fetch_number (regexp, token, syntax) : -2));
 2227: 	}
 2228:       if (BE (start == -2 || end == -2, 0))
 2229: 	{
 2230: 	  /* Invalid sequence.  */
 2231: 	  if (token->type == OP_CLOSE_DUP_NUM)
 2232: 	    goto parse_dup_op_invalid_interval;
 2233: 	  else
 2234: 	    goto parse_dup_op_ebrace;
 2235: 	}
 2236:       if (BE (start == 0 && end == 0, 0))
 2237: 	{
 2238: 	  /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
 2239: 	  *token = fetch_token (regexp, syntax);
 2240: 	  free_bin_tree (dup_elem);
 2241: 	  return NULL;
 2242: 	}
 2243: 
 2244:       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
 2245:       elem = tree;
 2246:       for (i = 0; i < start; ++i)
 2247: 	if (i != 0)
 2248: 	  {
 2249: 	    work_tree = duplicate_tree (elem, dfa);
 2250: 	    tree = create_tree (tree, work_tree, CONCAT, 0);
 2251: 	    if (BE (work_tree == NULL || tree == NULL, 0))
 2252: 	      goto parse_dup_op_espace;
 2253: 	  }
 2254: 
 2255:       if (end == -1)
 2256: 	{
 2257: 	  /* We treat "<re>{0,}" as "<re>*".  */
 2258: 	  dup_token.type = OP_DUP_ASTERISK;
 2259: 	  if (start > 0)
 2260: 	    {
 2261: 	      elem = duplicate_tree (elem, dfa);
 2262: 	      new_idx = re_dfa_add_node (dfa, dup_token, 0);
 2263: 	      work_tree = create_tree (elem, NULL, 0, new_idx);
 2264: 	      tree = create_tree (tree, work_tree, CONCAT, 0);
 2265: 	      if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
 2266: 		      || tree == NULL, 0))
 2267: 		goto parse_dup_op_espace;
 2268: 	    }
 2269: 	  else
 2270: 	    {
 2271: 	      new_idx = re_dfa_add_node (dfa, dup_token, 0);
 2272: 	      tree = create_tree (elem, NULL, 0, new_idx);
 2273: 	      if (BE (new_idx == -1 || tree == NULL, 0))
 2274: 		goto parse_dup_op_espace;
 2275: 	    }
 2276: 	}
 2277:       else if (end - start > 0)
 2278: 	{
 2279: 	  /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
 2280: 	  dup_token.type = OP_DUP_QUESTION;
 2281: 	  if (start > 0)
 2282: 	    {
 2283: 	      elem = duplicate_tree (elem, dfa);
 2284: 	      new_idx = re_dfa_add_node (dfa, dup_token, 0);
 2285: 	      elem = create_tree (elem, NULL, 0, new_idx);
 2286: 	      tree = create_tree (tree, elem, CONCAT, 0);
 2287: 	      if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
 2288: 		goto parse_dup_op_espace;
 2289: 	    }
 2290: 	  else
 2291: 	    {
 2292: 	      new_idx = re_dfa_add_node (dfa, dup_token, 0);
 2293: 	      tree = elem = create_tree (elem, NULL, 0, new_idx);
 2294: 	      if (BE (new_idx == -1 || tree == NULL, 0))
 2295: 		goto parse_dup_op_espace;
 2296: 	    }
 2297: 	  for (i = 1; i < end - start; ++i)
 2298: 	    {
 2299: 	      work_tree = duplicate_tree (elem, dfa);
 2300: 	      tree = create_tree (tree, work_tree, CONCAT, 0);
 2301: 	      if (BE (work_tree == NULL || tree == NULL, 0))
 2302: 		{
 2303: 		  *err = REG_ESPACE;
 2304: 		  return NULL;
 2305: 		}
 2306: 	    }
 2307: 	}
 2308:     }
 2309:   else
 2310:     {
 2311:       new_idx = re_dfa_add_node (dfa, *token, 0);
 2312:       tree = create_tree (tree, NULL, 0, new_idx);
 2313:       if (BE (new_idx == -1 || tree == NULL, 0))
 2314: 	{
 2315: 	  *err = REG_ESPACE;
 2316: 	  return NULL;
 2317: 	}
 2318:     }
 2319:   *token = fetch_token (regexp, syntax);
 2320:   return tree;
 2321: 
 2322:  parse_dup_op_espace:
 2323:   free_bin_tree (tree);
 2324:   *err = REG_ESPACE;
 2325:   return NULL;
 2326: 
 2327:  parse_dup_op_ebrace:
 2328:   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
 2329:     {
 2330:       *err = REG_EBRACE;
 2331:       return NULL;
 2332:     }
 2333:   goto parse_dup_op_rollback;
 2334:  parse_dup_op_invalid_interval:
 2335:   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
 2336:     {
 2337:       *err = REG_BADBR;
 2338:       return NULL;
 2339:     }
 2340:  parse_dup_op_rollback:
 2341:   re_string_set_index (regexp, start_idx);
 2342:   *token = start_token;
 2343:   token->type = CHARACTER;
 2344:   return dup_elem;
 2345: }
 2346: 
 2347: /* Size of the names for collating symbol/equivalence_class/character_class.
 2348:    I'm not sure, but maybe enough.  */
 2349: #define BRACKET_NAME_BUF_SIZE 32
 2350: 
 2351: #ifndef _LIBC
 2352:   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
 2353:      Build the range expression which starts from START_ELEM, and ends
 2354:      at END_ELEM.  The result are written to MBCSET and SBCSET.
 2355:      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
 2356:      mbcset->range_ends, is a pointer argument sinse we may
 2357:      update it.  */
 2358: 
 2359: static reg_errcode_t
 2360: # ifdef RE_ENABLE_I18N
 2361: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
 2362:      re_charset_t *mbcset;
 2363:      int *range_alloc;
 2364: # else /* not RE_ENABLE_I18N */
 2365: build_range_exp (sbcset, start_elem, end_elem)
 2366: # endif /* not RE_ENABLE_I18N */
 2367:      re_bitset_ptr_t sbcset;
 2368:      bracket_elem_t *start_elem, *end_elem;
 2369: {
 2370:   unsigned int start_ch, end_ch;
 2371:   /* Equivalence Classes and Character Classes can't be a range start/end.  */
 2372:   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
 2373: 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
 2374: 	  0))
 2375:     return REG_ERANGE;
 2376: 
 2377:   /* We can handle no multi character collating elements without libc
 2378:      support.  */
 2379:   if (BE ((start_elem->type == COLL_SYM
 2380: 	   && strlen ((char *) start_elem->opr.name) > 1)
 2381: 	  || (end_elem->type == COLL_SYM
 2382: 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
 2383:     return REG_ECOLLATE;
 2384: 
 2385: # ifdef RE_ENABLE_I18N
 2386:   {
 2387:     wchar_t wc, start_wc, end_wc;
 2388:     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
 2389: 
 2390:     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
 2391: 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
 2392: 		   : 0));
 2393:     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
 2394: 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 2395: 		 : 0));
 2396:     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
 2397: 		? __btowc (start_ch) : start_elem->opr.wch);
 2398:     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
 2399: 	      ? __btowc (end_ch) : end_elem->opr.wch);
 2400:     cmp_buf[0] = start_wc;
 2401:     cmp_buf[4] = end_wc;
 2402:     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
 2403:       return REG_ERANGE;
 2404: 
 2405:     /* Check the space of the arrays.  */
 2406:     if (*range_alloc == mbcset->nranges)
 2407:       {
 2408: 	/* There are not enough space, need realloc.  */
 2409: 	wchar_t *new_array_start, *new_array_end;
 2410: 	int new_nranges;
 2411: 
 2412: 	/* +1 in case of mbcset->nranges is 0.  */
 2413: 	new_nranges = 2 * mbcset->nranges + 1;
 2414: 	/* Use realloc since mbcset->range_starts and mbcset->range_ends
 2415: 	   are NULL if *range_alloc == 0.  */
 2416: 	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
 2417: 				      new_nranges);
 2418: 	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
 2419: 				    new_nranges);
 2420: 
 2421: 	if (BE (new_array_start == NULL || new_array_end == NULL, 0))
 2422: 	  return REG_ESPACE;
 2423: 
 2424: 	mbcset->range_starts = new_array_start;
 2425: 	mbcset->range_ends = new_array_end;
 2426: 	*range_alloc = new_nranges;
 2427:       }
 2428: 
 2429:     mbcset->range_starts[mbcset->nranges] = start_wc;
 2430:     mbcset->range_ends[mbcset->nranges++] = end_wc;
 2431: 
 2432:     /* Build the table for single byte characters.  */
 2433:     for (wc = 0; wc <= SBC_MAX; ++wc)
 2434:       {
 2435: 	cmp_buf[2] = wc;
 2436: 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
 2437: 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
 2438: 	  bitset_set (sbcset, wc);
 2439:       }
 2440:   }
 2441: # else /* not RE_ENABLE_I18N */
 2442:   {
 2443:     unsigned int ch;
 2444:     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
 2445: 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
 2446: 		   : 0));
 2447:     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
 2448: 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
 2449: 		 : 0));
 2450:     if (start_ch > end_ch)
 2451:       return REG_ERANGE;
 2452:     /* Build the table for single byte characters.  */
 2453:     for (ch = 0; ch <= SBC_MAX; ++ch)
 2454:       if (start_ch <= ch  && ch <= end_ch)
 2455: 	bitset_set (sbcset, ch);
 2456:   }
 2457: # endif /* not RE_ENABLE_I18N */
 2458:   return REG_NOERROR;
 2459: }
 2460: #endif /* not _LIBC */
 2461: 
 2462: #ifndef _LIBC
 2463: /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
 2464:    Build the collating element which is represented by NAME.
 2465:    The result are written to MBCSET and SBCSET.
 2466:    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
 2467:    pointer argument since we may update it.  */
 2468: 
 2469: static reg_errcode_t
 2470: # ifdef RE_ENABLE_I18N
 2471: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
 2472:      re_charset_t *mbcset;
 2473:      int *coll_sym_alloc;
 2474: # else /* not RE_ENABLE_I18N */
 2475: build_collating_symbol (sbcset, name)
 2476: # endif /* not RE_ENABLE_I18N */
 2477:      re_bitset_ptr_t sbcset;
 2478:      const unsigned char *name;
 2479: {
 2480:   size_t name_len = strlen ((const char *) name);
 2481:   if (BE (name_len != 1, 0))
 2482:     return REG_ECOLLATE;
 2483:   else
 2484:     {
 2485:       bitset_set (sbcset, name[0]);
 2486:       return REG_NOERROR;
 2487:     }
 2488: }
 2489: #endif /* not _LIBC */
 2490: 
 2491: /* This function parse bracket expression like "[abc]", "[a-c]",
 2492:    "[[.a-a.]]" etc.  */
 2493: 
 2494: static bin_tree_t *
 2495: parse_bracket_exp (regexp, dfa, token, syntax, err)
 2496:      re_string_t *regexp;
 2497:      re_dfa_t *dfa;
 2498:      re_token_t *token;
 2499:      reg_syntax_t syntax;
 2500:      reg_errcode_t *err;
 2501: {
 2502: #ifdef _LIBC
 2503:   const unsigned char *collseqmb;
 2504:   const char *collseqwc;
 2505:   uint32_t nrules;
 2506:   int32_t table_size;
 2507:   const int32_t *symb_table;
 2508:   const unsigned char *extra;
 2509: 
 2510:   /* Local function for parse_bracket_exp used in _LIBC environement.
 2511:      Seek the collating symbol entry correspondings to NAME.
 2512:      Return the index of the symbol in the SYMB_TABLE.  */
 2513: 
 2514:   static inline int32_t
 2515:   seek_collating_symbol_entry (name, name_len)
 2516: 	 const unsigned char *name;
 2517: 	 size_t name_len;
 2518:     {
 2519:       int32_t hash = elem_hash ((const char *) name, name_len);
 2520:       int32_t elem = hash % table_size;
 2521:       int32_t second = hash % (table_size - 2);
 2522:       while (symb_table[2 * elem] != 0)
 2523: 	{
 2524: 	  /* First compare the hashing value.  */
 2525: 	  if (symb_table[2 * elem] == hash
 2526: 	      /* Compare the length of the name.  */
 2527: 	      && name_len == extra[symb_table[2 * elem + 1]]
 2528: 	      /* Compare the name.  */
 2529: 	      && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
 2530: 			 name_len) == 0)
 2531: 	    {
 2532: 	      /* Yep, this is the entry.  */
 2533: 	      break;
 2534: 	    }
 2535: 
 2536: 	  /* Next entry.  */
 2537: 	  elem += second;
 2538: 	}
 2539:       return elem;
 2540:     }
 2541: 
 2542:   /* Local function for parse_bracket_exp used in _LIBC environement.
 2543:      Look up the collation sequence value of BR_ELEM.
 2544:      Return the value if succeeded, UINT_MAX otherwise.  */
 2545: 
 2546:   static inline unsigned int
 2547:   lookup_collation_sequence_value (br_elem)
 2548: 	 bracket_elem_t *br_elem;
 2549:     {
 2550:       if (br_elem->type == SB_CHAR)
 2551: 	{
 2552: 	  /*
 2553: 	  if (MB_CUR_MAX == 1)
 2554: 	  */
 2555: 	  if (nrules == 0)
 2556: 	    return collseqmb[br_elem->opr.ch];
 2557: 	  else
 2558: 	    {
 2559: 	      wint_t wc = __btowc (br_elem->opr.ch);
 2560: 	      return collseq_table_lookup (collseqwc, wc);
 2561: 	    }
 2562: 	}
 2563:       else if (br_elem->type == MB_CHAR)
 2564: 	{
 2565: 	  return collseq_table_lookup (collseqwc, br_elem->opr.wch);
 2566: 	}
 2567:       else if (br_elem->type == COLL_SYM)
 2568: 	{
 2569: 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
 2570: 	  if (nrules != 0)
 2571: 	    {
 2572: 	      int32_t elem, idx;
 2573: 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
 2574: 						  sym_name_len);
 2575: 	      if (symb_table[2 * elem] != 0)
 2576: 		{
 2577: 		  /* We found the entry.  */
 2578: 		  idx = symb_table[2 * elem + 1];
 2579: 		  /* Skip the name of collating element name.  */
 2580: 		  idx += 1 + extra[idx];
 2581: 		  /* Skip the byte sequence of the collating element.  */
 2582: 		  idx += 1 + extra[idx];
 2583: 		  /* Adjust for the alignment.  */
 2584: 		  idx = (idx + 3) & ~3;
 2585: 		  /* Skip the multibyte collation sequence value.  */
 2586: 		  idx += sizeof (unsigned int);
 2587: 		  /* Skip the wide char sequence of the collating element.  */
 2588: 		  idx += sizeof (unsigned int) *
 2589: 		    (1 + *(unsigned int *) (extra + idx));
 2590: 		  /* Return the collation sequence value.  */
 2591: 		  return *(unsigned int *) (extra + idx);
 2592: 		}
 2593: 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
 2594: 		{
 2595: 		  /* No valid character.  Match it as a single byte
 2596: 		     character.  */
 2597: 		  return collseqmb[br_elem->opr.name[0]];
 2598: 		}
 2599: 	    }
 2600: 	  else if (sym_name_len == 1)
 2601: 	    return collseqmb[br_elem->opr.name[0]];
 2602: 	}
 2603:       return UINT_MAX;
 2604:     }
 2605: 
 2606:   /* Local function for parse_bracket_exp used in _LIBC environement.
 2607:      Build the range expression which starts from START_ELEM, and ends
 2608:      at END_ELEM.  The result are written to MBCSET and SBCSET.
 2609:      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
 2610:      mbcset->range_ends, is a pointer argument sinse we may
 2611:      update it.  */
 2612: 
 2613:   static inline reg_errcode_t
 2614: # ifdef RE_ENABLE_I18N
 2615:   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
 2616: 	 re_charset_t *mbcset;
 2617: 	 int *range_alloc;
 2618: # else /* not RE_ENABLE_I18N */
 2619:   build_range_exp (sbcset, start_elem, end_elem)
 2620: # endif /* not RE_ENABLE_I18N */
 2621: 	 re_bitset_ptr_t sbcset;
 2622: 	 bracket_elem_t *start_elem, *end_elem;
 2623:     {
 2624:       unsigned int ch;
 2625:       uint32_t start_collseq;
 2626:       uint32_t end_collseq;
 2627: 
 2628: # ifdef RE_ENABLE_I18N
 2629:       /* Check the space of the arrays.  */
 2630:       if (*range_alloc == mbcset->nranges)
 2631: 	{
 2632: 	  /* There are not enough space, need realloc.  */
 2633: 	  uint32_t *new_array_start;
 2634: 	  uint32_t *new_array_end;
 2635: 	  int new_nranges;
 2636: 
 2637: 	  /* +1 in case of mbcset->nranges is 0.  */
 2638: 	  new_nranges = 2 * mbcset->nranges + 1;
 2639: 	  /* Use realloc since mbcset->range_starts and mbcset->range_ends
 2640: 	     are NULL if *range_alloc == 0.  */
 2641: 	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
 2642: 					new_nranges);
 2643: 	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
 2644: 				      new_nranges);
 2645: 
 2646: 	  if (BE (new_array_start == NULL || new_array_end == NULL, 0))
 2647: 	    return REG_ESPACE;
 2648: 
 2649: 	  mbcset->range_starts = new_array_start;
 2650: 	  mbcset->range_ends = new_array_end;
 2651: 	  *range_alloc = new_nranges;
 2652: 	}
 2653: # endif /* RE_ENABLE_I18N */
 2654: 
 2655:       /* Equivalence Classes and Character Classes can't be a range
 2656: 	 start/end.  */
 2657:       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
 2658: 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
 2659: 	      0))
 2660: 	return REG_ERANGE;
 2661: 
 2662:       start_collseq = lookup_collation_sequence_value (start_elem);
 2663:       end_collseq = lookup_collation_sequence_value (end_elem);
 2664:       /* Check start/end collation sequence values.  */
 2665:       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
 2666: 	return REG_ECOLLATE;
 2667:       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
 2668: 	return REG_ERANGE;
 2669: 
 2670: # ifdef RE_ENABLE_I18N
 2671:       /* Got valid collation sequence values, add them as a new entry.  */
 2672:       mbcset->range_starts[mbcset->nranges] = start_collseq;
 2673:       mbcset->range_ends[mbcset->nranges++] = end_collseq;
 2674: # endif /* RE_ENABLE_I18N */
 2675: 
 2676:       /* Build the table for single byte characters.  */
 2677:       for (ch = 0; ch <= SBC_MAX; ch++)
 2678: 	{
 2679: 	  uint32_t ch_collseq;
 2680: 	  /*
 2681: 	  if (MB_CUR_MAX == 1)
 2682: 	  */
 2683: 	  if (nrules == 0)
 2684: 	    ch_collseq = collseqmb[ch];
 2685: 	  else
 2686: 	    ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
 2687: 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
 2688: 	    bitset_set (sbcset, ch);
 2689: 	}
 2690:       return REG_NOERROR;
 2691:     }
 2692: 
 2693:   /* Local function for parse_bracket_exp used in _LIBC environement.
 2694:      Build the collating element which is represented by NAME.
 2695:      The result are written to MBCSET and SBCSET.
 2696:      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
 2697:      pointer argument sinse we may update it.  */
 2698: 
 2699:   static inline reg_errcode_t
 2700: # ifdef RE_ENABLE_I18N
 2701:   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
 2702: 	 re_charset_t *mbcset;
 2703: 	 int *coll_sym_alloc;
 2704: # else /* not RE_ENABLE_I18N */
 2705:   build_collating_symbol (sbcset, name)
 2706: # endif /* not RE_ENABLE_I18N */
 2707: 	 re_bitset_ptr_t sbcset;
 2708: 	 const unsigned char *name;
 2709:     {
 2710:       int32_t elem, idx;
 2711:       size_t name_len = strlen ((const char *) name);
 2712:       if (nrules != 0)
 2713: 	{
 2714: 	  elem = seek_collating_symbol_entry (name, name_len);
 2715: 	  if (symb_table[2 * elem] != 0)
 2716: 	    {
 2717: 	      /* We found the entry.  */
 2718: 	      idx = symb_table[2 * elem + 1];
 2719: 	      /* Skip the name of collating element name.  */
 2720: 	      idx += 1 + extra[idx];
 2721: 	    }
 2722: 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
 2723: 	    {
 2724: 	      /* No valid character, treat it as a normal
 2725: 		 character.  */
 2726: 	      bitset_set (sbcset, name[0]);
 2727: 	      return REG_NOERROR;
 2728: 	    }
 2729: 	  else
 2730: 	    return REG_ECOLLATE;
 2731: 
 2732: # ifdef RE_ENABLE_I18N
 2733: 	  /* Got valid collation sequence, add it as a new entry.  */
 2734: 	  /* Check the space of the arrays.  */
 2735: 	  if (*coll_sym_alloc == mbcset->ncoll_syms)
 2736: 	    {
 2737: 	      /* Not enough, realloc it.  */
 2738: 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
 2739: 	      *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
 2740: 	      /* Use realloc since mbcset->coll_syms is NULL
 2741: 		 if *alloc == 0.  */
 2742: 	      mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
 2743: 					      *coll_sym_alloc);
 2744: 	      if (BE (mbcset->coll_syms == NULL, 0))
 2745: 		return REG_ESPACE;
 2746: 	    }
 2747: 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
 2748: # endif /* RE_ENABLE_I18N */
 2749: 	  return REG_NOERROR;
 2750: 	}
 2751:       else
 2752: 	{
 2753: 	  if (BE (name_len != 1, 0))
 2754: 	    return REG_ECOLLATE;
 2755: 	  else
 2756: 	    {
 2757: 	      bitset_set (sbcset, name[0]);
 2758: 	      return REG_NOERROR;
 2759: 	    }
 2760: 	}
 2761:     }
 2762: #endif
 2763: 
 2764:   re_token_t br_token;
 2765:   re_bitset_ptr_t sbcset;
 2766: #ifdef RE_ENABLE_I18N
 2767:   re_charset_t *mbcset;
 2768:   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
 2769:   int equiv_class_alloc = 0, char_class_alloc = 0;
 2770: #else /* not RE_ENABLE_I18N */
 2771:   int non_match = 0;
 2772: #endif /* not RE_ENABLE_I18N */
 2773:   bin_tree_t *work_tree;
 2774:   int token_len, new_idx;
 2775: #ifdef _LIBC
 2776:   collseqmb = (const unsigned char *)
 2777:     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
 2778:   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 2779:   if (nrules)
 2780:     {
 2781:       /*
 2782:       if (MB_CUR_MAX > 1)
 2783:       */
 2784: 	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
 2785:       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
 2786:       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 2787: 						  _NL_COLLATE_SYMB_TABLEMB);
 2788:       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 2789: 						   _NL_COLLATE_SYMB_EXTRAMB);
 2790:     }
 2791: #endif
 2792:   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
 2793: #ifdef RE_ENABLE_I18N
 2794:   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
 2795: #endif /* RE_ENABLE_I18N */
 2796: #ifdef RE_ENABLE_I18N
 2797:   if (BE (sbcset == NULL || mbcset == NULL, 0))
 2798: #else
 2799:   if (BE (sbcset == NULL, 0))
 2800: #endif /* RE_ENABLE_I18N */
 2801:     {
 2802:       *err = REG_ESPACE;
 2803:       return NULL;
 2804:     }
 2805: 
 2806:   token_len = peek_token_bracket (token, regexp, syntax);
 2807:   if (BE (token->type == END_OF_RE, 0))
 2808:     {
 2809:       *err = REG_BADPAT;
 2810:       goto parse_bracket_exp_free_return;
 2811:     }
 2812:   if (token->type == OP_NON_MATCH_LIST)
 2813:     {
 2814: #ifdef RE_ENABLE_I18N
 2815:       int i;
 2816:       mbcset->non_match = 1;
 2817: #else /* not RE_ENABLE_I18N */
 2818:       non_match = 1;
 2819: #endif /* not RE_ENABLE_I18N */
 2820:       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
 2821: 	bitset_set (sbcset, '\0');
 2822:       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 2823:       token_len = peek_token_bracket (token, regexp, syntax);
 2824:       if (BE (token->type == END_OF_RE, 0))
 2825: 	{
 2826: 	  *err = REG_BADPAT;
 2827: 	  goto parse_bracket_exp_free_return;
 2828: 	}
 2829: #ifdef RE_ENABLE_I18N
 2830:       if (MB_CUR_MAX > 1)
 2831: 	for (i = 0; i < SBC_MAX; ++i)
 2832: 	  if (__btowc (i) == WEOF)
 2833: 	    bitset_set (sbcset, i);
 2834: #endif /* RE_ENABLE_I18N */
 2835:     }
 2836: 
 2837:   /* We treat the first ']' as a normal character.  */
 2838:   if (token->type == OP_CLOSE_BRACKET)
 2839:     token->type = CHARACTER;
 2840: 
 2841:   while (1)
 2842:     {
 2843:       bracket_elem_t start_elem, end_elem;
 2844:       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
 2845:       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
 2846:       reg_errcode_t ret;
 2847:       int token_len2 = 0, is_range_exp = 0;
 2848:       re_token_t token2;
 2849: 
 2850:       start_elem.opr.name = start_name_buf;
 2851:       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
 2852: 				   syntax);
 2853:       if (BE (ret != REG_NOERROR, 0))
 2854: 	{
 2855: 	  *err = ret;
 2856: 	  goto parse_bracket_exp_free_return;
 2857: 	}
 2858: 
 2859:       token_len = peek_token_bracket (token, regexp, syntax);
 2860:       if (BE (token->type == END_OF_RE, 0))
 2861: 	{
 2862: 	  *err = REG_BADPAT;
 2863: 	  goto parse_bracket_exp_free_return;
 2864: 	}
 2865:       if (token->type == OP_CHARSET_RANGE)
 2866: 	{
 2867: 	  re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
 2868: 	  token_len2 = peek_token_bracket (&token2, regexp, syntax);
 2869: 	  if (BE (token->type == END_OF_RE, 0))
 2870: 	    {
 2871: 	      *err = REG_BADPAT;
 2872: 	      goto parse_bracket_exp_free_return;
 2873: 	    }
 2874: 	  if (token2.type == OP_CLOSE_BRACKET)
 2875: 	    {
 2876: 	      /* We treat the last '-' as a normal character.  */
 2877: 	      re_string_skip_bytes (regexp, -token_len);
 2878: 	      token->type = CHARACTER;
 2879: 	    }
 2880: 	  else
 2881: 	    is_range_exp = 1;
 2882: 	}
 2883: 
 2884:       if (is_range_exp == 1)
 2885: 	{
 2886: 	  end_elem.opr.name = end_name_buf;
 2887: 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
 2888: 				       dfa, syntax);
 2889: 	  if (BE (ret != REG_NOERROR, 0))
 2890: 	    {
 2891: 	      *err = ret;
 2892: 	      goto parse_bracket_exp_free_return;
 2893: 	    }
 2894: 
 2895: 	  token_len = peek_token_bracket (token, regexp, syntax);
 2896: 	  if (BE (token->type == END_OF_RE, 0))
 2897: 	    {
 2898: 	      *err = REG_BADPAT;
 2899: 	      goto parse_bracket_exp_free_return;
 2900: 	    }
 2901: 	  *err = build_range_exp (sbcset,
 2902: #ifdef RE_ENABLE_I18N
 2903: 				  mbcset, &range_alloc,
 2904: #endif /* RE_ENABLE_I18N */
 2905: 				  &start_elem, &end_elem);
 2906: 	  if (BE (*err != REG_NOERROR, 0))
 2907: 	    goto parse_bracket_exp_free_return;
 2908: 	}
 2909:       else
 2910: 	{
 2911: 	  switch (start_elem.type)
 2912: 	    {
 2913: 	    case SB_CHAR:
 2914: 	      bitset_set (sbcset, start_elem.opr.ch);
 2915: 	      break;
 2916: #ifdef RE_ENABLE_I18N
 2917: 	    case MB_CHAR:
 2918: 	      /* Check whether the array has enough space.  */
 2919: 	      if (mbchar_alloc == mbcset->nmbchars)
 2920: 		{
 2921: 		  /* Not enough, realloc it.  */
 2922: 		  /* +1 in case of mbcset->nmbchars is 0.  */
 2923: 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
 2924: 		  /* Use realloc since array is NULL if *alloc == 0.  */
 2925: 		  mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
 2926: 						mbchar_alloc);
 2927: 		  if (BE (mbcset->mbchars == NULL, 0))
 2928: 		    goto parse_bracket_exp_espace;
 2929: 		}
 2930: 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
 2931: 	      break;
 2932: #endif /* RE_ENABLE_I18N */
 2933: 	    case EQUIV_CLASS:
 2934: 	      *err = build_equiv_class (sbcset,
 2935: #ifdef RE_ENABLE_I18N
 2936: 					mbcset, &equiv_class_alloc,
 2937: #endif /* RE_ENABLE_I18N */
 2938: 					start_elem.opr.name);
 2939: 	      if (BE (*err != REG_NOERROR, 0))
 2940: 		goto parse_bracket_exp_free_return;
 2941: 	      break;
 2942: 	    case COLL_SYM:
 2943: 	      *err = build_collating_symbol (sbcset,
 2944: #ifdef RE_ENABLE_I18N
 2945: 					     mbcset, &coll_sym_alloc,
 2946: #endif /* RE_ENABLE_I18N */
 2947: 					     start_elem.opr.name);
 2948: 	      if (BE (*err != REG_NOERROR, 0))
 2949: 		goto parse_bracket_exp_free_return;
 2950: 	      break;
 2951: 	    case CHAR_CLASS:
 2952: 	      *err = build_charclass (sbcset,
 2953: #ifdef RE_ENABLE_I18N
 2954: 				      mbcset, &char_class_alloc,
 2955: #endif /* RE_ENABLE_I18N */
 2956: 				      start_elem.opr.name, syntax);
 2957: 	      if (BE (*err != REG_NOERROR, 0))
 2958: 	       goto parse_bracket_exp_free_return;
 2959: 	      break;
 2960: 	    default:
 2961: 	      assert (0);
 2962: 	      break;
 2963: 	    }
 2964: 	}
 2965:       if (token->type == OP_CLOSE_BRACKET)
 2966: 	break;
 2967:     }
 2968: 
 2969:   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 2970: 
 2971:   /* If it is non-matching list.  */
 2972: #ifdef RE_ENABLE_I18N
 2973:   if (mbcset->non_match)
 2974: #else /* not RE_ENABLE_I18N */
 2975:   if (non_match)
 2976: #endif /* not RE_ENABLE_I18N */
 2977:     bitset_not (sbcset);
 2978: 
 2979:   /* Build a tree for simple bracket.  */
 2980:   br_token.type = SIMPLE_BRACKET;
 2981:   br_token.opr.sbcset = sbcset;
 2982:   new_idx = re_dfa_add_node (dfa, br_token, 0);
 2983:   work_tree = create_tree (NULL, NULL, 0, new_idx);
 2984:   if (BE (new_idx == -1 || work_tree == NULL, 0))
 2985:     goto parse_bracket_exp_espace;
 2986: 
 2987: #ifdef RE_ENABLE_I18N
 2988:   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
 2989:       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
 2990: 						|| mbcset->non_match)))
 2991:     {
 2992:       re_token_t alt_token;
 2993:       bin_tree_t *mbc_tree;
 2994:       /* Build a tree for complex bracket.  */
 2995:       br_token.type = COMPLEX_BRACKET;
 2996:       br_token.opr.mbcset = mbcset;
 2997:       dfa->has_mb_node = 1;
 2998:       new_idx = re_dfa_add_node (dfa, br_token, 0);
 2999:       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
 3000:       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
 3001: 	goto parse_bracket_exp_espace;
 3002:       /* Then join them by ALT node.  */
 3003:       dfa->has_plural_match = 1;
 3004:       alt_token.type = OP_ALT;
 3005:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
 3006:       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
 3007:       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
 3008: 	return work_tree;
 3009:     }
 3010:   else
 3011:     {
 3012:       free_charset (mbcset);
 3013:       return work_tree;
 3014:     }
 3015: #else /* not RE_ENABLE_I18N */
 3016:   return work_tree;
 3017: #endif /* not RE_ENABLE_I18N */
 3018: 
 3019:  parse_bracket_exp_espace:
 3020:   *err = REG_ESPACE;
 3021:  parse_bracket_exp_free_return:
 3022:   re_free (sbcset);
 3023: #ifdef RE_ENABLE_I18N
 3024:   free_charset (mbcset);
 3025: #endif /* RE_ENABLE_I18N */
 3026:   return NULL;
 3027: }
 3028: 
 3029: /* Parse an element in the bracket expression.  */
 3030: 
 3031: static reg_errcode_t
 3032: parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
 3033:      bracket_elem_t *elem;
 3034:      re_string_t *regexp;
 3035:      re_token_t *token;
 3036:      int token_len;
 3037:      re_dfa_t *dfa;
 3038:      reg_syntax_t syntax;
 3039: {
 3040: #ifdef RE_ENABLE_I18N
 3041:   int cur_char_size;
 3042:   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
 3043:   if (cur_char_size > 1)
 3044:     {
 3045:       elem->type = MB_CHAR;
 3046:       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
 3047:       re_string_skip_bytes (regexp, cur_char_size);
 3048:       return REG_NOERROR;
 3049:     }
 3050: #endif /* RE_ENABLE_I18N */
 3051:   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 3052:   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
 3053:       || token->type == OP_OPEN_EQUIV_CLASS)
 3054:     return parse_bracket_symbol (elem, regexp, token);
 3055:   elem->type = SB_CHAR;
 3056:   elem->opr.ch = token->opr.c;
 3057:   return REG_NOERROR;
 3058: }
 3059: 
 3060: /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
 3061:    such as [:<character_class>:], [.<collating_element>.], and
 3062:    [=<equivalent_class>=].  */
 3063: 
 3064: static reg_errcode_t
 3065: parse_bracket_symbol (elem, regexp, token)
 3066:      bracket_elem_t *elem;
 3067:      re_string_t *regexp;
 3068:      re_token_t *token;
 3069: {
 3070:   unsigned char ch, delim = token->opr.c;
 3071:   int i = 0;
 3072:   for (;; ++i)
 3073:     {
 3074:       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
 3075: 	return REG_EBRACK;
 3076:       if (token->type == OP_OPEN_CHAR_CLASS)
 3077: 	ch = re_string_fetch_byte_case (regexp);
 3078:       else
 3079: 	ch = re_string_fetch_byte (regexp);
 3080:       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
 3081: 	break;
 3082:       elem->opr.name[i] = ch;
 3083:     }
 3084:   re_string_skip_bytes (regexp, 1);
 3085:   elem->opr.name[i] = '\0';
 3086:   switch (token->type)
 3087:     {
 3088:     case OP_OPEN_COLL_ELEM:
 3089:       elem->type = COLL_SYM;
 3090:       break;
 3091:     case OP_OPEN_EQUIV_CLASS:
 3092:       elem->type = EQUIV_CLASS;
 3093:       break;
 3094:     case OP_OPEN_CHAR_CLASS:
 3095:       elem->type = CHAR_CLASS;
 3096:       break;
 3097:     default:
 3098:       break;
 3099:     }
 3100:   return REG_NOERROR;
 3101: }
 3102: 
 3103:   /* Helper function for parse_bracket_exp.
 3104:      Build the equivalence class which is represented by NAME.
 3105:      The result are written to MBCSET and SBCSET.
 3106:      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
 3107:      is a pointer argument sinse we may update it.  */
 3108: 
 3109: static reg_errcode_t
 3110: #ifdef RE_ENABLE_I18N
 3111: build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
 3112:      re_charset_t *mbcset;
 3113:      int *equiv_class_alloc;
 3114: #else /* not RE_ENABLE_I18N */
 3115: build_equiv_class (sbcset, name)
 3116: #endif /* not RE_ENABLE_I18N */
 3117:      re_bitset_ptr_t sbcset;
 3118:      const unsigned char *name;
 3119: {
 3120: #if defined _LIBC && defined RE_ENABLE_I18N
 3121:   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 3122:   if (nrules != 0)
 3123:     {
 3124:       const int32_t *table, *indirect;
 3125:       const unsigned char *weights, *extra, *cp;
 3126:       unsigned char char_buf[2];
 3127:       int32_t idx1, idx2;
 3128:       unsigned int ch;
 3129:       size_t len;
 3130:       /* This #include defines a local function!  */
 3131: # include <locale/weight.h>
 3132:       /* Calculate the index for equivalence class.  */
 3133:       cp = name;
 3134:       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 3135:       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 3136: 					       _NL_COLLATE_WEIGHTMB);
 3137:       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 3138: 						   _NL_COLLATE_EXTRAMB);
 3139:       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 3140: 						_NL_COLLATE_INDIRECTMB);
 3141:       idx1 = findidx (&cp);
 3142:       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
 3143: 	/* This isn't a valid character.  */
 3144: 	return REG_ECOLLATE;
 3145: 
 3146:       /* Build single byte matcing table for this equivalence class.  */
 3147:       char_buf[1] = (unsigned char) '\0';
 3148:       len = weights[idx1];
 3149:       for (ch = 0; ch < SBC_MAX; ++ch)
 3150: 	{
 3151: 	  char_buf[0] = ch;
 3152: 	  cp = char_buf;
 3153: 	  idx2 = findidx (&cp);
 3154: /*
 3155: 	  idx2 = table[ch];
 3156: */
 3157: 	  if (idx2 == 0)
 3158: 	    /* This isn't a valid character.  */
 3159: 	    continue;
 3160: 	  if (len == weights[idx2])
 3161: 	    {
 3162: 	      int cnt = 0;
 3163: 	      while (cnt <= len &&
 3164: 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
 3165: 		++cnt;
 3166: 
 3167: 	      if (cnt > len)
 3168: 		bitset_set (sbcset, ch);
 3169: 	    }
 3170: 	}
 3171:       /* Check whether the array has enough space.  */
 3172:       if (*equiv_class_alloc == mbcset->nequiv_classes)
 3173: 	{
 3174: 	  /* Not enough, realloc it.  */
 3175: 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
 3176: 	  *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
 3177: 	  /* Use realloc since the array is NULL if *alloc == 0.  */
 3178: 	  mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
 3179: 					      *equiv_class_alloc);
 3180: 	  if (BE (mbcset->equiv_classes == NULL, 0))
 3181: 	    return REG_ESPACE;
 3182: 	}
 3183:       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
 3184:     }
 3185:   else
 3186: #endif /* _LIBC && RE_ENABLE_I18N */
 3187:     {
 3188:       if (BE (strlen ((const char *) name) != 1, 0))
 3189: 	return REG_ECOLLATE;
 3190:       bitset_set (sbcset, *name);
 3191:     }
 3192:   return REG_NOERROR;
 3193: }
 3194: 
 3195:   /* Helper function for parse_bracket_exp.
 3196:      Build the character class which is represented by NAME.
 3197:      The result are written to MBCSET and SBCSET.
 3198:      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
 3199:      is a pointer argument sinse we may update it.  */
 3200: 
 3201: static reg_errcode_t
 3202: #ifdef RE_ENABLE_I18N
 3203: build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
 3204:      re_charset_t *mbcset;
 3205:      int *char_class_alloc;
 3206: #else /* not RE_ENABLE_I18N */
 3207: build_charclass (sbcset, class_name, syntax)
 3208: #endif /* not RE_ENABLE_I18N */
 3209:      re_bitset_ptr_t sbcset;
 3210:      const unsigned char *class_name;
 3211:      reg_syntax_t syntax;
 3212: {
 3213:   int i;
 3214:   const char *name = (const char *) class_name;
 3215: 
 3216:   /* In case of REG_ICASE "upper" and "lower" match the both of
 3217:      upper and lower cases.  */
 3218:   if ((syntax & RE_ICASE)
 3219:       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
 3220:     name = "alpha";
 3221: 
 3222: #ifdef RE_ENABLE_I18N
 3223:   /* Check the space of the arrays.  */
 3224:   if (*char_class_alloc == mbcset->nchar_classes)
 3225:     {
 3226:       /* Not enough, realloc it.  */
 3227:       /* +1 in case of mbcset->nchar_classes is 0.  */
 3228:       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
 3229:       /* Use realloc since array is NULL if *alloc == 0.  */
 3230:       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
 3231: 					 *char_class_alloc);
 3232:       if (BE (mbcset->char_classes == NULL, 0))
 3233: 	return REG_ESPACE;
 3234:     }
 3235:   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
 3236: #endif /* RE_ENABLE_I18N */
 3237: 
 3238: #define BUILD_CHARCLASS_LOOP(ctype_func)\
 3239:     for (i = 0; i < SBC_MAX; ++i)	\
 3240:       {					\
 3241: 	if (ctype_func (i))		\
 3242: 	  bitset_set (sbcset, i);	\
 3243:       }
 3244: 
 3245:   if (strcmp (name, "alnum") == 0)
 3246:     BUILD_CHARCLASS_LOOP (isalnum)
 3247:   else if (strcmp (name, "cntrl") == 0)
 3248:     BUILD_CHARCLASS_LOOP (iscntrl)
 3249:   else if (strcmp (name, "lower") == 0)
 3250:     BUILD_CHARCLASS_LOOP (islower)
 3251:   else if (strcmp (name, "space") == 0)
 3252:     BUILD_CHARCLASS_LOOP (isspace)
 3253:   else if (strcmp (name, "alpha") == 0)
 3254:     BUILD_CHARCLASS_LOOP (isalpha)
 3255:   else if (strcmp (name, "digit") == 0)
 3256:     BUILD_CHARCLASS_LOOP (isdigit)
 3257:   else if (strcmp (name, "print") == 0)
 3258:     BUILD_CHARCLASS_LOOP (isprint)
 3259:   else if (strcmp (name, "upper") == 0)
 3260:     BUILD_CHARCLASS_LOOP (isupper)
 3261:   else if (strcmp (name, "blank") == 0)
 3262:     BUILD_CHARCLASS_LOOP (isblank)
 3263:   else if (strcmp (name, "graph") == 0)
 3264:     BUILD_CHARCLASS_LOOP (isgraph)
 3265:   else if (strcmp (name, "punct") == 0)
 3266:     BUILD_CHARCLASS_LOOP (ispunct)
 3267:   else if (strcmp (name, "xdigit") == 0)
 3268:     BUILD_CHARCLASS_LOOP (isxdigit)
 3269:   else
 3270:     return REG_ECTYPE;
 3271: 
 3272:   return REG_NOERROR;
 3273: }
 3274: 
 3275: static bin_tree_t *
 3276: build_word_op (dfa, not, err)
 3277:      re_dfa_t *dfa;
 3278:      int not;
 3279:      reg_errcode_t *err;
 3280: {
 3281:   re_bitset_ptr_t sbcset;
 3282: #ifdef RE_ENABLE_I18N
 3283:   re_charset_t *mbcset;
 3284:   int alloc = 0;
 3285: #else /* not RE_ENABLE_I18N */
 3286:   int non_match = 0;
 3287: #endif /* not RE_ENABLE_I18N */
 3288:   reg_errcode_t ret;
 3289:   re_token_t br_token;
 3290:   bin_tree_t *tree;
 3291:   int new_idx;
 3292: 
 3293:   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
 3294: #ifdef RE_ENABLE_I18N
 3295:   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
 3296: #endif /* RE_ENABLE_I18N */
 3297: 
 3298: #ifdef RE_ENABLE_I18N
 3299:   if (BE (sbcset == NULL || mbcset == NULL, 0))
 3300: #else /* not RE_ENABLE_I18N */
 3301:   if (BE (sbcset == NULL, 0))
 3302: #endif /* not RE_ENABLE_I18N */
 3303:     {
 3304:       *err = REG_ESPACE;
 3305:       return NULL;
 3306:     }
 3307: 
 3308:   if (not)
 3309:     {
 3310: #ifdef RE_ENABLE_I18N
 3311:       int i;
 3312:       /*
 3313:       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
 3314: 	bitset_set(cset->sbcset, '\0');
 3315:       */
 3316:       mbcset->non_match = 1;
 3317:       if (MB_CUR_MAX > 1)
 3318: 	for (i = 0; i < SBC_MAX; ++i)
 3319: 	  if (__btowc (i) == WEOF)
 3320: 	    bitset_set (sbcset, i);
 3321: #else /* not RE_ENABLE_I18N */
 3322:       non_match = 1;
 3323: #endif /* not RE_ENABLE_I18N */
 3324:     }
 3325: 
 3326:   /* We don't care the syntax in this case.  */
 3327:   ret = build_charclass (sbcset,
 3328: #ifdef RE_ENABLE_I18N
 3329: 			 mbcset, &alloc,
 3330: #endif /* RE_ENABLE_I18N */
 3331: 			 (const unsigned char *) "alpha", 0);
 3332: 
 3333:   if (BE (ret != REG_NOERROR, 0))
 3334:     {
 3335:       re_free (sbcset);
 3336: #ifdef RE_ENABLE_I18N
 3337:       free_charset (mbcset);
 3338: #endif /* RE_ENABLE_I18N */
 3339:       *err = ret;
 3340:       return NULL;
 3341:     }
 3342:   /* \w match '_' also.  */
 3343:   bitset_set (sbcset, '_');
 3344: 
 3345:   /* If it is non-matching list.  */
 3346: #ifdef RE_ENABLE_I18N
 3347:   if (mbcset->non_match)
 3348: #else /* not RE_ENABLE_I18N */
 3349:   if (non_match)
 3350: #endif /* not RE_ENABLE_I18N */
 3351:     bitset_not (sbcset);
 3352: 
 3353:   /* Build a tree for simple bracket.  */
 3354:   br_token.type = SIMPLE_BRACKET;
 3355:   br_token.opr.sbcset = sbcset;
 3356:   new_idx = re_dfa_add_node (dfa, br_token, 0);
 3357:   tree = create_tree (NULL, NULL, 0, new_idx);
 3358:   if (BE (new_idx == -1 || tree == NULL, 0))
 3359:     goto build_word_op_espace;
 3360: 
 3361: #ifdef RE_ENABLE_I18N
 3362:   if (MB_CUR_MAX > 1)
 3363:     {
 3364:       re_token_t alt_token;
 3365:       bin_tree_t *mbc_tree;
 3366:       /* Build a tree for complex bracket.  */
 3367:       br_token.type = COMPLEX_BRACKET;
 3368:       br_token.opr.mbcset = mbcset;
 3369:       dfa->has_mb_node = 1;
 3370:       new_idx = re_dfa_add_node (dfa, br_token, 0);
 3371:       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
 3372:       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
 3373: 	goto build_word_op_espace;
 3374:       /* Then join them by ALT node.  */
 3375:       alt_token.type = OP_ALT;
 3376:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
 3377:       tree = create_tree (tree, mbc_tree, 0, new_idx);
 3378:       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
 3379: 	return tree;
 3380:     }
 3381:   else
 3382:     {
 3383:       free_charset (mbcset);
 3384:       return tree;
 3385:     }
 3386: #else /* not RE_ENABLE_I18N */
 3387:   return tree;
 3388: #endif /* not RE_ENABLE_I18N */
 3389: 
 3390:  build_word_op_espace:
 3391:   re_free (sbcset);
 3392: #ifdef RE_ENABLE_I18N
 3393:   free_charset (mbcset);
 3394: #endif /* RE_ENABLE_I18N */
 3395:   *err = REG_ESPACE;
 3396:   return NULL;
 3397: }
 3398: 
 3399: /* This is intended for the expressions like "a{1,3}".
 3400:    Fetch a number from `input', and return the number.
 3401:    Return -1, if the number field is empty like "{,1}".
 3402:    Return -2, If an error is occured.  */
 3403: 
 3404: static int
 3405: fetch_number (input, token, syntax)
 3406:      re_string_t *input;
 3407:      re_token_t *token;
 3408:      reg_syntax_t syntax;
 3409: {
 3410:   int num = -1;
 3411:   unsigned char c;
 3412:   while (1)
 3413:     {
 3414:       *token = fetch_token (input, syntax);
 3415:       c = token->opr.c;
 3416:       if (BE (token->type == END_OF_RE, 0))
 3417: 	return -2;
 3418:       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
 3419: 	break;
 3420:       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
 3421: 	     ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
 3422:       num = (num > RE_DUP_MAX) ? -2 : num;
 3423:     }
 3424:   return num;
 3425: }
 3426: 
 3427: #ifdef RE_ENABLE_I18N
 3428: static void
 3429: free_charset (re_charset_t *cset)
 3430: {
 3431:   re_free (cset->mbchars);
 3432: # ifdef _LIBC
 3433:   re_free (cset->coll_syms);
 3434:   re_free (cset->equiv_classes);
 3435:   re_free (cset->range_starts);
 3436:   re_free (cset->range_ends);
 3437: # endif
 3438:   re_free (cset->char_classes);
 3439:   re_free (cset);
 3440: }
 3441: #endif /* RE_ENABLE_I18N */
 3442: 
 3443: /* Functions for binary tree operation.  */
 3444: 
 3445: /* Create a node of tree.
 3446:    Note: This function automatically free left and right if malloc fails.  */
 3447: 
 3448: static bin_tree_t *
 3449: create_tree (left, right, type, index)
 3450:      bin_tree_t *left;
 3451:      bin_tree_t *right;
 3452:      re_token_type_t type;
 3453:      int index;
 3454: {
 3455:   bin_tree_t *tree;
 3456:   tree = re_malloc (bin_tree_t, 1);
 3457:   if (BE (tree == NULL, 0))
 3458:     {
 3459:       free_bin_tree (left);
 3460:       free_bin_tree (right);
 3461:       return NULL;
 3462:     }
 3463:   tree->parent = NULL;
 3464:   tree->left = left;
 3465:   tree->right = right;
 3466:   tree->type = type;
 3467:   tree->node_idx = index;
 3468:   tree->first = -1;
 3469:   tree->next = -1;
 3470:   re_node_set_init_empty (&tree->eclosure);
 3471: 
 3472:   if (left != NULL)
 3473:     left->parent = tree;
 3474:   if (right != NULL)
 3475:     right->parent = tree;
 3476:   return tree;
 3477: }
 3478: 
 3479: /* Free the sub tree pointed by TREE.  */
 3480: 
 3481: static void
 3482: free_bin_tree (tree)
 3483:      bin_tree_t *tree;
 3484: {
 3485:   if (tree == NULL)
 3486:     return;
 3487:   /*re_node_set_free (&tree->eclosure);*/
 3488:   free_bin_tree (tree->left);
 3489:   free_bin_tree (tree->right);
 3490:   re_free (tree);
 3491: }
 3492: 
 3493: /* Duplicate the node SRC, and return new node.  */
 3494: 
 3495: static bin_tree_t *
 3496: duplicate_tree (src, dfa)
 3497:      const bin_tree_t *src;
 3498:      re_dfa_t *dfa;
 3499: {
 3500:   bin_tree_t *left = NULL, *right = NULL, *new_tree;
 3501:   int new_node_idx;
 3502:   /* Since node indies must be according to Post-order of the tree,
 3503:      we must duplicate the left at first.  */
 3504:   if (src->left != NULL)
 3505:     {
 3506:       left = duplicate_tree (src->left, dfa);
 3507:       if (left == NULL)
 3508: 	return NULL;
 3509:     }
 3510: 
 3511:   /* Secondaly, duplicate the right.  */
 3512:   if (src->right != NULL)
 3513:     {
 3514:       right = duplicate_tree (src->right, dfa);
 3515:       if (right == NULL)
 3516: 	{
 3517: 	  free_bin_tree (left);
 3518: 	  return NULL;
 3519: 	}
 3520:     }
 3521: 
 3522:   /* At last, duplicate itself.  */
 3523:   if (src->type == NON_TYPE)
 3524:     {
 3525:       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
 3526:       dfa->nodes[new_node_idx].duplicated = 1;
 3527:       if (BE (new_node_idx == -1, 0))
 3528: 	{
 3529: 	  free_bin_tree (left);
 3530: 	  free_bin_tree (right);
 3531: 	  return NULL;
 3532: 	}
 3533:     }
 3534:   else
 3535:     new_node_idx = src->type;
 3536: 
 3537:   new_tree = create_tree (left, right, src->type, new_node_idx);
 3538:   if (BE (new_tree == NULL, 0))
 3539:     {
 3540:       free_bin_tree (left);
 3541:       free_bin_tree (right);
 3542:     }
 3543:   return new_tree;
 3544: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>