Annotation of embedaddon/smartmontools/regex/regcomp.c, revision 1.1.1.2

1.1       misho       1: /* Extended regular expression matching and search library.
                      2:    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
                      3:    This file is part of the GNU C Library.
                      4:    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
                      5: 
                      6:    The GNU C Library is free software; you can redistribute it and/or
                      7:    modify it under the terms of the GNU Lesser General Public
                      8:    License as published by the Free Software Foundation; either
                      9:    version 2.1 of the License, or (at your option) any later version.
                     10: 
                     11:    The GNU C Library is distributed in the hope that it will be useful,
                     12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     14:    Lesser General Public License for more details.
                     15: 
                     16:    You should have received a copy of the GNU Lesser General Public
                     17:    License along with the GNU C Library; if not, write to the Free
1.1.1.2 ! misho      18:    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
        !            19:    MA 02110-1301 USA.  */
1.1       misho      20: 
                     21: static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
                     22:                                          int length, reg_syntax_t syntax);
                     23: static void re_compile_fastmap_iter (regex_t *bufp,
                     24:                                     const re_dfastate_t *init_state,
                     25:                                     char *fastmap);
                     26: static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
                     27: static reg_errcode_t init_word_char (re_dfa_t *dfa);
                     28: #ifdef RE_ENABLE_I18N
                     29: static void free_charset (re_charset_t *cset);
                     30: #endif /* RE_ENABLE_I18N */
                     31: static void free_workarea_compile (regex_t *preg);
                     32: static reg_errcode_t create_initial_state (re_dfa_t *dfa);
                     33: static reg_errcode_t analyze (re_dfa_t *dfa);
                     34: static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
                     35: static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
                     36: static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
                     37: static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
                     38: static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
                     39:                                             int top_clone_node, int root_node,
                     40:                                             unsigned int constraint);
                     41: static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
                     42:                                     unsigned int constraint);
                     43: static int search_duplicated_node (re_dfa_t *dfa, int org_node,
                     44:                                   unsigned int constraint);
                     45: static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
                     46: static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
                     47:                                         int node, int root);
                     48: static void calc_inveclosure (re_dfa_t *dfa);
                     49: static int fetch_number (re_string_t *input, re_token_t *token,
                     50:                         reg_syntax_t syntax);
                     51: static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
                     52: static int peek_token (re_token_t *token, re_string_t *input,
                     53:                        reg_syntax_t syntax);
                     54: static int peek_token_bracket (re_token_t *token, re_string_t *input,
                     55:                               reg_syntax_t syntax);
                     56: static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
                     57:                          reg_syntax_t syntax, reg_errcode_t *err);
                     58: static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
                     59:                                  re_token_t *token, reg_syntax_t syntax,
                     60:                                  int nest, reg_errcode_t *err);
                     61: static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
                     62:                                 re_token_t *token, reg_syntax_t syntax,
                     63:                                 int nest, reg_errcode_t *err);
                     64: static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
                     65:                                     re_token_t *token, reg_syntax_t syntax,
                     66:                                     int nest, reg_errcode_t *err);
                     67: static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
                     68:                                  re_token_t *token, reg_syntax_t syntax,
                     69:                                  int nest, reg_errcode_t *err);
                     70: static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
                     71:                                 re_dfa_t *dfa, re_token_t *token,
                     72:                                 reg_syntax_t syntax, reg_errcode_t *err);
                     73: static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
                     74:                                      re_token_t *token, reg_syntax_t syntax,
                     75:                                      reg_errcode_t *err);
                     76: static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
                     77:                                            re_string_t *regexp,
                     78:                                            re_token_t *token, int token_len,
                     79:                                            re_dfa_t *dfa,
                     80:                                            reg_syntax_t syntax);
                     81: static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
                     82:                                          re_string_t *regexp,
                     83:                                          re_token_t *token);
                     84: #ifndef _LIBC
                     85: # ifdef RE_ENABLE_I18N
                     86: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
                     87:                                      re_charset_t *mbcset, int *range_alloc,
                     88:                                      bracket_elem_t *start_elem,
                     89:                                      bracket_elem_t *end_elem);
                     90: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
                     91:                                             re_charset_t *mbcset,
                     92:                                             int *coll_sym_alloc,
                     93:                                             const unsigned char *name);
                     94: # else /* not RE_ENABLE_I18N */
                     95: static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
                     96:                                      bracket_elem_t *start_elem,
                     97:                                      bracket_elem_t *end_elem);
                     98: static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
                     99:                                             const unsigned char *name);
                    100: # endif /* not RE_ENABLE_I18N */
                    101: #endif /* not _LIBC */
                    102: #ifdef RE_ENABLE_I18N
                    103: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
                    104:                                        re_charset_t *mbcset,
                    105:                                        int *equiv_class_alloc,
                    106:                                        const unsigned char *name);
                    107: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
                    108:                                      re_charset_t *mbcset,
                    109:                                      int *char_class_alloc,
                    110:                                      const unsigned char *class_name,
                    111:                                      reg_syntax_t syntax);
                    112: #else  /* not RE_ENABLE_I18N */
                    113: static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
                    114:                                        const unsigned char *name);
                    115: static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
                    116:                                      const unsigned char *class_name,
                    117:                                      reg_syntax_t syntax);
                    118: #endif /* not RE_ENABLE_I18N */
                    119: static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
                    120: static void free_bin_tree (bin_tree_t *tree);
                    121: static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
                    122:                                re_token_type_t type, int index);
                    123: static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
                    124: 
                    125: /* This table gives an error message for each of the error codes listed
                    126:    in regex.h.  Obviously the order here has to be same as there.
                    127:    POSIX doesn't require that we do anything for REG_NOERROR,
                    128:    but why not be nice?  */
                    129: 
                    130: const char __re_error_msgid[] attribute_hidden =
                    131:   {
                    132: #define REG_NOERROR_IDX        0
                    133:     gettext_noop ("Success")   /* REG_NOERROR */
                    134:     "\0"
                    135: #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
                    136:     gettext_noop ("No match")  /* REG_NOMATCH */
                    137:     "\0"
                    138: #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
                    139:     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
                    140:     "\0"
                    141: #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
                    142:     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
                    143:     "\0"
                    144: #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
                    145:     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
                    146:     "\0"
                    147: #define REG_EESCAPE_IDX        (REG_ECTYPE_IDX + sizeof "Invalid character class name")
                    148:     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
                    149:     "\0"
                    150: #define REG_ESUBREG_IDX        (REG_EESCAPE_IDX + sizeof "Trailing backslash")
                    151:     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
                    152:     "\0"
                    153: #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
                    154:     gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
                    155:     "\0"
                    156: #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
                    157:     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
                    158:     "\0"
                    159: #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
                    160:     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
                    161:     "\0"
                    162: #define REG_BADBR_IDX  (REG_EBRACE_IDX + sizeof "Unmatched \\{")
                    163:     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
                    164:     "\0"
                    165: #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
                    166:     gettext_noop ("Invalid range end") /* REG_ERANGE */
                    167:     "\0"
                    168: #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
                    169:     gettext_noop ("Memory exhausted") /* REG_ESPACE */
                    170:     "\0"
                    171: #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
                    172:     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
                    173:     "\0"
                    174: #define REG_EEND_IDX   (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
                    175:     gettext_noop ("Premature end of regular expression") /* REG_EEND */
                    176:     "\0"
                    177: #define REG_ESIZE_IDX  (REG_EEND_IDX + sizeof "Premature end of regular expression")
                    178:     gettext_noop ("Regular expression too big") /* REG_ESIZE */
                    179:     "\0"
                    180: #define REG_ERPAREN_IDX        (REG_ESIZE_IDX + sizeof "Regular expression too big")
                    181:     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
                    182:   };
                    183: 
                    184: const size_t __re_error_msgid_idx[] attribute_hidden =
                    185:   {
                    186:     REG_NOERROR_IDX,
                    187:     REG_NOMATCH_IDX,
                    188:     REG_BADPAT_IDX,
                    189:     REG_ECOLLATE_IDX,
                    190:     REG_ECTYPE_IDX,
                    191:     REG_EESCAPE_IDX,
                    192:     REG_ESUBREG_IDX,
                    193:     REG_EBRACK_IDX,
                    194:     REG_EPAREN_IDX,
                    195:     REG_EBRACE_IDX,
                    196:     REG_BADBR_IDX,
                    197:     REG_ERANGE_IDX,
                    198:     REG_ESPACE_IDX,
                    199:     REG_BADRPT_IDX,
                    200:     REG_EEND_IDX,
                    201:     REG_ESIZE_IDX,
                    202:     REG_ERPAREN_IDX
                    203:   };
                    204: 
                    205: /* Entry points for GNU code.  */
                    206: 
                    207: /* re_compile_pattern is the GNU regular expression compiler: it
                    208:    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
                    209:    Returns 0 if the pattern was valid, otherwise an error string.
                    210: 
                    211:    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
                    212:    are set in BUFP on entry.  */
                    213: 
                    214: const char *
                    215: re_compile_pattern (pattern, length, bufp)
                    216:     const char *pattern;
                    217:     size_t length;
                    218:     struct re_pattern_buffer *bufp;
                    219: {
                    220:   reg_errcode_t ret;
                    221: 
                    222:   /* And GNU code determines whether or not to get register information
                    223:      by passing null for the REGS argument to re_match, etc., not by
                    224:      setting no_sub.  */
                    225:   bufp->no_sub = 0;
                    226: 
                    227:   /* Match anchors at newline.  */
                    228:   bufp->newline_anchor = 1;
                    229: 
                    230:   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
                    231: 
                    232:   if (!ret)
                    233:     return NULL;
                    234:   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
                    235: }
                    236: #ifdef _LIBC
                    237: weak_alias (__re_compile_pattern, re_compile_pattern)
                    238: #endif
                    239: 
                    240: /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
                    241:    also be assigned to arbitrarily: each pattern buffer stores its own
                    242:    syntax, so it can be changed between regex compilations.  */
                    243: /* This has no initializer because initialized variables in Emacs
                    244:    become read-only after dumping.  */
                    245: reg_syntax_t re_syntax_options;
                    246: 
                    247: 
                    248: /* Specify the precise syntax of regexps for compilation.  This provides
                    249:    for compatibility for various utilities which historically have
                    250:    different, incompatible syntaxes.
                    251: 
                    252:    The argument SYNTAX is a bit mask comprised of the various bits
                    253:    defined in regex.h.  We return the old syntax.  */
                    254: 
                    255: reg_syntax_t
                    256: re_set_syntax (syntax)
                    257:     reg_syntax_t syntax;
                    258: {
                    259:   reg_syntax_t ret = re_syntax_options;
                    260: 
                    261:   re_syntax_options = syntax;
                    262:   return ret;
                    263: }
                    264: #ifdef _LIBC
                    265: weak_alias (__re_set_syntax, re_set_syntax)
                    266: #endif
                    267: 
                    268: int
                    269: re_compile_fastmap (bufp)
                    270:     struct re_pattern_buffer *bufp;
                    271: {
                    272:   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
                    273:   char *fastmap = bufp->fastmap;
                    274: 
                    275:   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
                    276:   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
                    277:   if (dfa->init_state != dfa->init_state_word)
                    278:     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
                    279:   if (dfa->init_state != dfa->init_state_nl)
                    280:     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
                    281:   if (dfa->init_state != dfa->init_state_begbuf)
                    282:     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
                    283:   bufp->fastmap_accurate = 1;
                    284:   return 0;
                    285: }
                    286: #ifdef _LIBC
                    287: weak_alias (__re_compile_fastmap, re_compile_fastmap)
                    288: #endif
                    289: 
                    290: static inline void
                    291: re_set_fastmap (char *fastmap, int icase, int ch)
                    292: {
                    293:   fastmap[ch] = 1;
                    294:   if (icase)
                    295:     fastmap[tolower (ch)] = 1;
                    296: }
                    297: 
                    298: /* Helper function for re_compile_fastmap.
                    299:    Compile fastmap for the initial_state INIT_STATE.  */
                    300: 
                    301: static void
                    302: re_compile_fastmap_iter (bufp, init_state, fastmap)
                    303:      regex_t *bufp;
                    304:      const re_dfastate_t *init_state;
                    305:      char *fastmap;
                    306: {
                    307:   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
                    308:   int node_cnt;
                    309:   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
                    310:   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
                    311:     {
                    312:       int node = init_state->nodes.elems[node_cnt];
                    313:       re_token_type_t type = dfa->nodes[node].type;
                    314: 
                    315:       if (type == CHARACTER)
                    316:        re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
                    317:       else if (type == SIMPLE_BRACKET)
                    318:        {
                    319:          int i, j, ch;
                    320:          for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
                    321:            for (j = 0; j < UINT_BITS; ++j, ++ch)
                    322:              if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
                    323:                re_set_fastmap (fastmap, icase, ch);
                    324:        }
                    325: #ifdef RE_ENABLE_I18N
                    326:       else if (type == COMPLEX_BRACKET)
                    327:        {
                    328:          int i;
                    329:          re_charset_t *cset = dfa->nodes[node].opr.mbcset;
                    330:          if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
                    331:              || cset->nranges || cset->nchar_classes)
                    332:            {
                    333: # ifdef _LIBC
                    334:              if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
                    335:                {
                    336:                  /* In this case we want to catch the bytes which are
                    337:                     the first byte of any collation elements.
                    338:                     e.g. In da_DK, we want to catch 'a' since "aa"
                    339:                          is a valid collation element, and don't catch
                    340:                          'b' since 'b' is the only collation element
                    341:                          which starts from 'b'.  */
                    342:                  int j, ch;
                    343:                  const int32_t *table = (const int32_t *)
                    344:                    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
                    345:                  for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
                    346:                    for (j = 0; j < UINT_BITS; ++j, ++ch)
                    347:                      if (table[ch] < 0)
                    348:                        re_set_fastmap (fastmap, icase, ch);
                    349:                }
                    350: # else
                    351:              if (MB_CUR_MAX > 1)
                    352:                for (i = 0; i < SBC_MAX; ++i)
                    353:                  if (__btowc (i) == WEOF)
                    354:                    re_set_fastmap (fastmap, icase, i);
                    355: # endif /* not _LIBC */
                    356:            }
                    357:          for (i = 0; i < cset->nmbchars; ++i)
                    358:            {
                    359:              char buf[256];
                    360:              mbstate_t state;
                    361:              memset (&state, '\0', sizeof (state));
                    362:              __wcrtomb (buf, cset->mbchars[i], &state);
                    363:              re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
                    364:            }
                    365:        }
                    366: #endif /* RE_ENABLE_I18N */
                    367:       else if (type == END_OF_RE || type == OP_PERIOD)
                    368:        {
                    369:          memset (fastmap, '\1', sizeof (char) * SBC_MAX);
                    370:          if (type == END_OF_RE)
                    371:            bufp->can_be_null = 1;
                    372:          return;
                    373:        }
                    374:     }
                    375: }
                    376: 
                    377: /* Entry point for POSIX code.  */
                    378: /* regcomp takes a regular expression as a string and compiles it.
                    379: 
                    380:    PREG is a regex_t *.  We do not expect any fields to be initialized,
                    381:    since POSIX says we shouldn't.  Thus, we set
                    382: 
                    383:      `buffer' to the compiled pattern;
                    384:      `used' to the length of the compiled pattern;
                    385:      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
                    386:        REG_EXTENDED bit in CFLAGS is set; otherwise, to
                    387:        RE_SYNTAX_POSIX_BASIC;
                    388:      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
                    389:      `fastmap' to an allocated space for the fastmap;
                    390:      `fastmap_accurate' to zero;
                    391:      `re_nsub' to the number of subexpressions in PATTERN.
                    392: 
                    393:    PATTERN is the address of the pattern string.
                    394: 
                    395:    CFLAGS is a series of bits which affect compilation.
                    396: 
                    397:      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
                    398:      use POSIX basic syntax.
                    399: 
                    400:      If REG_NEWLINE is set, then . and [^...] don't match newline.
                    401:      Also, regexec will try a match beginning after every newline.
                    402: 
                    403:      If REG_ICASE is set, then we considers upper- and lowercase
                    404:      versions of letters to be equivalent when matching.
                    405: 
                    406:      If REG_NOSUB is set, then when PREG is passed to regexec, that
                    407:      routine will report only success or failure, and nothing about the
                    408:      registers.
                    409: 
                    410:    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
                    411:    the return codes and their meanings.)  */
                    412: 
                    413: int
                    414: regcomp (preg, pattern, cflags)
                    415:     regex_t *__restrict preg;
                    416:     const char *__restrict pattern;
                    417:     int cflags;
                    418: {
                    419:   reg_errcode_t ret;
                    420:   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
                    421:                         : RE_SYNTAX_POSIX_BASIC);
                    422: 
                    423:   preg->buffer = NULL;
                    424:   preg->allocated = 0;
                    425:   preg->used = 0;
                    426: 
                    427:   /* Try to allocate space for the fastmap.  */
                    428:   preg->fastmap = re_malloc (char, SBC_MAX);
                    429:   if (BE (preg->fastmap == NULL, 0))
                    430:     return REG_ESPACE;
                    431: 
                    432:   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
                    433: 
                    434:   /* If REG_NEWLINE is set, newlines are treated differently.  */
                    435:   if (cflags & REG_NEWLINE)
                    436:     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
                    437:       syntax &= ~RE_DOT_NEWLINE;
                    438:       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
                    439:       /* It also changes the matching behavior.  */
                    440:       preg->newline_anchor = 1;
                    441:     }
                    442:   else
                    443:     preg->newline_anchor = 0;
                    444:   preg->no_sub = !!(cflags & REG_NOSUB);
                    445:   preg->translate = NULL;
                    446: 
                    447:   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
                    448: 
                    449:   /* POSIX doesn't distinguish between an unmatched open-group and an
                    450:      unmatched close-group: both are REG_EPAREN.  */
                    451:   if (ret == REG_ERPAREN)
                    452:     ret = REG_EPAREN;
                    453: 
                    454:   /* We have already checked preg->fastmap != NULL.  */
                    455:   if (BE (ret == REG_NOERROR, 1))
                    456:     /* Compute the fastmap now, since regexec cannot modify the pattern
                    457:        buffer.  This function nevers fails in this implementation.  */
                    458:     (void) re_compile_fastmap (preg);
                    459:   else
                    460:     {
                    461:       /* Some error occurred while compiling the expression.  */
                    462:       re_free (preg->fastmap);
                    463:       preg->fastmap = NULL;
                    464:     }
                    465: 
                    466:   return (int) ret;
                    467: }
                    468: #ifdef _LIBC
                    469: weak_alias (__regcomp, regcomp)
                    470: #endif
                    471: 
                    472: /* Returns a message corresponding to an error code, ERRCODE, returned
                    473:    from either regcomp or regexec.   We don't use PREG here.  */
                    474: 
                    475: size_t
                    476: regerror (errcode, preg, errbuf, errbuf_size)
                    477:     int errcode;
                    478:     const regex_t *preg;
                    479:     char *errbuf;
                    480:     size_t errbuf_size;
                    481: {
                    482:   const char *msg;
                    483:   size_t msg_size;
                    484: 
                    485:   if (BE (errcode < 0
                    486:          || errcode >= (int) (sizeof (__re_error_msgid_idx)
                    487:                               / sizeof (__re_error_msgid_idx[0])), 0))
                    488:     /* Only error codes returned by the rest of the code should be passed
                    489:        to this routine.  If we are given anything else, or if other regex
                    490:        code generates an invalid error code, then the program has a bug.
                    491:        Dump core so we can fix it.  */
                    492:     abort ();
                    493: 
                    494:   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
                    495: 
                    496:   msg_size = strlen (msg) + 1; /* Includes the null.  */
                    497: 
                    498:   if (BE (errbuf_size != 0, 1))
                    499:     {
                    500:       if (BE (msg_size > errbuf_size, 0))
                    501:        {
                    502: #if defined HAVE_MEMPCPY || defined _LIBC
                    503:          *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
                    504: #else
                    505:          memcpy (errbuf, msg, errbuf_size - 1);
                    506:          errbuf[errbuf_size - 1] = 0;
                    507: #endif
                    508:        }
                    509:       else
                    510:        memcpy (errbuf, msg, msg_size);
                    511:     }
                    512: 
                    513:   return msg_size;
                    514: }
                    515: #ifdef _LIBC
                    516: weak_alias (__regerror, regerror)
                    517: #endif
                    518: 
                    519: 
                    520: static void
                    521: free_dfa_content (re_dfa_t *dfa)
                    522: {
                    523:   int i, j;
                    524: 
                    525:   re_free (dfa->subexps);
                    526: 
                    527:   for (i = 0; i < dfa->nodes_len; ++i)
                    528:     {
                    529:       re_token_t *node = dfa->nodes + i;
                    530: #ifdef RE_ENABLE_I18N
                    531:       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
                    532:        free_charset (node->opr.mbcset);
                    533:       else
                    534: #endif /* RE_ENABLE_I18N */
                    535:        if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
                    536:          re_free (node->opr.sbcset);
                    537:     }
                    538:   re_free (dfa->nexts);
                    539:   for (i = 0; i < dfa->nodes_len; ++i)
                    540:     {
                    541:       if (dfa->eclosures != NULL)
                    542:        re_node_set_free (dfa->eclosures + i);
                    543:       if (dfa->inveclosures != NULL)
                    544:        re_node_set_free (dfa->inveclosures + i);
                    545:       if (dfa->edests != NULL)
                    546:        re_node_set_free (dfa->edests + i);
                    547:     }
                    548:   re_free (dfa->edests);
                    549:   re_free (dfa->eclosures);
                    550:   re_free (dfa->inveclosures);
                    551:   re_free (dfa->nodes);
                    552: 
                    553:   for (i = 0; i <= dfa->state_hash_mask; ++i)
                    554:     {
                    555:       struct re_state_table_entry *entry = dfa->state_table + i;
                    556:       for (j = 0; j < entry->num; ++j)
                    557:        {
                    558:          re_dfastate_t *state = entry->array[j];
                    559:          free_state (state);
                    560:        }
                    561:       re_free (entry->array);
                    562:     }
                    563:   re_free (dfa->state_table);
                    564: 
                    565:   if (dfa->word_char != NULL)
                    566:     re_free (dfa->word_char);
                    567: #ifdef DEBUG
                    568:   re_free (dfa->re_str);
                    569: #endif
                    570: 
                    571:   re_free (dfa);
                    572: }
                    573: 
                    574: 
                    575: /* Free dynamically allocated space used by PREG.  */
                    576: 
                    577: void
                    578: regfree (preg)
                    579:     regex_t *preg;
                    580: {
                    581:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                    582:   if (BE (dfa != NULL, 1))
                    583:     free_dfa_content (dfa);
                    584: 
                    585:   re_free (preg->fastmap);
                    586: }
                    587: #ifdef _LIBC
                    588: weak_alias (__regfree, regfree)
                    589: #endif
                    590: 
                    591: /* Entry points compatible with 4.2 BSD regex library.  We don't define
                    592:    them unless specifically requested.  */
                    593: 
                    594: #if defined _REGEX_RE_COMP || defined _LIBC
                    595: 
                    596: /* BSD has one and only one pattern buffer.  */
                    597: static struct re_pattern_buffer re_comp_buf;
                    598: 
                    599: char *
                    600: # ifdef _LIBC
                    601: /* Make these definitions weak in libc, so POSIX programs can redefine
                    602:    these names if they don't use our functions, and still use
                    603:    regcomp/regexec above without link errors.  */
                    604: weak_function
                    605: # endif
                    606: re_comp (s)
                    607:      const char *s;
                    608: {
                    609:   reg_errcode_t ret;
                    610:   char *fastmap;
                    611: 
                    612:   if (!s)
                    613:     {
                    614:       if (!re_comp_buf.buffer)
                    615:        return gettext ("No previous regular expression");
                    616:       return 0;
                    617:     }
                    618: 
                    619:   if (re_comp_buf.buffer)
                    620:     {
                    621:       fastmap = re_comp_buf.fastmap;
                    622:       re_comp_buf.fastmap = NULL;
                    623:       __regfree (&re_comp_buf);
                    624:       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
                    625:       re_comp_buf.fastmap = fastmap;
                    626:     }
                    627: 
                    628:   if (re_comp_buf.fastmap == NULL)
                    629:     {
                    630:       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
                    631:       if (re_comp_buf.fastmap == NULL)
                    632:        return (char *) gettext (__re_error_msgid
                    633:                                 + __re_error_msgid_idx[(int) REG_ESPACE]);
                    634:     }
                    635: 
                    636:   /* Since `re_exec' always passes NULL for the `regs' argument, we
                    637:      don't need to initialize the pattern buffer fields which affect it.  */
                    638: 
                    639:   /* Match anchors at newlines.  */
                    640:   re_comp_buf.newline_anchor = 1;
                    641: 
                    642:   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
                    643: 
                    644:   if (!ret)
                    645:     return NULL;
                    646: 
                    647:   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
                    648:   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
                    649: }
                    650: 
                    651: #ifdef _LIBC
                    652: libc_freeres_fn (free_mem)
                    653: {
                    654:   __regfree (&re_comp_buf);
                    655: }
                    656: #endif
                    657: 
                    658: #endif /* _REGEX_RE_COMP */
                    659: 
                    660: /* Internal entry point.
                    661:    Compile the regular expression PATTERN, whose length is LENGTH.
                    662:    SYNTAX indicate regular expression's syntax.  */
                    663: 
                    664: static reg_errcode_t
                    665: re_compile_internal (preg, pattern, length, syntax)
                    666:      regex_t *preg;
                    667:      const char * pattern;
                    668:      int length;
                    669:      reg_syntax_t syntax;
                    670: {
                    671:   reg_errcode_t err = REG_NOERROR;
                    672:   re_dfa_t *dfa;
                    673:   re_string_t regexp;
                    674: 
                    675:   /* Initialize the pattern buffer.  */
                    676:   preg->fastmap_accurate = 0;
                    677:   preg->syntax = syntax;
                    678:   preg->not_bol = preg->not_eol = 0;
                    679:   preg->used = 0;
                    680:   preg->re_nsub = 0;
                    681:   preg->can_be_null = 0;
                    682:   preg->regs_allocated = REGS_UNALLOCATED;
                    683: 
                    684:   /* Initialize the dfa.  */
                    685:   dfa = (re_dfa_t *) preg->buffer;
                    686:   if (preg->allocated < sizeof (re_dfa_t))
                    687:     {
                    688:       /* If zero allocated, but buffer is non-null, try to realloc
                    689:         enough space.  This loses if buffer's address is bogus, but
                    690:         that is the user's responsibility.  If ->buffer is NULL this
                    691:         is a simple allocation.  */
                    692:       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
                    693:       if (dfa == NULL)
                    694:        return REG_ESPACE;
                    695:       preg->allocated = sizeof (re_dfa_t);
                    696:     }
                    697:   preg->buffer = (unsigned char *) dfa;
                    698:   preg->used = sizeof (re_dfa_t);
                    699: 
                    700:   err = init_dfa (dfa, length);
                    701:   if (BE (err != REG_NOERROR, 0))
                    702:     {
                    703:       re_free (dfa);
                    704:       preg->buffer = NULL;
                    705:       preg->allocated = 0;
                    706:       return err;
                    707:     }
                    708: #ifdef DEBUG
                    709:   dfa->re_str = re_malloc (char, length + 1);
                    710:   strncpy (dfa->re_str, pattern, length + 1);
                    711: #endif
                    712: 
                    713:   err = re_string_construct (&regexp, pattern, length, preg->translate,
                    714:                             syntax & RE_ICASE);
                    715:   if (BE (err != REG_NOERROR, 0))
                    716:     {
                    717:       re_free (dfa);
                    718:       preg->buffer = NULL;
                    719:       preg->allocated = 0;
                    720:       return err;
                    721:     }
                    722: 
                    723:   /* Parse the regular expression, and build a structure tree.  */
                    724:   preg->re_nsub = 0;
                    725:   dfa->str_tree = parse (&regexp, preg, syntax, &err);
                    726:   if (BE (dfa->str_tree == NULL, 0))
                    727:     goto re_compile_internal_free_return;
                    728: 
                    729:   /* Analyze the tree and collect information which is necessary to
                    730:      create the dfa.  */
                    731:   err = analyze (dfa);
                    732:   if (BE (err != REG_NOERROR, 0))
                    733:     goto re_compile_internal_free_return;
                    734: 
                    735:   /* Then create the initial state of the dfa.  */
                    736:   err = create_initial_state (dfa);
                    737: 
                    738:   /* Release work areas.  */
                    739:   free_workarea_compile (preg);
                    740:   re_string_destruct (&regexp);
                    741: 
                    742:   if (BE (err != REG_NOERROR, 0))
                    743:     {
                    744:     re_compile_internal_free_return:
                    745:       free_dfa_content (dfa);
                    746:       preg->buffer = NULL;
                    747:       preg->allocated = 0;
                    748:     }
                    749: 
                    750:   return err;
                    751: }
                    752: 
                    753: /* Initialize DFA.  We use the length of the regular expression PAT_LEN
                    754:    as the initial length of some arrays.  */
                    755: 
                    756: static reg_errcode_t
                    757: init_dfa (dfa, pat_len)
                    758:      re_dfa_t *dfa;
                    759:      int pat_len;
                    760: {
                    761:   int table_size;
                    762: 
                    763:   memset (dfa, '\0', sizeof (re_dfa_t));
                    764: 
                    765:   dfa->nodes_alloc = pat_len + 1;
                    766:   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
                    767: 
                    768:   dfa->states_alloc = pat_len + 1;
                    769: 
                    770:   /*  table_size = 2 ^ ceil(log pat_len) */
                    771:   for (table_size = 1; table_size > 0; table_size <<= 1)
                    772:     if (table_size > pat_len)
                    773:       break;
                    774: 
                    775:   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
                    776:   dfa->state_hash_mask = table_size - 1;
                    777: 
                    778:   dfa->subexps_alloc = 1;
                    779:   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
                    780:   dfa->word_char = NULL;
                    781: 
                    782:   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
                    783:          || dfa->subexps == NULL, 0))
                    784:     {
                    785:       /* We don't bother to free anything which was allocated.  Very
                    786:         soon the process will go down anyway.  */
                    787:       dfa->subexps = NULL;
                    788:       dfa->state_table = NULL;
                    789:       dfa->nodes = NULL;
                    790:       return REG_ESPACE;
                    791:     }
                    792:   return REG_NOERROR;
                    793: }
                    794: 
                    795: /* Initialize WORD_CHAR table, which indicate which character is
                    796:    "word".  In this case "word" means that it is the word construction
                    797:    character used by some operators like "\<", "\>", etc.  */
                    798: 
                    799: static reg_errcode_t
                    800: init_word_char (dfa)
                    801:      re_dfa_t *dfa;
                    802: {
                    803:   int i, j, ch;
                    804:   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
                    805:   if (BE (dfa->word_char == NULL, 0))
                    806:     return REG_ESPACE;
                    807:   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
                    808:     for (j = 0; j < UINT_BITS; ++j, ++ch)
                    809:       if (isalnum (ch) || ch == '_')
                    810:        dfa->word_char[i] |= 1 << j;
                    811:   return REG_NOERROR;
                    812: }
                    813: 
                    814: /* Free the work area which are only used while compiling.  */
                    815: 
                    816: static void
                    817: free_workarea_compile (preg)
                    818:      regex_t *preg;
                    819: {
                    820:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                    821:   free_bin_tree (dfa->str_tree);
                    822:   dfa->str_tree = NULL;
                    823:   re_free (dfa->org_indices);
                    824:   dfa->org_indices = NULL;
                    825: }
                    826: 
                    827: /* Create initial states for all contexts.  */
                    828: 
                    829: static reg_errcode_t
                    830: create_initial_state (dfa)
                    831:      re_dfa_t *dfa;
                    832: {
                    833:   int first, i;
                    834:   reg_errcode_t err;
                    835:   re_node_set init_nodes;
                    836: 
                    837:   /* Initial states have the epsilon closure of the node which is
                    838:      the first node of the regular expression.  */
                    839:   first = dfa->str_tree->first;
                    840:   dfa->init_node = first;
                    841:   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
                    842:   if (BE (err != REG_NOERROR, 0))
                    843:     return err;
                    844: 
                    845:   /* The back-references which are in initial states can epsilon transit,
                    846:      since in this case all of the subexpressions can be null.
                    847:      Then we add epsilon closures of the nodes which are the next nodes of
                    848:      the back-references.  */
                    849:   if (dfa->nbackref > 0)
                    850:     for (i = 0; i < init_nodes.nelem; ++i)
                    851:       {
                    852:        int node_idx = init_nodes.elems[i];
                    853:        re_token_type_t type = dfa->nodes[node_idx].type;
                    854: 
                    855:        int clexp_idx;
                    856:        if (type != OP_BACK_REF)
                    857:          continue;
                    858:        for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
                    859:          {
                    860:            re_token_t *clexp_node;
                    861:            clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
                    862:            if (clexp_node->type == OP_CLOSE_SUBEXP
                    863:                && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
                    864:              break;
                    865:          }
                    866:        if (clexp_idx == init_nodes.nelem)
                    867:          continue;
                    868: 
                    869:        if (type == OP_BACK_REF)
                    870:          {
                    871:            int dest_idx = dfa->edests[node_idx].elems[0];
                    872:            if (!re_node_set_contains (&init_nodes, dest_idx))
                    873:              {
                    874:                re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
                    875:                i = 0;
                    876:              }
                    877:          }
                    878:       }
                    879: 
                    880:   /* It must be the first time to invoke acquire_state.  */
                    881:   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
                    882:   /* We don't check ERR here, since the initial state must not be NULL.  */
                    883:   if (BE (dfa->init_state == NULL, 0))
                    884:     return err;
                    885:   if (dfa->init_state->has_constraint)
                    886:     {
                    887:       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
                    888:                                                       CONTEXT_WORD);
                    889:       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
                    890:                                                     CONTEXT_NEWLINE);
                    891:       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
                    892:                                                         &init_nodes,
                    893:                                                         CONTEXT_NEWLINE
                    894:                                                         | CONTEXT_BEGBUF);
                    895:       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
                    896:              || dfa->init_state_begbuf == NULL, 0))
                    897:        return err;
                    898:     }
                    899:   else
                    900:     dfa->init_state_word = dfa->init_state_nl
                    901:       = dfa->init_state_begbuf = dfa->init_state;
                    902: 
                    903:   re_node_set_free (&init_nodes);
                    904:   return REG_NOERROR;
                    905: }
                    906: 
                    907: /* Analyze the structure tree, and calculate "first", "next", "edest",
                    908:    "eclosure", and "inveclosure".  */
                    909: 
                    910: static reg_errcode_t
                    911: analyze (dfa)
                    912:      re_dfa_t *dfa;
                    913: {
                    914:   int i;
                    915:   reg_errcode_t ret;
                    916: 
                    917:   /* Allocate arrays.  */
                    918:   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
                    919:   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
                    920:   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
                    921:   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
                    922:   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
                    923:   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
                    924:          || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
                    925:     return REG_ESPACE;
                    926:   /* Initialize them.  */
                    927:   for (i = 0; i < dfa->nodes_len; ++i)
                    928:     {
                    929:       dfa->nexts[i] = -1;
                    930:       re_node_set_init_empty (dfa->edests + i);
                    931:       re_node_set_init_empty (dfa->eclosures + i);
                    932:       re_node_set_init_empty (dfa->inveclosures + i);
                    933:     }
                    934: 
                    935:   ret = analyze_tree (dfa, dfa->str_tree);
                    936:   if (BE (ret == REG_NOERROR, 1))
                    937:     {
                    938:       ret = calc_eclosure (dfa);
                    939:       if (ret == REG_NOERROR)
                    940:        calc_inveclosure (dfa);
                    941:     }
                    942:   return ret;
                    943: }
                    944: 
                    945: /* Helper functions for analyze.
                    946:    This function calculate "first", "next", and "edest" for the subtree
                    947:    whose root is NODE.  */
                    948: 
                    949: static reg_errcode_t
                    950: analyze_tree (dfa, node)
                    951:      re_dfa_t *dfa;
                    952:      bin_tree_t *node;
                    953: {
                    954:   reg_errcode_t ret;
                    955:   if (node->first == -1)
                    956:     calc_first (dfa, node);
                    957:   if (node->next == -1)
                    958:     calc_next (dfa, node);
                    959:   if (node->eclosure.nelem == 0)
                    960:     calc_epsdest (dfa, node);
                    961:   /* Calculate "first" etc. for the left child.  */
                    962:   if (node->left != NULL)
                    963:     {
                    964:       ret = analyze_tree (dfa, node->left);
                    965:       if (BE (ret != REG_NOERROR, 0))
                    966:        return ret;
                    967:     }
                    968:   /* Calculate "first" etc. for the right child.  */
                    969:   if (node->right != NULL)
                    970:     {
                    971:       ret = analyze_tree (dfa, node->right);
                    972:       if (BE (ret != REG_NOERROR, 0))
                    973:        return ret;
                    974:     }
                    975:   return REG_NOERROR;
                    976: }
                    977: 
                    978: /* Calculate "first" for the node NODE.  */
                    979: static void
                    980: calc_first (dfa, node)
                    981:      re_dfa_t *dfa;
                    982:      bin_tree_t *node;
                    983: {
                    984:   int idx, type;
                    985:   idx = node->node_idx;
                    986:   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
                    987: 
                    988:   switch (type)
                    989:     {
                    990: #ifdef DEBUG
                    991:     case OP_OPEN_BRACKET:
                    992:     case OP_CLOSE_BRACKET:
                    993:     case OP_OPEN_DUP_NUM:
                    994:     case OP_CLOSE_DUP_NUM:
                    995:     case OP_NON_MATCH_LIST:
                    996:     case OP_OPEN_COLL_ELEM:
                    997:     case OP_CLOSE_COLL_ELEM:
                    998:     case OP_OPEN_EQUIV_CLASS:
                    999:     case OP_CLOSE_EQUIV_CLASS:
                   1000:     case OP_OPEN_CHAR_CLASS:
                   1001:     case OP_CLOSE_CHAR_CLASS:
                   1002:       /* These must not be appeared here.  */
                   1003:       assert (0);
                   1004: #endif
                   1005:     case END_OF_RE:
                   1006:     case CHARACTER:
                   1007:     case OP_PERIOD:
                   1008:     case OP_DUP_ASTERISK:
                   1009:     case OP_DUP_QUESTION:
                   1010: #ifdef RE_ENABLE_I18N
                   1011:     case COMPLEX_BRACKET:
                   1012: #endif /* RE_ENABLE_I18N */
                   1013:     case SIMPLE_BRACKET:
                   1014:     case OP_BACK_REF:
                   1015:     case ANCHOR:
                   1016:     case OP_OPEN_SUBEXP:
                   1017:     case OP_CLOSE_SUBEXP:
                   1018:       node->first = idx;
                   1019:       break;
                   1020:     case OP_DUP_PLUS:
                   1021: #ifdef DEBUG
                   1022:       assert (node->left != NULL);
                   1023: #endif
                   1024:       if (node->left->first == -1)
                   1025:        calc_first (dfa, node->left);
                   1026:       node->first = node->left->first;
                   1027:       break;
                   1028:     case OP_ALT:
                   1029:       node->first = idx;
                   1030:       break;
                   1031:       /* else fall through */
                   1032:     default:
                   1033: #ifdef DEBUG
                   1034:       assert (node->left != NULL);
                   1035: #endif
                   1036:       if (node->left->first == -1)
                   1037:        calc_first (dfa, node->left);
                   1038:       node->first = node->left->first;
                   1039:       break;
                   1040:     }
                   1041: }
                   1042: 
                   1043: /* Calculate "next" for the node NODE.  */
                   1044: 
                   1045: static void
                   1046: calc_next (dfa, node)
                   1047:      re_dfa_t *dfa;
                   1048:      bin_tree_t *node;
                   1049: {
                   1050:   int idx, type;
                   1051:   bin_tree_t *parent = node->parent;
                   1052:   if (parent == NULL)
                   1053:     {
                   1054:       node->next = -1;
                   1055:       idx = node->node_idx;
                   1056:       if (node->type == 0)
                   1057:        dfa->nexts[idx] = node->next;
                   1058:       return;
                   1059:     }
                   1060: 
                   1061:   idx = parent->node_idx;
                   1062:   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
                   1063: 
                   1064:   switch (type)
                   1065:     {
                   1066:     case OP_DUP_ASTERISK:
                   1067:     case OP_DUP_PLUS:
                   1068:       node->next = idx;
                   1069:       break;
                   1070:     case CONCAT:
                   1071:       if (parent->left == node)
                   1072:        {
                   1073:          if (parent->right->first == -1)
                   1074:            calc_first (dfa, parent->right);
                   1075:          node->next = parent->right->first;
                   1076:          break;
                   1077:        }
                   1078:       /* else fall through */
                   1079:     default:
                   1080:       if (parent->next == -1)
                   1081:        calc_next (dfa, parent);
                   1082:       node->next = parent->next;
                   1083:       break;
                   1084:     }
                   1085:   idx = node->node_idx;
                   1086:   if (node->type == 0)
                   1087:     dfa->nexts[idx] = node->next;
                   1088: }
                   1089: 
                   1090: /* Calculate "edest" for the node NODE.  */
                   1091: 
                   1092: static void
                   1093: calc_epsdest (dfa, node)
                   1094:      re_dfa_t *dfa;
                   1095:      bin_tree_t *node;
                   1096: {
                   1097:   int idx;
                   1098:   idx = node->node_idx;
                   1099:   if (node->type == 0)
                   1100:     {
                   1101:       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
                   1102:          || dfa->nodes[idx].type == OP_DUP_PLUS
                   1103:          || dfa->nodes[idx].type == OP_DUP_QUESTION)
                   1104:        {
                   1105:          if (node->left->first == -1)
                   1106:            calc_first (dfa, node->left);
                   1107:          if (node->next == -1)
                   1108:            calc_next (dfa, node);
                   1109:          re_node_set_init_2 (dfa->edests + idx, node->left->first,
                   1110:                              node->next);
                   1111:        }
                   1112:       else if (dfa->nodes[idx].type == OP_ALT)
                   1113:        {
                   1114:          int left, right;
                   1115:          if (node->left != NULL)
                   1116:            {
                   1117:              if (node->left->first == -1)
                   1118:                calc_first (dfa, node->left);
                   1119:              left = node->left->first;
                   1120:            }
                   1121:          else
                   1122:            {
                   1123:              if (node->next == -1)
                   1124:                calc_next (dfa, node);
                   1125:              left = node->next;
                   1126:            }
                   1127:          if (node->right != NULL)
                   1128:            {
                   1129:              if (node->right->first == -1)
                   1130:                calc_first (dfa, node->right);
                   1131:              right = node->right->first;
                   1132:            }
                   1133:          else
                   1134:            {
                   1135:              if (node->next == -1)
                   1136:                calc_next (dfa, node);
                   1137:              right = node->next;
                   1138:            }
                   1139:          re_node_set_init_2 (dfa->edests + idx, left, right);
                   1140:        }
                   1141:       else if (dfa->nodes[idx].type == ANCHOR
                   1142:               || dfa->nodes[idx].type == OP_OPEN_SUBEXP
                   1143:               || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
                   1144:               || dfa->nodes[idx].type == OP_BACK_REF)
                   1145:        re_node_set_init_1 (dfa->edests + idx, node->next);
                   1146:     }
                   1147: }
                   1148: 
                   1149: /* Duplicate the epsilon closure of the node ROOT_NODE.
                   1150:    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
                   1151:    to their own constraint.  */
                   1152: 
                   1153: static reg_errcode_t
                   1154: duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
                   1155:                        init_constraint)
                   1156:      re_dfa_t *dfa;
                   1157:      int top_org_node, top_clone_node, root_node;
                   1158:      unsigned int init_constraint;
                   1159: {
                   1160:   reg_errcode_t err;
                   1161:   int org_node, clone_node, ret;
                   1162:   unsigned int constraint = init_constraint;
                   1163:   for (org_node = top_org_node, clone_node = top_clone_node;;)
                   1164:     {
                   1165:       int org_dest, clone_dest;
                   1166:       if (dfa->nodes[org_node].type == OP_BACK_REF)
                   1167:        {
                   1168:          /* If the back reference epsilon-transit, its destination must
                   1169:             also have the constraint.  Then duplicate the epsilon closure
                   1170:             of the destination of the back reference, and store it in
                   1171:             edests of the back reference.  */
                   1172:          org_dest = dfa->nexts[org_node];
                   1173:          re_node_set_empty (dfa->edests + clone_node);
                   1174:          err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
                   1175:          if (BE (err != REG_NOERROR, 0))
                   1176:            return err;
                   1177:          dfa->nexts[clone_node] = dfa->nexts[org_node];
                   1178:          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
                   1179:          if (BE (ret < 0, 0))
                   1180:            return REG_ESPACE;
                   1181:        }
                   1182:       else if (dfa->edests[org_node].nelem == 0)
                   1183:        {
                   1184:          /* In case of the node can't epsilon-transit, don't duplicate the
                   1185:             destination and store the original destination as the
                   1186:             destination of the node.  */
                   1187:          dfa->nexts[clone_node] = dfa->nexts[org_node];
                   1188:          break;
                   1189:        }
                   1190:       else if (dfa->edests[org_node].nelem == 1)
                   1191:        {
                   1192:          /* In case of the node can epsilon-transit, and it has only one
                   1193:             destination.  */
                   1194:          org_dest = dfa->edests[org_node].elems[0];
                   1195:          re_node_set_empty (dfa->edests + clone_node);
                   1196:          if (dfa->nodes[org_node].type == ANCHOR)
                   1197:            {
                   1198:              /* In case of the node has another constraint, append it.  */
                   1199:              if (org_node == root_node && clone_node != org_node)
                   1200:                {
                   1201:                  /* ...but if the node is root_node itself, it means the
                   1202:                     epsilon closure have a loop, then tie it to the
                   1203:                     destination of the root_node.  */
                   1204:                  ret = re_node_set_insert (dfa->edests + clone_node,
                   1205:                                            org_dest);
                   1206:                  if (BE (ret < 0, 0))
                   1207:                    return REG_ESPACE;
                   1208:                  break;
                   1209:                }
                   1210:              constraint |= dfa->nodes[org_node].opr.ctx_type;
                   1211:            }
                   1212:          err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
                   1213:          if (BE (err != REG_NOERROR, 0))
                   1214:            return err;
                   1215:          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
                   1216:          if (BE (ret < 0, 0))
                   1217:            return REG_ESPACE;
                   1218:        }
                   1219:       else /* dfa->edests[org_node].nelem == 2 */
                   1220:        {
                   1221:          /* In case of the node can epsilon-transit, and it has two
                   1222:             destinations. E.g. '|', '*', '+', '?'.   */
                   1223:          org_dest = dfa->edests[org_node].elems[0];
                   1224:          re_node_set_empty (dfa->edests + clone_node);
                   1225:          /* Search for a duplicated node which satisfies the constraint.  */
                   1226:          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
                   1227:          if (clone_dest == -1)
                   1228:            {
                   1229:              /* There are no such a duplicated node, create a new one.  */
                   1230:              err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
                   1231:              if (BE (err != REG_NOERROR, 0))
                   1232:                return err;
                   1233:              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
                   1234:              if (BE (ret < 0, 0))
                   1235:                return REG_ESPACE;
                   1236:              err = duplicate_node_closure (dfa, org_dest, clone_dest,
                   1237:                                            root_node, constraint);
                   1238:              if (BE (err != REG_NOERROR, 0))
                   1239:                return err;
                   1240:            }
                   1241:          else
                   1242:            {
                   1243:              /* There are a duplicated node which satisfy the constraint,
                   1244:                 use it to avoid infinite loop.  */
                   1245:              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
                   1246:              if (BE (ret < 0, 0))
                   1247:                return REG_ESPACE;
                   1248:            }
                   1249: 
                   1250:          org_dest = dfa->edests[org_node].elems[1];
                   1251:          err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
                   1252:          if (BE (err != REG_NOERROR, 0))
                   1253:            return err;
                   1254:          ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
                   1255:          if (BE (ret < 0, 0))
                   1256:            return REG_ESPACE;
                   1257:        }
                   1258:       org_node = org_dest;
                   1259:       clone_node = clone_dest;
                   1260:     }
                   1261:   return REG_NOERROR;
                   1262: }
                   1263: 
                   1264: /* Search for a node which is duplicated from the node ORG_NODE, and
                   1265:    satisfies the constraint CONSTRAINT.  */
                   1266: 
                   1267: static int
                   1268: search_duplicated_node (dfa, org_node, constraint)
                   1269:      re_dfa_t *dfa;
                   1270:      int org_node;
                   1271:      unsigned int constraint;
                   1272: {
                   1273:   int idx;
                   1274:   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
                   1275:     {
                   1276:       if (org_node == dfa->org_indices[idx]
                   1277:          && constraint == dfa->nodes[idx].constraint)
                   1278:        return idx; /* Found.  */
                   1279:     }
                   1280:   return -1; /* Not found.  */
                   1281: }
                   1282: 
                   1283: /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
                   1284:    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
                   1285:    otherwise return the error code.  */
                   1286: 
                   1287: static reg_errcode_t
                   1288: duplicate_node (new_idx, dfa, org_idx, constraint)
                   1289:      re_dfa_t *dfa;
                   1290:      int *new_idx, org_idx;
                   1291:      unsigned int constraint;
                   1292: {
                   1293:   re_token_t dup;
                   1294:   int dup_idx;
                   1295: 
                   1296:   dup = dfa->nodes[org_idx];
                   1297:   dup_idx = re_dfa_add_node (dfa, dup, 1);
                   1298:   if (BE (dup_idx == -1, 0))
                   1299:     return REG_ESPACE;
                   1300:   dfa->nodes[dup_idx].constraint = constraint;
                   1301:   if (dfa->nodes[org_idx].type == ANCHOR)
                   1302:     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
                   1303:   dfa->nodes[dup_idx].duplicated = 1;
                   1304:   re_node_set_init_empty (dfa->edests + dup_idx);
                   1305:   re_node_set_init_empty (dfa->eclosures + dup_idx);
                   1306:   re_node_set_init_empty (dfa->inveclosures + dup_idx);
                   1307: 
                   1308:   /* Store the index of the original node.  */
                   1309:   dfa->org_indices[dup_idx] = org_idx;
                   1310:   *new_idx = dup_idx;
                   1311:   return REG_NOERROR;
                   1312: }
                   1313: 
                   1314: static void
                   1315: calc_inveclosure (dfa)
                   1316:      re_dfa_t *dfa;
                   1317: {
                   1318:   int src, idx, dest;
                   1319:   for (src = 0; src < dfa->nodes_len; ++src)
                   1320:     {
                   1321:       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
                   1322:        {
                   1323:          dest = dfa->eclosures[src].elems[idx];
                   1324:          re_node_set_insert (dfa->inveclosures + dest, src);
                   1325:        }
                   1326:     }
                   1327: }
                   1328: 
                   1329: /* Calculate "eclosure" for all the node in DFA.  */
                   1330: 
                   1331: static reg_errcode_t
                   1332: calc_eclosure (dfa)
                   1333:      re_dfa_t *dfa;
                   1334: {
                   1335:   int node_idx, incomplete;
                   1336: #ifdef DEBUG
                   1337:   assert (dfa->nodes_len > 0);
                   1338: #endif
                   1339:   incomplete = 0;
                   1340:   /* For each nodes, calculate epsilon closure.  */
                   1341:   for (node_idx = 0; ; ++node_idx)
                   1342:     {
                   1343:       reg_errcode_t err;
                   1344:       re_node_set eclosure_elem;
                   1345:       if (node_idx == dfa->nodes_len)
                   1346:        {
                   1347:          if (!incomplete)
                   1348:            break;
                   1349:          incomplete = 0;
                   1350:          node_idx = 0;
                   1351:        }
                   1352: 
                   1353: #ifdef DEBUG
                   1354:       assert (dfa->eclosures[node_idx].nelem != -1);
                   1355: #endif
                   1356:       /* If we have already calculated, skip it.  */
                   1357:       if (dfa->eclosures[node_idx].nelem != 0)
                   1358:        continue;
                   1359:       /* Calculate epsilon closure of `node_idx'.  */
                   1360:       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
                   1361:       if (BE (err != REG_NOERROR, 0))
                   1362:        return err;
                   1363: 
                   1364:       if (dfa->eclosures[node_idx].nelem == 0)
                   1365:        {
                   1366:          incomplete = 1;
                   1367:          re_node_set_free (&eclosure_elem);
                   1368:        }
                   1369:     }
                   1370:   return REG_NOERROR;
                   1371: }
                   1372: 
                   1373: /* Calculate epsilon closure of NODE.  */
                   1374: 
                   1375: static reg_errcode_t
                   1376: calc_eclosure_iter (new_set, dfa, node, root)
                   1377:      re_node_set *new_set;
                   1378:      re_dfa_t *dfa;
                   1379:      int node, root;
                   1380: {
                   1381:   reg_errcode_t err;
                   1382:   unsigned int constraint;
                   1383:   int i, incomplete;
                   1384:   re_node_set eclosure;
                   1385:   incomplete = 0;
                   1386:   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
                   1387:   if (BE (err != REG_NOERROR, 0))
                   1388:     return err;
                   1389: 
                   1390:   /* This indicates that we are calculating this node now.
                   1391:      We reference this value to avoid infinite loop.  */
                   1392:   dfa->eclosures[node].nelem = -1;
                   1393: 
                   1394:   constraint = ((dfa->nodes[node].type == ANCHOR)
                   1395:                ? dfa->nodes[node].opr.ctx_type : 0);
                   1396:   /* If the current node has constraints, duplicate all nodes.
                   1397:      Since they must inherit the constraints.  */
                   1398:   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
                   1399:     {
                   1400:       int org_node, cur_node;
                   1401:       org_node = cur_node = node;
                   1402:       err = duplicate_node_closure (dfa, node, node, node, constraint);
                   1403:       if (BE (err != REG_NOERROR, 0))
                   1404:        return err;
                   1405:     }
                   1406: 
                   1407:   /* Expand each epsilon destination nodes.  */
                   1408:   if (IS_EPSILON_NODE(dfa->nodes[node].type))
                   1409:     for (i = 0; i < dfa->edests[node].nelem; ++i)
                   1410:       {
                   1411:        re_node_set eclosure_elem;
                   1412:        int edest = dfa->edests[node].elems[i];
                   1413:        /* If calculating the epsilon closure of `edest' is in progress,
                   1414:           return intermediate result.  */
                   1415:        if (dfa->eclosures[edest].nelem == -1)
                   1416:          {
                   1417:            incomplete = 1;
                   1418:            continue;
                   1419:          }
                   1420:        /* If we haven't calculated the epsilon closure of `edest' yet,
                   1421:           calculate now. Otherwise use calculated epsilon closure.  */
                   1422:        if (dfa->eclosures[edest].nelem == 0)
                   1423:          {
                   1424:            err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
                   1425:            if (BE (err != REG_NOERROR, 0))
                   1426:              return err;
                   1427:          }
                   1428:        else
                   1429:          eclosure_elem = dfa->eclosures[edest];
                   1430:        /* Merge the epsilon closure of `edest'.  */
                   1431:        re_node_set_merge (&eclosure, &eclosure_elem);
                   1432:        /* If the epsilon closure of `edest' is incomplete,
                   1433:           the epsilon closure of this node is also incomplete.  */
                   1434:        if (dfa->eclosures[edest].nelem == 0)
                   1435:          {
                   1436:            incomplete = 1;
                   1437:            re_node_set_free (&eclosure_elem);
                   1438:          }
                   1439:       }
                   1440: 
                   1441:   /* Epsilon closures include itself.  */
                   1442:   re_node_set_insert (&eclosure, node);
                   1443:   if (incomplete && !root)
                   1444:     dfa->eclosures[node].nelem = 0;
                   1445:   else
                   1446:     dfa->eclosures[node] = eclosure;
                   1447:   *new_set = eclosure;
                   1448:   return REG_NOERROR;
                   1449: }
                   1450: 
                   1451: /* Functions for token which are used in the parser.  */
                   1452: 
                   1453: /* Fetch a token from INPUT.
                   1454:    We must not use this function inside bracket expressions.  */
                   1455: 
                   1456: static re_token_t
                   1457: fetch_token (input, syntax)
                   1458:      re_string_t *input;
                   1459:      reg_syntax_t syntax;
                   1460: {
                   1461:   re_token_t token;
                   1462:   int consumed_byte;
                   1463:   consumed_byte = peek_token (&token, input, syntax);
                   1464:   re_string_skip_bytes (input, consumed_byte);
                   1465:   return token;
                   1466: }
                   1467: 
                   1468: /* Peek a token from INPUT, and return the length of the token.
                   1469:    We must not use this function inside bracket expressions.  */
                   1470: 
                   1471: static int
                   1472: peek_token (token, input, syntax)
                   1473:      re_token_t *token;
                   1474:      re_string_t *input;
                   1475:      reg_syntax_t syntax;
                   1476: {
                   1477:   unsigned char c;
                   1478: 
                   1479:   if (re_string_eoi (input))
                   1480:     {
                   1481:       token->type = END_OF_RE;
                   1482:       return 0;
                   1483:     }
                   1484: 
                   1485:   c = re_string_peek_byte (input, 0);
                   1486:   token->opr.c = c;
                   1487: 
                   1488: #ifdef RE_ENABLE_I18N
                   1489:   token->mb_partial = 0;
                   1490:   if (MB_CUR_MAX > 1 &&
                   1491:       !re_string_first_byte (input, re_string_cur_idx (input)))
                   1492:     {
                   1493:       token->type = CHARACTER;
                   1494:       token->mb_partial = 1;
                   1495:       return 1;
                   1496:     }
                   1497: #endif
                   1498:   if (c == '\\')
                   1499:     {
                   1500:       unsigned char c2;
                   1501:       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
                   1502:        {
                   1503:          token->type = BACK_SLASH;
                   1504:          return 1;
                   1505:        }
                   1506: 
                   1507:       c2 = re_string_peek_byte_case (input, 1);
                   1508:       token->opr.c = c2;
                   1509:       token->type = CHARACTER;
                   1510:       switch (c2)
                   1511:        {
                   1512:        case '|':
                   1513:          if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
                   1514:            token->type = OP_ALT;
                   1515:          break;
                   1516:        case '1': case '2': case '3': case '4': case '5':
                   1517:        case '6': case '7': case '8': case '9':
                   1518:          if (!(syntax & RE_NO_BK_REFS))
                   1519:            {
                   1520:              token->type = OP_BACK_REF;
                   1521:              token->opr.idx = c2 - '0';
                   1522:            }
                   1523:          break;
                   1524:        case '<':
                   1525:          if (!(syntax & RE_NO_GNU_OPS))
                   1526:            {
                   1527:              token->type = ANCHOR;
                   1528:              token->opr.idx = WORD_FIRST;
                   1529:            }
                   1530:          break;
                   1531:        case '>':
                   1532:          if (!(syntax & RE_NO_GNU_OPS))
                   1533:            {
                   1534:              token->type = ANCHOR;
                   1535:              token->opr.idx = WORD_LAST;
                   1536:            }
                   1537:          break;
                   1538:        case 'b':
                   1539:          if (!(syntax & RE_NO_GNU_OPS))
                   1540:            {
                   1541:              token->type = ANCHOR;
                   1542:              token->opr.idx = WORD_DELIM;
                   1543:            }
                   1544:          break;
                   1545:        case 'B':
                   1546:          if (!(syntax & RE_NO_GNU_OPS))
                   1547:            {
                   1548:              token->type = ANCHOR;
                   1549:              token->opr.idx = INSIDE_WORD;
                   1550:            }
                   1551:          break;
                   1552:        case 'w':
                   1553:          if (!(syntax & RE_NO_GNU_OPS))
                   1554:            token->type = OP_WORD;
                   1555:          break;
                   1556:        case 'W':
                   1557:          if (!(syntax & RE_NO_GNU_OPS))
                   1558:            token->type = OP_NOTWORD;
                   1559:          break;
                   1560:        case '`':
                   1561:          if (!(syntax & RE_NO_GNU_OPS))
                   1562:            {
                   1563:              token->type = ANCHOR;
                   1564:              token->opr.idx = BUF_FIRST;
                   1565:            }
                   1566:          break;
                   1567:        case '\'':
                   1568:          if (!(syntax & RE_NO_GNU_OPS))
                   1569:            {
                   1570:              token->type = ANCHOR;
                   1571:              token->opr.idx = BUF_LAST;
                   1572:            }
                   1573:          break;
                   1574:        case '(':
                   1575:          if (!(syntax & RE_NO_BK_PARENS))
                   1576:            token->type = OP_OPEN_SUBEXP;
                   1577:          break;
                   1578:        case ')':
                   1579:          if (!(syntax & RE_NO_BK_PARENS))
                   1580:            token->type = OP_CLOSE_SUBEXP;
                   1581:          break;
                   1582:        case '+':
                   1583:          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
                   1584:            token->type = OP_DUP_PLUS;
                   1585:          break;
                   1586:        case '?':
                   1587:          if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
                   1588:            token->type = OP_DUP_QUESTION;
                   1589:          break;
                   1590:        case '{':
                   1591:          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
                   1592:            token->type = OP_OPEN_DUP_NUM;
                   1593:          break;
                   1594:        case '}':
                   1595:          if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
                   1596:            token->type = OP_CLOSE_DUP_NUM;
                   1597:          break;
                   1598:        default:
                   1599:          break;
                   1600:        }
                   1601:       return 2;
                   1602:     }
                   1603: 
                   1604:   token->type = CHARACTER;
                   1605:   switch (c)
                   1606:     {
                   1607:     case '\n':
                   1608:       if (syntax & RE_NEWLINE_ALT)
                   1609:        token->type = OP_ALT;
                   1610:       break;
                   1611:     case '|':
                   1612:       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
                   1613:        token->type = OP_ALT;
                   1614:       break;
                   1615:     case '*':
                   1616:       token->type = OP_DUP_ASTERISK;
                   1617:       break;
                   1618:     case '+':
                   1619:       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
                   1620:        token->type = OP_DUP_PLUS;
                   1621:       break;
                   1622:     case '?':
                   1623:       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
                   1624:        token->type = OP_DUP_QUESTION;
                   1625:       break;
                   1626:     case '{':
                   1627:       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
                   1628:        token->type = OP_OPEN_DUP_NUM;
                   1629:       break;
                   1630:     case '}':
                   1631:       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
                   1632:        token->type = OP_CLOSE_DUP_NUM;
                   1633:       break;
                   1634:     case '(':
                   1635:       if (syntax & RE_NO_BK_PARENS)
                   1636:        token->type = OP_OPEN_SUBEXP;
                   1637:       break;
                   1638:     case ')':
                   1639:       if (syntax & RE_NO_BK_PARENS)
                   1640:        token->type = OP_CLOSE_SUBEXP;
                   1641:       break;
                   1642:     case '[':
                   1643:       token->type = OP_OPEN_BRACKET;
                   1644:       break;
                   1645:     case '.':
                   1646:       token->type = OP_PERIOD;
                   1647:       break;
                   1648:     case '^':
                   1649:       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
                   1650:          re_string_cur_idx (input) != 0)
                   1651:        {
                   1652:          char prev = re_string_peek_byte (input, -1);
                   1653:          if (prev != '|' && prev != '(' &&
                   1654:              (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
                   1655:            break;
                   1656:        }
                   1657:       token->type = ANCHOR;
                   1658:       token->opr.idx = LINE_FIRST;
                   1659:       break;
                   1660:     case '$':
                   1661:       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
                   1662:          re_string_cur_idx (input) + 1 != re_string_length (input))
                   1663:        {
                   1664:          re_token_t next;
                   1665:          re_string_skip_bytes (input, 1);
                   1666:          peek_token (&next, input, syntax);
                   1667:          re_string_skip_bytes (input, -1);
                   1668:          if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
                   1669:            break;
                   1670:        }
                   1671:       token->type = ANCHOR;
                   1672:       token->opr.idx = LINE_LAST;
                   1673:       break;
                   1674:     default:
                   1675:       break;
                   1676:     }
                   1677:   return 1;
                   1678: }
                   1679: 
                   1680: /* Peek a token from INPUT, and return the length of the token.
                   1681:    We must not use this function out of bracket expressions.  */
                   1682: 
                   1683: static int
                   1684: peek_token_bracket (token, input, syntax)
                   1685:      re_token_t *token;
                   1686:      re_string_t *input;
                   1687:      reg_syntax_t syntax;
                   1688: {
                   1689:   unsigned char c;
                   1690:   if (re_string_eoi (input))
                   1691:     {
                   1692:       token->type = END_OF_RE;
                   1693:       return 0;
                   1694:     }
                   1695:   c = re_string_peek_byte (input, 0);
                   1696:   token->opr.c = c;
                   1697: 
                   1698: #ifdef RE_ENABLE_I18N
                   1699:   if (MB_CUR_MAX > 1 &&
                   1700:       !re_string_first_byte (input, re_string_cur_idx (input)))
                   1701:     {
                   1702:       token->type = CHARACTER;
                   1703:       return 1;
                   1704:     }
                   1705: #endif /* RE_ENABLE_I18N */
                   1706: 
                   1707:   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
                   1708:     {
                   1709:       /* In this case, '\' escape a character.  */
                   1710:       unsigned char c2;
                   1711:       re_string_skip_bytes (input, 1);
                   1712:       c2 = re_string_peek_byte (input, 0);
                   1713:       token->opr.c = c2;
                   1714:       token->type = CHARACTER;
                   1715:       return 1;
                   1716:     }
                   1717:   if (c == '[') /* '[' is a special char in a bracket exps.  */
                   1718:     {
                   1719:       unsigned char c2;
                   1720:       int token_len;
                   1721:       c2 = re_string_peek_byte (input, 1);
                   1722:       token->opr.c = c2;
                   1723:       token_len = 2;
                   1724:       switch (c2)
                   1725:        {
                   1726:        case '.':
                   1727:          token->type = OP_OPEN_COLL_ELEM;
                   1728:          break;
                   1729:        case '=':
                   1730:          token->type = OP_OPEN_EQUIV_CLASS;
                   1731:          break;
                   1732:        case ':':
                   1733:          if (syntax & RE_CHAR_CLASSES)
                   1734:            {
                   1735:              token->type = OP_OPEN_CHAR_CLASS;
                   1736:              break;
                   1737:            }
                   1738:          /* else fall through.  */
                   1739:        default:
                   1740:          token->type = CHARACTER;
                   1741:          token->opr.c = c;
                   1742:          token_len = 1;
                   1743:          break;
                   1744:        }
                   1745:       return token_len;
                   1746:     }
                   1747:   switch (c)
                   1748:     {
                   1749:     case '-':
                   1750:       token->type = OP_CHARSET_RANGE;
                   1751:       break;
                   1752:     case ']':
                   1753:       token->type = OP_CLOSE_BRACKET;
                   1754:       break;
                   1755:     case '^':
                   1756:       token->type = OP_NON_MATCH_LIST;
                   1757:       break;
                   1758:     default:
                   1759:       token->type = CHARACTER;
                   1760:     }
                   1761:   return 1;
                   1762: }
                   1763: 
                   1764: /* Functions for parser.  */
                   1765: 
                   1766: /* Entry point of the parser.
                   1767:    Parse the regular expression REGEXP and return the structure tree.
                   1768:    If an error is occured, ERR is set by error code, and return NULL.
                   1769:    This function build the following tree, from regular expression <reg_exp>:
                   1770:           CAT
                   1771:           / \
                   1772:          /   \
                   1773:    <reg_exp>  EOR
                   1774: 
                   1775:    CAT means concatenation.
                   1776:    EOR means end of regular expression.  */
                   1777: 
                   1778: static bin_tree_t *
                   1779: parse (regexp, preg, syntax, err)
                   1780:      re_string_t *regexp;
                   1781:      regex_t *preg;
                   1782:      reg_syntax_t syntax;
                   1783:      reg_errcode_t *err;
                   1784: {
                   1785:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                   1786:   bin_tree_t *tree, *eor, *root;
                   1787:   re_token_t current_token;
                   1788:   int new_idx;
                   1789:   current_token = fetch_token (regexp, syntax);
                   1790:   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
                   1791:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   1792:     return NULL;
                   1793:   new_idx = re_dfa_add_node (dfa, current_token, 0);
                   1794:   eor = create_tree (NULL, NULL, 0, new_idx);
                   1795:   if (tree != NULL)
                   1796:     root = create_tree (tree, eor, CONCAT, 0);
                   1797:   else
                   1798:     root = eor;
                   1799:   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
                   1800:     {
                   1801:       *err = REG_ESPACE;
                   1802:       return NULL;
                   1803:     }
                   1804:   return root;
                   1805: }
                   1806: 
                   1807: /* This function build the following tree, from regular expression
                   1808:    <branch1>|<branch2>:
                   1809:           ALT
                   1810:           / \
                   1811:          /   \
                   1812:    <branch1> <branch2>
                   1813: 
                   1814:    ALT means alternative, which represents the operator `|'.  */
                   1815: 
                   1816: static bin_tree_t *
                   1817: parse_reg_exp (regexp, preg, token, syntax, nest, err)
                   1818:      re_string_t *regexp;
                   1819:      regex_t *preg;
                   1820:      re_token_t *token;
                   1821:      reg_syntax_t syntax;
                   1822:      int nest;
                   1823:      reg_errcode_t *err;
                   1824: {
                   1825:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                   1826:   bin_tree_t *tree, *branch = NULL;
                   1827:   int new_idx;
                   1828:   tree = parse_branch (regexp, preg, token, syntax, nest, err);
                   1829:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   1830:     return NULL;
                   1831: 
                   1832:   while (token->type == OP_ALT)
                   1833:     {
                   1834:       re_token_t alt_token = *token;
                   1835:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
                   1836:       *token = fetch_token (regexp, syntax);
                   1837:       if (token->type != OP_ALT && token->type != END_OF_RE
                   1838:          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
                   1839:        {
                   1840:          branch = parse_branch (regexp, preg, token, syntax, nest, err);
                   1841:          if (BE (*err != REG_NOERROR && branch == NULL, 0))
                   1842:            {
                   1843:              free_bin_tree (tree);
                   1844:              return NULL;
                   1845:            }
                   1846:        }
                   1847:       else
                   1848:        branch = NULL;
                   1849:       tree = create_tree (tree, branch, 0, new_idx);
                   1850:       if (BE (new_idx == -1 || tree == NULL, 0))
                   1851:        {
                   1852:          *err = REG_ESPACE;
                   1853:          return NULL;
                   1854:        }
                   1855:       dfa->has_plural_match = 1;
                   1856:     }
                   1857:   return tree;
                   1858: }
                   1859: 
                   1860: /* This function build the following tree, from regular expression
                   1861:    <exp1><exp2>:
                   1862:        CAT
                   1863:        / \
                   1864:        /   \
                   1865:    <exp1> <exp2>
                   1866: 
                   1867:    CAT means concatenation.  */
                   1868: 
                   1869: static bin_tree_t *
                   1870: parse_branch (regexp, preg, token, syntax, nest, err)
                   1871:      re_string_t *regexp;
                   1872:      regex_t *preg;
                   1873:      re_token_t *token;
                   1874:      reg_syntax_t syntax;
                   1875:      int nest;
                   1876:      reg_errcode_t *err;
                   1877: {
                   1878:   bin_tree_t *tree, *exp;
                   1879:   tree = parse_expression (regexp, preg, token, syntax, nest, err);
                   1880:   if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   1881:     return NULL;
                   1882: 
                   1883:   while (token->type != OP_ALT && token->type != END_OF_RE
                   1884:         && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
                   1885:     {
                   1886:       exp = parse_expression (regexp, preg, token, syntax, nest, err);
                   1887:       if (BE (*err != REG_NOERROR && exp == NULL, 0))
                   1888:        {
                   1889:          free_bin_tree (tree);
                   1890:          return NULL;
                   1891:        }
                   1892:       if (tree != NULL && exp != NULL)
                   1893:        {
                   1894:          tree = create_tree (tree, exp, CONCAT, 0);
                   1895:          if (tree == NULL)
                   1896:            {
                   1897:              *err = REG_ESPACE;
                   1898:              return NULL;
                   1899:            }
                   1900:        }
                   1901:       else if (tree == NULL)
                   1902:        tree = exp;
                   1903:       /* Otherwise exp == NULL, we don't need to create new tree.  */
                   1904:     }
                   1905:   return tree;
                   1906: }
                   1907: 
                   1908: /* This function build the following tree, from regular expression a*:
                   1909:         *
                   1910:         |
                   1911:         a
                   1912: */
                   1913: 
                   1914: static bin_tree_t *
                   1915: parse_expression (regexp, preg, token, syntax, nest, err)
                   1916:      re_string_t *regexp;
                   1917:      regex_t *preg;
                   1918:      re_token_t *token;
                   1919:      reg_syntax_t syntax;
                   1920:      int nest;
                   1921:      reg_errcode_t *err;
                   1922: {
                   1923:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                   1924:   bin_tree_t *tree;
                   1925:   int new_idx;
                   1926:   switch (token->type)
                   1927:     {
                   1928:     case CHARACTER:
                   1929:       new_idx = re_dfa_add_node (dfa, *token, 0);
                   1930:       tree = create_tree (NULL, NULL, 0, new_idx);
                   1931:       if (BE (new_idx == -1 || tree == NULL, 0))
                   1932:        {
                   1933:          *err = REG_ESPACE;
                   1934:          return NULL;
                   1935:        }
                   1936: #ifdef RE_ENABLE_I18N
                   1937:       if (MB_CUR_MAX > 1)
                   1938:        {
                   1939:          while (!re_string_eoi (regexp)
                   1940:                 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
                   1941:            {
                   1942:              bin_tree_t *mbc_remain;
                   1943:              *token = fetch_token (regexp, syntax);
                   1944:              new_idx = re_dfa_add_node (dfa, *token, 0);
                   1945:              mbc_remain = create_tree (NULL, NULL, 0, new_idx);
                   1946:              tree = create_tree (tree, mbc_remain, CONCAT, 0);
                   1947:              if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
                   1948:                {
                   1949:                  *err = REG_ESPACE;
                   1950:                  return NULL;
                   1951:                }
                   1952:            }
                   1953:        }
                   1954: #endif
                   1955:       break;
                   1956:     case OP_OPEN_SUBEXP:
                   1957:       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
                   1958:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   1959:        return NULL;
                   1960:       break;
                   1961:     case OP_OPEN_BRACKET:
                   1962:       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
                   1963:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   1964:        return NULL;
                   1965:       break;
                   1966:     case OP_BACK_REF:
                   1967:       if (BE (preg->re_nsub < token->opr.idx
                   1968:              || dfa->subexps[token->opr.idx - 1].end == -1, 0))
                   1969:        {
                   1970:          *err = REG_ESUBREG;
                   1971:          return NULL;
                   1972:        }
                   1973:       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
                   1974:       new_idx = re_dfa_add_node (dfa, *token, 0);
                   1975:       tree = create_tree (NULL, NULL, 0, new_idx);
                   1976:       if (BE (new_idx == -1 || tree == NULL, 0))
                   1977:        {
                   1978:          *err = REG_ESPACE;
                   1979:          return NULL;
                   1980:        }
                   1981:       ++dfa->nbackref;
                   1982:       dfa->has_mb_node = 1;
                   1983:       break;
                   1984:     case OP_DUP_ASTERISK:
                   1985:     case OP_DUP_PLUS:
                   1986:     case OP_DUP_QUESTION:
                   1987:     case OP_OPEN_DUP_NUM:
                   1988:       if (syntax & RE_CONTEXT_INVALID_OPS)
                   1989:        {
                   1990:          *err = REG_BADRPT;
                   1991:          return NULL;
                   1992:        }
                   1993:       else if (syntax & RE_CONTEXT_INDEP_OPS)
                   1994:        {
                   1995:          *token = fetch_token (regexp, syntax);
                   1996:          return parse_expression (regexp, preg, token, syntax, nest, err);
                   1997:        }
                   1998:       /* else fall through  */
                   1999:     case OP_CLOSE_SUBEXP:
                   2000:       if ((token->type == OP_CLOSE_SUBEXP) &&
                   2001:          !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
                   2002:        {
                   2003:          *err = REG_ERPAREN;
                   2004:          return NULL;
                   2005:        }
                   2006:       /* else fall through  */
                   2007:     case OP_CLOSE_DUP_NUM:
                   2008:       /* We treat it as a normal character.  */
                   2009: 
                   2010:       /* Then we can these characters as normal characters.  */
                   2011:       token->type = CHARACTER;
                   2012:       new_idx = re_dfa_add_node (dfa, *token, 0);
                   2013:       tree = create_tree (NULL, NULL, 0, new_idx);
                   2014:       if (BE (new_idx == -1 || tree == NULL, 0))
                   2015:        {
                   2016:          *err = REG_ESPACE;
                   2017:          return NULL;
                   2018:        }
                   2019:       break;
                   2020:     case ANCHOR:
                   2021:       if (dfa->word_char == NULL)
                   2022:        {
                   2023:          *err = init_word_char (dfa);
                   2024:          if (BE (*err != REG_NOERROR, 0))
                   2025:            return NULL;
                   2026:        }
                   2027:       if (token->opr.ctx_type == WORD_DELIM)
                   2028:        {
                   2029:          bin_tree_t *tree_first, *tree_last;
                   2030:          int idx_first, idx_last;
                   2031:          token->opr.ctx_type = WORD_FIRST;
                   2032:          idx_first = re_dfa_add_node (dfa, *token, 0);
                   2033:          tree_first = create_tree (NULL, NULL, 0, idx_first);
                   2034:          token->opr.ctx_type = WORD_LAST;
                   2035:          idx_last = re_dfa_add_node (dfa, *token, 0);
                   2036:          tree_last = create_tree (NULL, NULL, 0, idx_last);
                   2037:          token->type = OP_ALT;
                   2038:          new_idx = re_dfa_add_node (dfa, *token, 0);
                   2039:          tree = create_tree (tree_first, tree_last, 0, new_idx);
                   2040:          if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
                   2041:                  || tree_first == NULL || tree_last == NULL
                   2042:                  || tree == NULL, 0))
                   2043:            {
                   2044:              *err = REG_ESPACE;
                   2045:              return NULL;
                   2046:            }
                   2047:        }
                   2048:       else
                   2049:        {
                   2050:          new_idx = re_dfa_add_node (dfa, *token, 0);
                   2051:          tree = create_tree (NULL, NULL, 0, new_idx);
                   2052:          if (BE (new_idx == -1 || tree == NULL, 0))
                   2053:            {
                   2054:              *err = REG_ESPACE;
                   2055:              return NULL;
                   2056:            }
                   2057:        }
                   2058:       /* We must return here, since ANCHORs can't be followed
                   2059:         by repetition operators.
                   2060:         eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
                   2061:             it must not be "<ANCHOR(^)><REPEAT(*)>".  */
                   2062:       *token = fetch_token (regexp, syntax);
                   2063:       return tree;
                   2064:     case OP_PERIOD:
                   2065:       new_idx = re_dfa_add_node (dfa, *token, 0);
                   2066:       tree = create_tree (NULL, NULL, 0, new_idx);
                   2067:       if (BE (new_idx == -1 || tree == NULL, 0))
                   2068:        {
                   2069:          *err = REG_ESPACE;
                   2070:          return NULL;
                   2071:        }
                   2072:       if (MB_CUR_MAX > 1)
                   2073:        dfa->has_mb_node = 1;
                   2074:       break;
                   2075:     case OP_WORD:
                   2076:       tree = build_word_op (dfa, 0, err);
                   2077:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   2078:        return NULL;
                   2079:       break;
                   2080:     case OP_NOTWORD:
                   2081:       tree = build_word_op (dfa, 1, err);
                   2082:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   2083:        return NULL;
                   2084:       break;
                   2085:     case OP_ALT:
                   2086:     case END_OF_RE:
                   2087:       return NULL;
                   2088:     case BACK_SLASH:
                   2089:       *err = REG_EESCAPE;
                   2090:       return NULL;
                   2091:     default:
                   2092:       /* Must not happen?  */
                   2093: #ifdef DEBUG
                   2094:       assert (0);
                   2095: #endif
                   2096:       return NULL;
                   2097:     }
                   2098:   *token = fetch_token (regexp, syntax);
                   2099: 
                   2100:   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
                   2101:         || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
                   2102:     {
                   2103:       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
                   2104:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   2105:        return NULL;
                   2106:       dfa->has_plural_match = 1;
                   2107:     }
                   2108: 
                   2109:   return tree;
                   2110: }
                   2111: 
                   2112: /* This function build the following tree, from regular expression
                   2113:    (<reg_exp>):
                   2114:         SUBEXP
                   2115:            |
                   2116:        <reg_exp>
                   2117: */
                   2118: 
                   2119: static bin_tree_t *
                   2120: parse_sub_exp (regexp, preg, token, syntax, nest, err)
                   2121:      re_string_t *regexp;
                   2122:      regex_t *preg;
                   2123:      re_token_t *token;
                   2124:      reg_syntax_t syntax;
                   2125:      int nest;
                   2126:      reg_errcode_t *err;
                   2127: {
                   2128:   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
                   2129:   bin_tree_t *tree, *left_par, *right_par;
                   2130:   size_t cur_nsub;
                   2131:   int new_idx;
                   2132:   cur_nsub = preg->re_nsub++;
                   2133:   if (dfa->subexps_alloc < preg->re_nsub)
                   2134:     {
                   2135:       re_subexp_t *new_array;
                   2136:       dfa->subexps_alloc *= 2;
                   2137:       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
                   2138:       if (BE (new_array == NULL, 0))
                   2139:        {
                   2140:          dfa->subexps_alloc /= 2;
                   2141:          *err = REG_ESPACE;
                   2142:          return NULL;
                   2143:        }
                   2144:       dfa->subexps = new_array;
                   2145:     }
                   2146:   dfa->subexps[cur_nsub].start = dfa->nodes_len;
                   2147:   dfa->subexps[cur_nsub].end = -1;
                   2148: 
                   2149:   new_idx = re_dfa_add_node (dfa, *token, 0);
                   2150:   left_par = create_tree (NULL, NULL, 0, new_idx);
                   2151:   if (BE (new_idx == -1 || left_par == NULL, 0))
                   2152:     {
                   2153:       *err = REG_ESPACE;
                   2154:       return NULL;
                   2155:     }
                   2156:   dfa->nodes[new_idx].opr.idx = cur_nsub;
                   2157:   *token = fetch_token (regexp, syntax);
                   2158: 
                   2159:   /* The subexpression may be a null string.  */
                   2160:   if (token->type == OP_CLOSE_SUBEXP)
                   2161:     tree = NULL;
                   2162:   else
                   2163:     {
                   2164:       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
                   2165:       if (BE (*err != REG_NOERROR && tree == NULL, 0))
                   2166:        return NULL;
                   2167:     }
                   2168:   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
                   2169:     {
                   2170:       free_bin_tree (tree);
                   2171:       *err = REG_BADPAT;
                   2172:       return NULL;
                   2173:     }
                   2174:   new_idx = re_dfa_add_node (dfa, *token, 0);
                   2175:   dfa->subexps[cur_nsub].end = dfa->nodes_len;
                   2176:   right_par = create_tree (NULL, NULL, 0, new_idx);
                   2177:   tree = ((tree == NULL) ? right_par
                   2178:          : create_tree (tree, right_par, CONCAT, 0));
                   2179:   tree = create_tree (left_par, tree, CONCAT, 0);
                   2180:   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
                   2181:     {
                   2182:       *err = REG_ESPACE;
                   2183:       return NULL;
                   2184:     }
                   2185:   dfa->nodes[new_idx].opr.idx = cur_nsub;
                   2186: 
                   2187:   return tree;
                   2188: }
                   2189: 
                   2190: /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
                   2191: 
                   2192: static bin_tree_t *
                   2193: parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
                   2194:      bin_tree_t *dup_elem;
                   2195:      re_string_t *regexp;
                   2196:      re_dfa_t *dfa;
                   2197:      re_token_t *token;
                   2198:      reg_syntax_t syntax;
                   2199:      reg_errcode_t *err;
                   2200: {
                   2201:   re_token_t dup_token;
                   2202:   bin_tree_t *tree = dup_elem, *work_tree;
                   2203:   int new_idx, start_idx = re_string_cur_idx (regexp);
                   2204:   re_token_t start_token = *token;
                   2205:   if (token->type == OP_OPEN_DUP_NUM)
                   2206:     {
                   2207:       int i;
                   2208:       int end = 0;
                   2209:       int start = fetch_number (regexp, token, syntax);
                   2210:       bin_tree_t *elem;
                   2211:       if (start == -1)
                   2212:        {
                   2213:          if (token->type == CHARACTER && token->opr.c == ',')
                   2214:            start = 0; /* We treat "{,m}" as "{0,m}".  */
                   2215:          else
                   2216:            {
                   2217:              *err = REG_BADBR; /* <re>{} is invalid.  */
                   2218:              return NULL;
                   2219:            }
                   2220:        }
                   2221:       if (BE (start != -2, 1))
                   2222:        {
                   2223:          /* We treat "{n}" as "{n,n}".  */
                   2224:          end = ((token->type == OP_CLOSE_DUP_NUM) ? start
                   2225:                 : ((token->type == CHARACTER && token->opr.c == ',')
                   2226:                    ? fetch_number (regexp, token, syntax) : -2));
                   2227:        }
                   2228:       if (BE (start == -2 || end == -2, 0))
                   2229:        {
                   2230:          /* Invalid sequence.  */
                   2231:          if (token->type == OP_CLOSE_DUP_NUM)
                   2232:            goto parse_dup_op_invalid_interval;
                   2233:          else
                   2234:            goto parse_dup_op_ebrace;
                   2235:        }
                   2236:       if (BE (start == 0 && end == 0, 0))
                   2237:        {
                   2238:          /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
                   2239:          *token = fetch_token (regexp, syntax);
                   2240:          free_bin_tree (dup_elem);
                   2241:          return NULL;
                   2242:        }
                   2243: 
                   2244:       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
                   2245:       elem = tree;
                   2246:       for (i = 0; i < start; ++i)
                   2247:        if (i != 0)
                   2248:          {
                   2249:            work_tree = duplicate_tree (elem, dfa);
                   2250:            tree = create_tree (tree, work_tree, CONCAT, 0);
                   2251:            if (BE (work_tree == NULL || tree == NULL, 0))
                   2252:              goto parse_dup_op_espace;
                   2253:          }
                   2254: 
                   2255:       if (end == -1)
                   2256:        {
                   2257:          /* We treat "<re>{0,}" as "<re>*".  */
                   2258:          dup_token.type = OP_DUP_ASTERISK;
                   2259:          if (start > 0)
                   2260:            {
                   2261:              elem = duplicate_tree (elem, dfa);
                   2262:              new_idx = re_dfa_add_node (dfa, dup_token, 0);
                   2263:              work_tree = create_tree (elem, NULL, 0, new_idx);
                   2264:              tree = create_tree (tree, work_tree, CONCAT, 0);
                   2265:              if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
                   2266:                      || tree == NULL, 0))
                   2267:                goto parse_dup_op_espace;
                   2268:            }
                   2269:          else
                   2270:            {
                   2271:              new_idx = re_dfa_add_node (dfa, dup_token, 0);
                   2272:              tree = create_tree (elem, NULL, 0, new_idx);
                   2273:              if (BE (new_idx == -1 || tree == NULL, 0))
                   2274:                goto parse_dup_op_espace;
                   2275:            }
                   2276:        }
                   2277:       else if (end - start > 0)
                   2278:        {
                   2279:          /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
                   2280:          dup_token.type = OP_DUP_QUESTION;
                   2281:          if (start > 0)
                   2282:            {
                   2283:              elem = duplicate_tree (elem, dfa);
                   2284:              new_idx = re_dfa_add_node (dfa, dup_token, 0);
                   2285:              elem = create_tree (elem, NULL, 0, new_idx);
                   2286:              tree = create_tree (tree, elem, CONCAT, 0);
                   2287:              if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
                   2288:                goto parse_dup_op_espace;
                   2289:            }
                   2290:          else
                   2291:            {
                   2292:              new_idx = re_dfa_add_node (dfa, dup_token, 0);
                   2293:              tree = elem = create_tree (elem, NULL, 0, new_idx);
                   2294:              if (BE (new_idx == -1 || tree == NULL, 0))
                   2295:                goto parse_dup_op_espace;
                   2296:            }
                   2297:          for (i = 1; i < end - start; ++i)
                   2298:            {
                   2299:              work_tree = duplicate_tree (elem, dfa);
                   2300:              tree = create_tree (tree, work_tree, CONCAT, 0);
                   2301:              if (BE (work_tree == NULL || tree == NULL, 0))
                   2302:                {
                   2303:                  *err = REG_ESPACE;
                   2304:                  return NULL;
                   2305:                }
                   2306:            }
                   2307:        }
                   2308:     }
                   2309:   else
                   2310:     {
                   2311:       new_idx = re_dfa_add_node (dfa, *token, 0);
                   2312:       tree = create_tree (tree, NULL, 0, new_idx);
                   2313:       if (BE (new_idx == -1 || tree == NULL, 0))
                   2314:        {
                   2315:          *err = REG_ESPACE;
                   2316:          return NULL;
                   2317:        }
                   2318:     }
                   2319:   *token = fetch_token (regexp, syntax);
                   2320:   return tree;
                   2321: 
                   2322:  parse_dup_op_espace:
                   2323:   free_bin_tree (tree);
                   2324:   *err = REG_ESPACE;
                   2325:   return NULL;
                   2326: 
                   2327:  parse_dup_op_ebrace:
                   2328:   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
                   2329:     {
                   2330:       *err = REG_EBRACE;
                   2331:       return NULL;
                   2332:     }
                   2333:   goto parse_dup_op_rollback;
                   2334:  parse_dup_op_invalid_interval:
                   2335:   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
                   2336:     {
                   2337:       *err = REG_BADBR;
                   2338:       return NULL;
                   2339:     }
                   2340:  parse_dup_op_rollback:
                   2341:   re_string_set_index (regexp, start_idx);
                   2342:   *token = start_token;
                   2343:   token->type = CHARACTER;
                   2344:   return dup_elem;
                   2345: }
                   2346: 
                   2347: /* Size of the names for collating symbol/equivalence_class/character_class.
                   2348:    I'm not sure, but maybe enough.  */
                   2349: #define BRACKET_NAME_BUF_SIZE 32
                   2350: 
                   2351: #ifndef _LIBC
                   2352:   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
                   2353:      Build the range expression which starts from START_ELEM, and ends
                   2354:      at END_ELEM.  The result are written to MBCSET and SBCSET.
                   2355:      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
                   2356:      mbcset->range_ends, is a pointer argument sinse we may
                   2357:      update it.  */
                   2358: 
                   2359: static reg_errcode_t
                   2360: # ifdef RE_ENABLE_I18N
                   2361: build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
                   2362:      re_charset_t *mbcset;
                   2363:      int *range_alloc;
                   2364: # else /* not RE_ENABLE_I18N */
                   2365: build_range_exp (sbcset, start_elem, end_elem)
                   2366: # endif /* not RE_ENABLE_I18N */
                   2367:      re_bitset_ptr_t sbcset;
                   2368:      bracket_elem_t *start_elem, *end_elem;
                   2369: {
                   2370:   unsigned int start_ch, end_ch;
                   2371:   /* Equivalence Classes and Character Classes can't be a range start/end.  */
                   2372:   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
                   2373:          || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
                   2374:          0))
                   2375:     return REG_ERANGE;
                   2376: 
                   2377:   /* We can handle no multi character collating elements without libc
                   2378:      support.  */
                   2379:   if (BE ((start_elem->type == COLL_SYM
                   2380:           && strlen ((char *) start_elem->opr.name) > 1)
                   2381:          || (end_elem->type == COLL_SYM
                   2382:              && strlen ((char *) end_elem->opr.name) > 1), 0))
                   2383:     return REG_ECOLLATE;
                   2384: 
                   2385: # ifdef RE_ENABLE_I18N
                   2386:   {
                   2387:     wchar_t wc, start_wc, end_wc;
                   2388:     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
                   2389: 
                   2390:     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
                   2391:                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
                   2392:                   : 0));
                   2393:     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
                   2394:              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
                   2395:                 : 0));
                   2396:     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
                   2397:                ? __btowc (start_ch) : start_elem->opr.wch);
                   2398:     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
                   2399:              ? __btowc (end_ch) : end_elem->opr.wch);
                   2400:     cmp_buf[0] = start_wc;
                   2401:     cmp_buf[4] = end_wc;
                   2402:     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
                   2403:       return REG_ERANGE;
                   2404: 
                   2405:     /* Check the space of the arrays.  */
                   2406:     if (*range_alloc == mbcset->nranges)
                   2407:       {
                   2408:        /* There are not enough space, need realloc.  */
                   2409:        wchar_t *new_array_start, *new_array_end;
                   2410:        int new_nranges;
                   2411: 
                   2412:        /* +1 in case of mbcset->nranges is 0.  */
                   2413:        new_nranges = 2 * mbcset->nranges + 1;
                   2414:        /* Use realloc since mbcset->range_starts and mbcset->range_ends
                   2415:           are NULL if *range_alloc == 0.  */
                   2416:        new_array_start = re_realloc (mbcset->range_starts, wchar_t,
                   2417:                                      new_nranges);
                   2418:        new_array_end = re_realloc (mbcset->range_ends, wchar_t,
                   2419:                                    new_nranges);
                   2420: 
                   2421:        if (BE (new_array_start == NULL || new_array_end == NULL, 0))
                   2422:          return REG_ESPACE;
                   2423: 
                   2424:        mbcset->range_starts = new_array_start;
                   2425:        mbcset->range_ends = new_array_end;
                   2426:        *range_alloc = new_nranges;
                   2427:       }
                   2428: 
                   2429:     mbcset->range_starts[mbcset->nranges] = start_wc;
                   2430:     mbcset->range_ends[mbcset->nranges++] = end_wc;
                   2431: 
                   2432:     /* Build the table for single byte characters.  */
                   2433:     for (wc = 0; wc <= SBC_MAX; ++wc)
                   2434:       {
                   2435:        cmp_buf[2] = wc;
                   2436:        if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
                   2437:            && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
                   2438:          bitset_set (sbcset, wc);
                   2439:       }
                   2440:   }
                   2441: # else /* not RE_ENABLE_I18N */
                   2442:   {
                   2443:     unsigned int ch;
                   2444:     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
                   2445:                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
                   2446:                   : 0));
                   2447:     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
                   2448:              : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
                   2449:                 : 0));
                   2450:     if (start_ch > end_ch)
                   2451:       return REG_ERANGE;
                   2452:     /* Build the table for single byte characters.  */
                   2453:     for (ch = 0; ch <= SBC_MAX; ++ch)
                   2454:       if (start_ch <= ch  && ch <= end_ch)
                   2455:        bitset_set (sbcset, ch);
                   2456:   }
                   2457: # endif /* not RE_ENABLE_I18N */
                   2458:   return REG_NOERROR;
                   2459: }
                   2460: #endif /* not _LIBC */
                   2461: 
                   2462: #ifndef _LIBC
                   2463: /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
                   2464:    Build the collating element which is represented by NAME.
                   2465:    The result are written to MBCSET and SBCSET.
                   2466:    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
                   2467:    pointer argument since we may update it.  */
                   2468: 
                   2469: static reg_errcode_t
                   2470: # ifdef RE_ENABLE_I18N
                   2471: build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
                   2472:      re_charset_t *mbcset;
                   2473:      int *coll_sym_alloc;
                   2474: # else /* not RE_ENABLE_I18N */
                   2475: build_collating_symbol (sbcset, name)
                   2476: # endif /* not RE_ENABLE_I18N */
                   2477:      re_bitset_ptr_t sbcset;
                   2478:      const unsigned char *name;
                   2479: {
                   2480:   size_t name_len = strlen ((const char *) name);
                   2481:   if (BE (name_len != 1, 0))
                   2482:     return REG_ECOLLATE;
                   2483:   else
                   2484:     {
                   2485:       bitset_set (sbcset, name[0]);
                   2486:       return REG_NOERROR;
                   2487:     }
                   2488: }
                   2489: #endif /* not _LIBC */
                   2490: 
                   2491: /* This function parse bracket expression like "[abc]", "[a-c]",
                   2492:    "[[.a-a.]]" etc.  */
                   2493: 
                   2494: static bin_tree_t *
                   2495: parse_bracket_exp (regexp, dfa, token, syntax, err)
                   2496:      re_string_t *regexp;
                   2497:      re_dfa_t *dfa;
                   2498:      re_token_t *token;
                   2499:      reg_syntax_t syntax;
                   2500:      reg_errcode_t *err;
                   2501: {
                   2502: #ifdef _LIBC
                   2503:   const unsigned char *collseqmb;
                   2504:   const char *collseqwc;
                   2505:   uint32_t nrules;
                   2506:   int32_t table_size;
                   2507:   const int32_t *symb_table;
                   2508:   const unsigned char *extra;
                   2509: 
                   2510:   /* Local function for parse_bracket_exp used in _LIBC environement.
                   2511:      Seek the collating symbol entry correspondings to NAME.
                   2512:      Return the index of the symbol in the SYMB_TABLE.  */
                   2513: 
                   2514:   static inline int32_t
                   2515:   seek_collating_symbol_entry (name, name_len)
                   2516:         const unsigned char *name;
                   2517:         size_t name_len;
                   2518:     {
                   2519:       int32_t hash = elem_hash ((const char *) name, name_len);
                   2520:       int32_t elem = hash % table_size;
                   2521:       int32_t second = hash % (table_size - 2);
                   2522:       while (symb_table[2 * elem] != 0)
                   2523:        {
                   2524:          /* First compare the hashing value.  */
                   2525:          if (symb_table[2 * elem] == hash
                   2526:              /* Compare the length of the name.  */
                   2527:              && name_len == extra[symb_table[2 * elem + 1]]
                   2528:              /* Compare the name.  */
                   2529:              && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
                   2530:                         name_len) == 0)
                   2531:            {
                   2532:              /* Yep, this is the entry.  */
                   2533:              break;
                   2534:            }
                   2535: 
                   2536:          /* Next entry.  */
                   2537:          elem += second;
                   2538:        }
                   2539:       return elem;
                   2540:     }
                   2541: 
                   2542:   /* Local function for parse_bracket_exp used in _LIBC environement.
                   2543:      Look up the collation sequence value of BR_ELEM.
                   2544:      Return the value if succeeded, UINT_MAX otherwise.  */
                   2545: 
                   2546:   static inline unsigned int
                   2547:   lookup_collation_sequence_value (br_elem)
                   2548:         bracket_elem_t *br_elem;
                   2549:     {
                   2550:       if (br_elem->type == SB_CHAR)
                   2551:        {
                   2552:          /*
                   2553:          if (MB_CUR_MAX == 1)
                   2554:          */
                   2555:          if (nrules == 0)
                   2556:            return collseqmb[br_elem->opr.ch];
                   2557:          else
                   2558:            {
                   2559:              wint_t wc = __btowc (br_elem->opr.ch);
                   2560:              return collseq_table_lookup (collseqwc, wc);
                   2561:            }
                   2562:        }
                   2563:       else if (br_elem->type == MB_CHAR)
                   2564:        {
                   2565:          return collseq_table_lookup (collseqwc, br_elem->opr.wch);
                   2566:        }
                   2567:       else if (br_elem->type == COLL_SYM)
                   2568:        {
                   2569:          size_t sym_name_len = strlen ((char *) br_elem->opr.name);
                   2570:          if (nrules != 0)
                   2571:            {
                   2572:              int32_t elem, idx;
                   2573:              elem = seek_collating_symbol_entry (br_elem->opr.name,
                   2574:                                                  sym_name_len);
                   2575:              if (symb_table[2 * elem] != 0)
                   2576:                {
                   2577:                  /* We found the entry.  */
                   2578:                  idx = symb_table[2 * elem + 1];
                   2579:                  /* Skip the name of collating element name.  */
                   2580:                  idx += 1 + extra[idx];
                   2581:                  /* Skip the byte sequence of the collating element.  */
                   2582:                  idx += 1 + extra[idx];
                   2583:                  /* Adjust for the alignment.  */
                   2584:                  idx = (idx + 3) & ~3;
                   2585:                  /* Skip the multibyte collation sequence value.  */
                   2586:                  idx += sizeof (unsigned int);
                   2587:                  /* Skip the wide char sequence of the collating element.  */
                   2588:                  idx += sizeof (unsigned int) *
                   2589:                    (1 + *(unsigned int *) (extra + idx));
                   2590:                  /* Return the collation sequence value.  */
                   2591:                  return *(unsigned int *) (extra + idx);
                   2592:                }
                   2593:              else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
                   2594:                {
                   2595:                  /* No valid character.  Match it as a single byte
                   2596:                     character.  */
                   2597:                  return collseqmb[br_elem->opr.name[0]];
                   2598:                }
                   2599:            }
                   2600:          else if (sym_name_len == 1)
                   2601:            return collseqmb[br_elem->opr.name[0]];
                   2602:        }
                   2603:       return UINT_MAX;
                   2604:     }
                   2605: 
                   2606:   /* Local function for parse_bracket_exp used in _LIBC environement.
                   2607:      Build the range expression which starts from START_ELEM, and ends
                   2608:      at END_ELEM.  The result are written to MBCSET and SBCSET.
                   2609:      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
                   2610:      mbcset->range_ends, is a pointer argument sinse we may
                   2611:      update it.  */
                   2612: 
                   2613:   static inline reg_errcode_t
                   2614: # ifdef RE_ENABLE_I18N
                   2615:   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
                   2616:         re_charset_t *mbcset;
                   2617:         int *range_alloc;
                   2618: # else /* not RE_ENABLE_I18N */
                   2619:   build_range_exp (sbcset, start_elem, end_elem)
                   2620: # endif /* not RE_ENABLE_I18N */
                   2621:         re_bitset_ptr_t sbcset;
                   2622:         bracket_elem_t *start_elem, *end_elem;
                   2623:     {
                   2624:       unsigned int ch;
                   2625:       uint32_t start_collseq;
                   2626:       uint32_t end_collseq;
                   2627: 
                   2628: # ifdef RE_ENABLE_I18N
                   2629:       /* Check the space of the arrays.  */
                   2630:       if (*range_alloc == mbcset->nranges)
                   2631:        {
                   2632:          /* There are not enough space, need realloc.  */
                   2633:          uint32_t *new_array_start;
                   2634:          uint32_t *new_array_end;
                   2635:          int new_nranges;
                   2636: 
                   2637:          /* +1 in case of mbcset->nranges is 0.  */
                   2638:          new_nranges = 2 * mbcset->nranges + 1;
                   2639:          /* Use realloc since mbcset->range_starts and mbcset->range_ends
                   2640:             are NULL if *range_alloc == 0.  */
                   2641:          new_array_start = re_realloc (mbcset->range_starts, uint32_t,
                   2642:                                        new_nranges);
                   2643:          new_array_end = re_realloc (mbcset->range_ends, uint32_t,
                   2644:                                      new_nranges);
                   2645: 
                   2646:          if (BE (new_array_start == NULL || new_array_end == NULL, 0))
                   2647:            return REG_ESPACE;
                   2648: 
                   2649:          mbcset->range_starts = new_array_start;
                   2650:          mbcset->range_ends = new_array_end;
                   2651:          *range_alloc = new_nranges;
                   2652:        }
                   2653: # endif /* RE_ENABLE_I18N */
                   2654: 
                   2655:       /* Equivalence Classes and Character Classes can't be a range
                   2656:         start/end.  */
                   2657:       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
                   2658:              || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
                   2659:              0))
                   2660:        return REG_ERANGE;
                   2661: 
                   2662:       start_collseq = lookup_collation_sequence_value (start_elem);
                   2663:       end_collseq = lookup_collation_sequence_value (end_elem);
                   2664:       /* Check start/end collation sequence values.  */
                   2665:       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
                   2666:        return REG_ECOLLATE;
                   2667:       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
                   2668:        return REG_ERANGE;
                   2669: 
                   2670: # ifdef RE_ENABLE_I18N
                   2671:       /* Got valid collation sequence values, add them as a new entry.  */
                   2672:       mbcset->range_starts[mbcset->nranges] = start_collseq;
                   2673:       mbcset->range_ends[mbcset->nranges++] = end_collseq;
                   2674: # endif /* RE_ENABLE_I18N */
                   2675: 
                   2676:       /* Build the table for single byte characters.  */
                   2677:       for (ch = 0; ch <= SBC_MAX; ch++)
                   2678:        {
                   2679:          uint32_t ch_collseq;
                   2680:          /*
                   2681:          if (MB_CUR_MAX == 1)
                   2682:          */
                   2683:          if (nrules == 0)
                   2684:            ch_collseq = collseqmb[ch];
                   2685:          else
                   2686:            ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
                   2687:          if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
                   2688:            bitset_set (sbcset, ch);
                   2689:        }
                   2690:       return REG_NOERROR;
                   2691:     }
                   2692: 
                   2693:   /* Local function for parse_bracket_exp used in _LIBC environement.
                   2694:      Build the collating element which is represented by NAME.
                   2695:      The result are written to MBCSET and SBCSET.
                   2696:      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
                   2697:      pointer argument sinse we may update it.  */
                   2698: 
                   2699:   static inline reg_errcode_t
                   2700: # ifdef RE_ENABLE_I18N
                   2701:   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
                   2702:         re_charset_t *mbcset;
                   2703:         int *coll_sym_alloc;
                   2704: # else /* not RE_ENABLE_I18N */
                   2705:   build_collating_symbol (sbcset, name)
                   2706: # endif /* not RE_ENABLE_I18N */
                   2707:         re_bitset_ptr_t sbcset;
                   2708:         const unsigned char *name;
                   2709:     {
                   2710:       int32_t elem, idx;
                   2711:       size_t name_len = strlen ((const char *) name);
                   2712:       if (nrules != 0)
                   2713:        {
                   2714:          elem = seek_collating_symbol_entry (name, name_len);
                   2715:          if (symb_table[2 * elem] != 0)
                   2716:            {
                   2717:              /* We found the entry.  */
                   2718:              idx = symb_table[2 * elem + 1];
                   2719:              /* Skip the name of collating element name.  */
                   2720:              idx += 1 + extra[idx];
                   2721:            }
                   2722:          else if (symb_table[2 * elem] == 0 && name_len == 1)
                   2723:            {
                   2724:              /* No valid character, treat it as a normal
                   2725:                 character.  */
                   2726:              bitset_set (sbcset, name[0]);
                   2727:              return REG_NOERROR;
                   2728:            }
                   2729:          else
                   2730:            return REG_ECOLLATE;
                   2731: 
                   2732: # ifdef RE_ENABLE_I18N
                   2733:          /* Got valid collation sequence, add it as a new entry.  */
                   2734:          /* Check the space of the arrays.  */
                   2735:          if (*coll_sym_alloc == mbcset->ncoll_syms)
                   2736:            {
                   2737:              /* Not enough, realloc it.  */
                   2738:              /* +1 in case of mbcset->ncoll_syms is 0.  */
                   2739:              *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
                   2740:              /* Use realloc since mbcset->coll_syms is NULL
                   2741:                 if *alloc == 0.  */
                   2742:              mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
                   2743:                                              *coll_sym_alloc);
                   2744:              if (BE (mbcset->coll_syms == NULL, 0))
                   2745:                return REG_ESPACE;
                   2746:            }
                   2747:          mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
                   2748: # endif /* RE_ENABLE_I18N */
                   2749:          return REG_NOERROR;
                   2750:        }
                   2751:       else
                   2752:        {
                   2753:          if (BE (name_len != 1, 0))
                   2754:            return REG_ECOLLATE;
                   2755:          else
                   2756:            {
                   2757:              bitset_set (sbcset, name[0]);
                   2758:              return REG_NOERROR;
                   2759:            }
                   2760:        }
                   2761:     }
                   2762: #endif
                   2763: 
                   2764:   re_token_t br_token;
                   2765:   re_bitset_ptr_t sbcset;
                   2766: #ifdef RE_ENABLE_I18N
                   2767:   re_charset_t *mbcset;
                   2768:   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
                   2769:   int equiv_class_alloc = 0, char_class_alloc = 0;
                   2770: #else /* not RE_ENABLE_I18N */
                   2771:   int non_match = 0;
                   2772: #endif /* not RE_ENABLE_I18N */
                   2773:   bin_tree_t *work_tree;
                   2774:   int token_len, new_idx;
                   2775: #ifdef _LIBC
                   2776:   collseqmb = (const unsigned char *)
                   2777:     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
                   2778:   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                   2779:   if (nrules)
                   2780:     {
                   2781:       /*
                   2782:       if (MB_CUR_MAX > 1)
                   2783:       */
                   2784:        collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
                   2785:       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
                   2786:       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
                   2787:                                                  _NL_COLLATE_SYMB_TABLEMB);
                   2788:       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
                   2789:                                                   _NL_COLLATE_SYMB_EXTRAMB);
                   2790:     }
                   2791: #endif
                   2792:   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
                   2793: #ifdef RE_ENABLE_I18N
                   2794:   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
                   2795: #endif /* RE_ENABLE_I18N */
                   2796: #ifdef RE_ENABLE_I18N
                   2797:   if (BE (sbcset == NULL || mbcset == NULL, 0))
                   2798: #else
                   2799:   if (BE (sbcset == NULL, 0))
                   2800: #endif /* RE_ENABLE_I18N */
                   2801:     {
                   2802:       *err = REG_ESPACE;
                   2803:       return NULL;
                   2804:     }
                   2805: 
                   2806:   token_len = peek_token_bracket (token, regexp, syntax);
                   2807:   if (BE (token->type == END_OF_RE, 0))
                   2808:     {
                   2809:       *err = REG_BADPAT;
                   2810:       goto parse_bracket_exp_free_return;
                   2811:     }
                   2812:   if (token->type == OP_NON_MATCH_LIST)
                   2813:     {
                   2814: #ifdef RE_ENABLE_I18N
                   2815:       int i;
                   2816:       mbcset->non_match = 1;
                   2817: #else /* not RE_ENABLE_I18N */
                   2818:       non_match = 1;
                   2819: #endif /* not RE_ENABLE_I18N */
                   2820:       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
                   2821:        bitset_set (sbcset, '\0');
                   2822:       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
                   2823:       token_len = peek_token_bracket (token, regexp, syntax);
                   2824:       if (BE (token->type == END_OF_RE, 0))
                   2825:        {
                   2826:          *err = REG_BADPAT;
                   2827:          goto parse_bracket_exp_free_return;
                   2828:        }
                   2829: #ifdef RE_ENABLE_I18N
                   2830:       if (MB_CUR_MAX > 1)
                   2831:        for (i = 0; i < SBC_MAX; ++i)
                   2832:          if (__btowc (i) == WEOF)
                   2833:            bitset_set (sbcset, i);
                   2834: #endif /* RE_ENABLE_I18N */
                   2835:     }
                   2836: 
                   2837:   /* We treat the first ']' as a normal character.  */
                   2838:   if (token->type == OP_CLOSE_BRACKET)
                   2839:     token->type = CHARACTER;
                   2840: 
                   2841:   while (1)
                   2842:     {
                   2843:       bracket_elem_t start_elem, end_elem;
                   2844:       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
                   2845:       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
                   2846:       reg_errcode_t ret;
                   2847:       int token_len2 = 0, is_range_exp = 0;
                   2848:       re_token_t token2;
                   2849: 
                   2850:       start_elem.opr.name = start_name_buf;
                   2851:       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
                   2852:                                   syntax);
                   2853:       if (BE (ret != REG_NOERROR, 0))
                   2854:        {
                   2855:          *err = ret;
                   2856:          goto parse_bracket_exp_free_return;
                   2857:        }
                   2858: 
                   2859:       token_len = peek_token_bracket (token, regexp, syntax);
                   2860:       if (BE (token->type == END_OF_RE, 0))
                   2861:        {
                   2862:          *err = REG_BADPAT;
                   2863:          goto parse_bracket_exp_free_return;
                   2864:        }
                   2865:       if (token->type == OP_CHARSET_RANGE)
                   2866:        {
                   2867:          re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
                   2868:          token_len2 = peek_token_bracket (&token2, regexp, syntax);
                   2869:          if (BE (token->type == END_OF_RE, 0))
                   2870:            {
                   2871:              *err = REG_BADPAT;
                   2872:              goto parse_bracket_exp_free_return;
                   2873:            }
                   2874:          if (token2.type == OP_CLOSE_BRACKET)
                   2875:            {
                   2876:              /* We treat the last '-' as a normal character.  */
                   2877:              re_string_skip_bytes (regexp, -token_len);
                   2878:              token->type = CHARACTER;
                   2879:            }
                   2880:          else
                   2881:            is_range_exp = 1;
                   2882:        }
                   2883: 
                   2884:       if (is_range_exp == 1)
                   2885:        {
                   2886:          end_elem.opr.name = end_name_buf;
                   2887:          ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
                   2888:                                       dfa, syntax);
                   2889:          if (BE (ret != REG_NOERROR, 0))
                   2890:            {
                   2891:              *err = ret;
                   2892:              goto parse_bracket_exp_free_return;
                   2893:            }
                   2894: 
                   2895:          token_len = peek_token_bracket (token, regexp, syntax);
                   2896:          if (BE (token->type == END_OF_RE, 0))
                   2897:            {
                   2898:              *err = REG_BADPAT;
                   2899:              goto parse_bracket_exp_free_return;
                   2900:            }
                   2901:          *err = build_range_exp (sbcset,
                   2902: #ifdef RE_ENABLE_I18N
                   2903:                                  mbcset, &range_alloc,
                   2904: #endif /* RE_ENABLE_I18N */
                   2905:                                  &start_elem, &end_elem);
                   2906:          if (BE (*err != REG_NOERROR, 0))
                   2907:            goto parse_bracket_exp_free_return;
                   2908:        }
                   2909:       else
                   2910:        {
                   2911:          switch (start_elem.type)
                   2912:            {
                   2913:            case SB_CHAR:
                   2914:              bitset_set (sbcset, start_elem.opr.ch);
                   2915:              break;
                   2916: #ifdef RE_ENABLE_I18N
                   2917:            case MB_CHAR:
                   2918:              /* Check whether the array has enough space.  */
                   2919:              if (mbchar_alloc == mbcset->nmbchars)
                   2920:                {
                   2921:                  /* Not enough, realloc it.  */
                   2922:                  /* +1 in case of mbcset->nmbchars is 0.  */
                   2923:                  mbchar_alloc = 2 * mbcset->nmbchars + 1;
                   2924:                  /* Use realloc since array is NULL if *alloc == 0.  */
                   2925:                  mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
                   2926:                                                mbchar_alloc);
                   2927:                  if (BE (mbcset->mbchars == NULL, 0))
                   2928:                    goto parse_bracket_exp_espace;
                   2929:                }
                   2930:              mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
                   2931:              break;
                   2932: #endif /* RE_ENABLE_I18N */
                   2933:            case EQUIV_CLASS:
                   2934:              *err = build_equiv_class (sbcset,
                   2935: #ifdef RE_ENABLE_I18N
                   2936:                                        mbcset, &equiv_class_alloc,
                   2937: #endif /* RE_ENABLE_I18N */
                   2938:                                        start_elem.opr.name);
                   2939:              if (BE (*err != REG_NOERROR, 0))
                   2940:                goto parse_bracket_exp_free_return;
                   2941:              break;
                   2942:            case COLL_SYM:
                   2943:              *err = build_collating_symbol (sbcset,
                   2944: #ifdef RE_ENABLE_I18N
                   2945:                                             mbcset, &coll_sym_alloc,
                   2946: #endif /* RE_ENABLE_I18N */
                   2947:                                             start_elem.opr.name);
                   2948:              if (BE (*err != REG_NOERROR, 0))
                   2949:                goto parse_bracket_exp_free_return;
                   2950:              break;
                   2951:            case CHAR_CLASS:
                   2952:              *err = build_charclass (sbcset,
                   2953: #ifdef RE_ENABLE_I18N
                   2954:                                      mbcset, &char_class_alloc,
                   2955: #endif /* RE_ENABLE_I18N */
                   2956:                                      start_elem.opr.name, syntax);
                   2957:              if (BE (*err != REG_NOERROR, 0))
                   2958:               goto parse_bracket_exp_free_return;
                   2959:              break;
                   2960:            default:
                   2961:              assert (0);
                   2962:              break;
                   2963:            }
                   2964:        }
                   2965:       if (token->type == OP_CLOSE_BRACKET)
                   2966:        break;
                   2967:     }
                   2968: 
                   2969:   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
                   2970: 
                   2971:   /* If it is non-matching list.  */
                   2972: #ifdef RE_ENABLE_I18N
                   2973:   if (mbcset->non_match)
                   2974: #else /* not RE_ENABLE_I18N */
                   2975:   if (non_match)
                   2976: #endif /* not RE_ENABLE_I18N */
                   2977:     bitset_not (sbcset);
                   2978: 
                   2979:   /* Build a tree for simple bracket.  */
                   2980:   br_token.type = SIMPLE_BRACKET;
                   2981:   br_token.opr.sbcset = sbcset;
                   2982:   new_idx = re_dfa_add_node (dfa, br_token, 0);
                   2983:   work_tree = create_tree (NULL, NULL, 0, new_idx);
                   2984:   if (BE (new_idx == -1 || work_tree == NULL, 0))
                   2985:     goto parse_bracket_exp_espace;
                   2986: 
                   2987: #ifdef RE_ENABLE_I18N
                   2988:   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
                   2989:       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
                   2990:                                                || mbcset->non_match)))
                   2991:     {
                   2992:       re_token_t alt_token;
                   2993:       bin_tree_t *mbc_tree;
                   2994:       /* Build a tree for complex bracket.  */
                   2995:       br_token.type = COMPLEX_BRACKET;
                   2996:       br_token.opr.mbcset = mbcset;
                   2997:       dfa->has_mb_node = 1;
                   2998:       new_idx = re_dfa_add_node (dfa, br_token, 0);
                   2999:       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
                   3000:       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
                   3001:        goto parse_bracket_exp_espace;
                   3002:       /* Then join them by ALT node.  */
                   3003:       dfa->has_plural_match = 1;
                   3004:       alt_token.type = OP_ALT;
                   3005:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
                   3006:       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
                   3007:       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
                   3008:        return work_tree;
                   3009:     }
                   3010:   else
                   3011:     {
                   3012:       free_charset (mbcset);
                   3013:       return work_tree;
                   3014:     }
                   3015: #else /* not RE_ENABLE_I18N */
                   3016:   return work_tree;
                   3017: #endif /* not RE_ENABLE_I18N */
                   3018: 
                   3019:  parse_bracket_exp_espace:
                   3020:   *err = REG_ESPACE;
                   3021:  parse_bracket_exp_free_return:
                   3022:   re_free (sbcset);
                   3023: #ifdef RE_ENABLE_I18N
                   3024:   free_charset (mbcset);
                   3025: #endif /* RE_ENABLE_I18N */
                   3026:   return NULL;
                   3027: }
                   3028: 
                   3029: /* Parse an element in the bracket expression.  */
                   3030: 
                   3031: static reg_errcode_t
                   3032: parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
                   3033:      bracket_elem_t *elem;
                   3034:      re_string_t *regexp;
                   3035:      re_token_t *token;
                   3036:      int token_len;
                   3037:      re_dfa_t *dfa;
                   3038:      reg_syntax_t syntax;
                   3039: {
                   3040: #ifdef RE_ENABLE_I18N
                   3041:   int cur_char_size;
                   3042:   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
                   3043:   if (cur_char_size > 1)
                   3044:     {
                   3045:       elem->type = MB_CHAR;
                   3046:       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
                   3047:       re_string_skip_bytes (regexp, cur_char_size);
                   3048:       return REG_NOERROR;
                   3049:     }
                   3050: #endif /* RE_ENABLE_I18N */
                   3051:   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
                   3052:   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
                   3053:       || token->type == OP_OPEN_EQUIV_CLASS)
                   3054:     return parse_bracket_symbol (elem, regexp, token);
                   3055:   elem->type = SB_CHAR;
                   3056:   elem->opr.ch = token->opr.c;
                   3057:   return REG_NOERROR;
                   3058: }
                   3059: 
                   3060: /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
                   3061:    such as [:<character_class>:], [.<collating_element>.], and
                   3062:    [=<equivalent_class>=].  */
                   3063: 
                   3064: static reg_errcode_t
                   3065: parse_bracket_symbol (elem, regexp, token)
                   3066:      bracket_elem_t *elem;
                   3067:      re_string_t *regexp;
                   3068:      re_token_t *token;
                   3069: {
                   3070:   unsigned char ch, delim = token->opr.c;
                   3071:   int i = 0;
                   3072:   for (;; ++i)
                   3073:     {
                   3074:       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
                   3075:        return REG_EBRACK;
                   3076:       if (token->type == OP_OPEN_CHAR_CLASS)
                   3077:        ch = re_string_fetch_byte_case (regexp);
                   3078:       else
                   3079:        ch = re_string_fetch_byte (regexp);
                   3080:       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
                   3081:        break;
                   3082:       elem->opr.name[i] = ch;
                   3083:     }
                   3084:   re_string_skip_bytes (regexp, 1);
                   3085:   elem->opr.name[i] = '\0';
                   3086:   switch (token->type)
                   3087:     {
                   3088:     case OP_OPEN_COLL_ELEM:
                   3089:       elem->type = COLL_SYM;
                   3090:       break;
                   3091:     case OP_OPEN_EQUIV_CLASS:
                   3092:       elem->type = EQUIV_CLASS;
                   3093:       break;
                   3094:     case OP_OPEN_CHAR_CLASS:
                   3095:       elem->type = CHAR_CLASS;
                   3096:       break;
                   3097:     default:
                   3098:       break;
                   3099:     }
                   3100:   return REG_NOERROR;
                   3101: }
                   3102: 
                   3103:   /* Helper function for parse_bracket_exp.
                   3104:      Build the equivalence class which is represented by NAME.
                   3105:      The result are written to MBCSET and SBCSET.
                   3106:      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
                   3107:      is a pointer argument sinse we may update it.  */
                   3108: 
                   3109: static reg_errcode_t
                   3110: #ifdef RE_ENABLE_I18N
                   3111: build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
                   3112:      re_charset_t *mbcset;
                   3113:      int *equiv_class_alloc;
                   3114: #else /* not RE_ENABLE_I18N */
                   3115: build_equiv_class (sbcset, name)
                   3116: #endif /* not RE_ENABLE_I18N */
                   3117:      re_bitset_ptr_t sbcset;
                   3118:      const unsigned char *name;
                   3119: {
                   3120: #if defined _LIBC && defined RE_ENABLE_I18N
                   3121:   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                   3122:   if (nrules != 0)
                   3123:     {
                   3124:       const int32_t *table, *indirect;
                   3125:       const unsigned char *weights, *extra, *cp;
                   3126:       unsigned char char_buf[2];
                   3127:       int32_t idx1, idx2;
                   3128:       unsigned int ch;
                   3129:       size_t len;
                   3130:       /* This #include defines a local function!  */
                   3131: # include <locale/weight.h>
                   3132:       /* Calculate the index for equivalence class.  */
                   3133:       cp = name;
                   3134:       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
                   3135:       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
                   3136:                                               _NL_COLLATE_WEIGHTMB);
                   3137:       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
                   3138:                                                   _NL_COLLATE_EXTRAMB);
                   3139:       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
                   3140:                                                _NL_COLLATE_INDIRECTMB);
                   3141:       idx1 = findidx (&cp);
                   3142:       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
                   3143:        /* This isn't a valid character.  */
                   3144:        return REG_ECOLLATE;
                   3145: 
                   3146:       /* Build single byte matcing table for this equivalence class.  */
                   3147:       char_buf[1] = (unsigned char) '\0';
                   3148:       len = weights[idx1];
                   3149:       for (ch = 0; ch < SBC_MAX; ++ch)
                   3150:        {
                   3151:          char_buf[0] = ch;
                   3152:          cp = char_buf;
                   3153:          idx2 = findidx (&cp);
                   3154: /*
                   3155:          idx2 = table[ch];
                   3156: */
                   3157:          if (idx2 == 0)
                   3158:            /* This isn't a valid character.  */
                   3159:            continue;
                   3160:          if (len == weights[idx2])
                   3161:            {
                   3162:              int cnt = 0;
                   3163:              while (cnt <= len &&
                   3164:                     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
                   3165:                ++cnt;
                   3166: 
                   3167:              if (cnt > len)
                   3168:                bitset_set (sbcset, ch);
                   3169:            }
                   3170:        }
                   3171:       /* Check whether the array has enough space.  */
                   3172:       if (*equiv_class_alloc == mbcset->nequiv_classes)
                   3173:        {
                   3174:          /* Not enough, realloc it.  */
                   3175:          /* +1 in case of mbcset->nequiv_classes is 0.  */
                   3176:          *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
                   3177:          /* Use realloc since the array is NULL if *alloc == 0.  */
                   3178:          mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
                   3179:                                              *equiv_class_alloc);
                   3180:          if (BE (mbcset->equiv_classes == NULL, 0))
                   3181:            return REG_ESPACE;
                   3182:        }
                   3183:       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
                   3184:     }
                   3185:   else
                   3186: #endif /* _LIBC && RE_ENABLE_I18N */
                   3187:     {
                   3188:       if (BE (strlen ((const char *) name) != 1, 0))
                   3189:        return REG_ECOLLATE;
                   3190:       bitset_set (sbcset, *name);
                   3191:     }
                   3192:   return REG_NOERROR;
                   3193: }
                   3194: 
                   3195:   /* Helper function for parse_bracket_exp.
                   3196:      Build the character class which is represented by NAME.
                   3197:      The result are written to MBCSET and SBCSET.
                   3198:      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
                   3199:      is a pointer argument sinse we may update it.  */
                   3200: 
                   3201: static reg_errcode_t
                   3202: #ifdef RE_ENABLE_I18N
                   3203: build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
                   3204:      re_charset_t *mbcset;
                   3205:      int *char_class_alloc;
                   3206: #else /* not RE_ENABLE_I18N */
                   3207: build_charclass (sbcset, class_name, syntax)
                   3208: #endif /* not RE_ENABLE_I18N */
                   3209:      re_bitset_ptr_t sbcset;
                   3210:      const unsigned char *class_name;
                   3211:      reg_syntax_t syntax;
                   3212: {
                   3213:   int i;
                   3214:   const char *name = (const char *) class_name;
                   3215: 
                   3216:   /* In case of REG_ICASE "upper" and "lower" match the both of
                   3217:      upper and lower cases.  */
                   3218:   if ((syntax & RE_ICASE)
                   3219:       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
                   3220:     name = "alpha";
                   3221: 
                   3222: #ifdef RE_ENABLE_I18N
                   3223:   /* Check the space of the arrays.  */
                   3224:   if (*char_class_alloc == mbcset->nchar_classes)
                   3225:     {
                   3226:       /* Not enough, realloc it.  */
                   3227:       /* +1 in case of mbcset->nchar_classes is 0.  */
                   3228:       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
                   3229:       /* Use realloc since array is NULL if *alloc == 0.  */
                   3230:       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
                   3231:                                         *char_class_alloc);
                   3232:       if (BE (mbcset->char_classes == NULL, 0))
                   3233:        return REG_ESPACE;
                   3234:     }
                   3235:   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
                   3236: #endif /* RE_ENABLE_I18N */
                   3237: 
                   3238: #define BUILD_CHARCLASS_LOOP(ctype_func)\
                   3239:     for (i = 0; i < SBC_MAX; ++i)      \
                   3240:       {                                        \
                   3241:        if (ctype_func (i))             \
                   3242:          bitset_set (sbcset, i);       \
                   3243:       }
                   3244: 
                   3245:   if (strcmp (name, "alnum") == 0)
                   3246:     BUILD_CHARCLASS_LOOP (isalnum)
                   3247:   else if (strcmp (name, "cntrl") == 0)
                   3248:     BUILD_CHARCLASS_LOOP (iscntrl)
                   3249:   else if (strcmp (name, "lower") == 0)
                   3250:     BUILD_CHARCLASS_LOOP (islower)
                   3251:   else if (strcmp (name, "space") == 0)
                   3252:     BUILD_CHARCLASS_LOOP (isspace)
                   3253:   else if (strcmp (name, "alpha") == 0)
                   3254:     BUILD_CHARCLASS_LOOP (isalpha)
                   3255:   else if (strcmp (name, "digit") == 0)
                   3256:     BUILD_CHARCLASS_LOOP (isdigit)
                   3257:   else if (strcmp (name, "print") == 0)
                   3258:     BUILD_CHARCLASS_LOOP (isprint)
                   3259:   else if (strcmp (name, "upper") == 0)
                   3260:     BUILD_CHARCLASS_LOOP (isupper)
                   3261:   else if (strcmp (name, "blank") == 0)
                   3262:     BUILD_CHARCLASS_LOOP (isblank)
                   3263:   else if (strcmp (name, "graph") == 0)
                   3264:     BUILD_CHARCLASS_LOOP (isgraph)
                   3265:   else if (strcmp (name, "punct") == 0)
                   3266:     BUILD_CHARCLASS_LOOP (ispunct)
                   3267:   else if (strcmp (name, "xdigit") == 0)
                   3268:     BUILD_CHARCLASS_LOOP (isxdigit)
                   3269:   else
                   3270:     return REG_ECTYPE;
                   3271: 
                   3272:   return REG_NOERROR;
                   3273: }
                   3274: 
                   3275: static bin_tree_t *
                   3276: build_word_op (dfa, not, err)
                   3277:      re_dfa_t *dfa;
                   3278:      int not;
                   3279:      reg_errcode_t *err;
                   3280: {
                   3281:   re_bitset_ptr_t sbcset;
                   3282: #ifdef RE_ENABLE_I18N
                   3283:   re_charset_t *mbcset;
                   3284:   int alloc = 0;
                   3285: #else /* not RE_ENABLE_I18N */
                   3286:   int non_match = 0;
                   3287: #endif /* not RE_ENABLE_I18N */
                   3288:   reg_errcode_t ret;
                   3289:   re_token_t br_token;
                   3290:   bin_tree_t *tree;
                   3291:   int new_idx;
                   3292: 
                   3293:   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
                   3294: #ifdef RE_ENABLE_I18N
                   3295:   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
                   3296: #endif /* RE_ENABLE_I18N */
                   3297: 
                   3298: #ifdef RE_ENABLE_I18N
                   3299:   if (BE (sbcset == NULL || mbcset == NULL, 0))
                   3300: #else /* not RE_ENABLE_I18N */
                   3301:   if (BE (sbcset == NULL, 0))
                   3302: #endif /* not RE_ENABLE_I18N */
                   3303:     {
                   3304:       *err = REG_ESPACE;
                   3305:       return NULL;
                   3306:     }
                   3307: 
                   3308:   if (not)
                   3309:     {
                   3310: #ifdef RE_ENABLE_I18N
                   3311:       int i;
                   3312:       /*
                   3313:       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
                   3314:        bitset_set(cset->sbcset, '\0');
                   3315:       */
                   3316:       mbcset->non_match = 1;
                   3317:       if (MB_CUR_MAX > 1)
                   3318:        for (i = 0; i < SBC_MAX; ++i)
                   3319:          if (__btowc (i) == WEOF)
                   3320:            bitset_set (sbcset, i);
                   3321: #else /* not RE_ENABLE_I18N */
                   3322:       non_match = 1;
                   3323: #endif /* not RE_ENABLE_I18N */
                   3324:     }
                   3325: 
                   3326:   /* We don't care the syntax in this case.  */
                   3327:   ret = build_charclass (sbcset,
                   3328: #ifdef RE_ENABLE_I18N
                   3329:                         mbcset, &alloc,
                   3330: #endif /* RE_ENABLE_I18N */
                   3331:                         (const unsigned char *) "alpha", 0);
                   3332: 
                   3333:   if (BE (ret != REG_NOERROR, 0))
                   3334:     {
                   3335:       re_free (sbcset);
                   3336: #ifdef RE_ENABLE_I18N
                   3337:       free_charset (mbcset);
                   3338: #endif /* RE_ENABLE_I18N */
                   3339:       *err = ret;
                   3340:       return NULL;
                   3341:     }
                   3342:   /* \w match '_' also.  */
                   3343:   bitset_set (sbcset, '_');
                   3344: 
                   3345:   /* If it is non-matching list.  */
                   3346: #ifdef RE_ENABLE_I18N
                   3347:   if (mbcset->non_match)
                   3348: #else /* not RE_ENABLE_I18N */
                   3349:   if (non_match)
                   3350: #endif /* not RE_ENABLE_I18N */
                   3351:     bitset_not (sbcset);
                   3352: 
                   3353:   /* Build a tree for simple bracket.  */
                   3354:   br_token.type = SIMPLE_BRACKET;
                   3355:   br_token.opr.sbcset = sbcset;
                   3356:   new_idx = re_dfa_add_node (dfa, br_token, 0);
                   3357:   tree = create_tree (NULL, NULL, 0, new_idx);
                   3358:   if (BE (new_idx == -1 || tree == NULL, 0))
                   3359:     goto build_word_op_espace;
                   3360: 
                   3361: #ifdef RE_ENABLE_I18N
                   3362:   if (MB_CUR_MAX > 1)
                   3363:     {
                   3364:       re_token_t alt_token;
                   3365:       bin_tree_t *mbc_tree;
                   3366:       /* Build a tree for complex bracket.  */
                   3367:       br_token.type = COMPLEX_BRACKET;
                   3368:       br_token.opr.mbcset = mbcset;
                   3369:       dfa->has_mb_node = 1;
                   3370:       new_idx = re_dfa_add_node (dfa, br_token, 0);
                   3371:       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
                   3372:       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
                   3373:        goto build_word_op_espace;
                   3374:       /* Then join them by ALT node.  */
                   3375:       alt_token.type = OP_ALT;
                   3376:       new_idx = re_dfa_add_node (dfa, alt_token, 0);
                   3377:       tree = create_tree (tree, mbc_tree, 0, new_idx);
                   3378:       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
                   3379:        return tree;
                   3380:     }
                   3381:   else
                   3382:     {
                   3383:       free_charset (mbcset);
                   3384:       return tree;
                   3385:     }
                   3386: #else /* not RE_ENABLE_I18N */
                   3387:   return tree;
                   3388: #endif /* not RE_ENABLE_I18N */
                   3389: 
                   3390:  build_word_op_espace:
                   3391:   re_free (sbcset);
                   3392: #ifdef RE_ENABLE_I18N
                   3393:   free_charset (mbcset);
                   3394: #endif /* RE_ENABLE_I18N */
                   3395:   *err = REG_ESPACE;
                   3396:   return NULL;
                   3397: }
                   3398: 
                   3399: /* This is intended for the expressions like "a{1,3}".
                   3400:    Fetch a number from `input', and return the number.
                   3401:    Return -1, if the number field is empty like "{,1}".
                   3402:    Return -2, If an error is occured.  */
                   3403: 
                   3404: static int
                   3405: fetch_number (input, token, syntax)
                   3406:      re_string_t *input;
                   3407:      re_token_t *token;
                   3408:      reg_syntax_t syntax;
                   3409: {
                   3410:   int num = -1;
                   3411:   unsigned char c;
                   3412:   while (1)
                   3413:     {
                   3414:       *token = fetch_token (input, syntax);
                   3415:       c = token->opr.c;
                   3416:       if (BE (token->type == END_OF_RE, 0))
                   3417:        return -2;
                   3418:       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
                   3419:        break;
                   3420:       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
                   3421:             ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
                   3422:       num = (num > RE_DUP_MAX) ? -2 : num;
                   3423:     }
                   3424:   return num;
                   3425: }
                   3426: 
                   3427: #ifdef RE_ENABLE_I18N
                   3428: static void
                   3429: free_charset (re_charset_t *cset)
                   3430: {
                   3431:   re_free (cset->mbchars);
                   3432: # ifdef _LIBC
                   3433:   re_free (cset->coll_syms);
                   3434:   re_free (cset->equiv_classes);
                   3435:   re_free (cset->range_starts);
                   3436:   re_free (cset->range_ends);
                   3437: # endif
                   3438:   re_free (cset->char_classes);
                   3439:   re_free (cset);
                   3440: }
                   3441: #endif /* RE_ENABLE_I18N */
                   3442: 
                   3443: /* Functions for binary tree operation.  */
                   3444: 
                   3445: /* Create a node of tree.
                   3446:    Note: This function automatically free left and right if malloc fails.  */
                   3447: 
                   3448: static bin_tree_t *
                   3449: create_tree (left, right, type, index)
                   3450:      bin_tree_t *left;
                   3451:      bin_tree_t *right;
                   3452:      re_token_type_t type;
                   3453:      int index;
                   3454: {
                   3455:   bin_tree_t *tree;
                   3456:   tree = re_malloc (bin_tree_t, 1);
                   3457:   if (BE (tree == NULL, 0))
                   3458:     {
                   3459:       free_bin_tree (left);
                   3460:       free_bin_tree (right);
                   3461:       return NULL;
                   3462:     }
                   3463:   tree->parent = NULL;
                   3464:   tree->left = left;
                   3465:   tree->right = right;
                   3466:   tree->type = type;
                   3467:   tree->node_idx = index;
                   3468:   tree->first = -1;
                   3469:   tree->next = -1;
                   3470:   re_node_set_init_empty (&tree->eclosure);
                   3471: 
                   3472:   if (left != NULL)
                   3473:     left->parent = tree;
                   3474:   if (right != NULL)
                   3475:     right->parent = tree;
                   3476:   return tree;
                   3477: }
                   3478: 
                   3479: /* Free the sub tree pointed by TREE.  */
                   3480: 
                   3481: static void
                   3482: free_bin_tree (tree)
                   3483:      bin_tree_t *tree;
                   3484: {
                   3485:   if (tree == NULL)
                   3486:     return;
                   3487:   /*re_node_set_free (&tree->eclosure);*/
                   3488:   free_bin_tree (tree->left);
                   3489:   free_bin_tree (tree->right);
                   3490:   re_free (tree);
                   3491: }
                   3492: 
                   3493: /* Duplicate the node SRC, and return new node.  */
                   3494: 
                   3495: static bin_tree_t *
                   3496: duplicate_tree (src, dfa)
                   3497:      const bin_tree_t *src;
                   3498:      re_dfa_t *dfa;
                   3499: {
                   3500:   bin_tree_t *left = NULL, *right = NULL, *new_tree;
                   3501:   int new_node_idx;
                   3502:   /* Since node indies must be according to Post-order of the tree,
                   3503:      we must duplicate the left at first.  */
                   3504:   if (src->left != NULL)
                   3505:     {
                   3506:       left = duplicate_tree (src->left, dfa);
                   3507:       if (left == NULL)
                   3508:        return NULL;
                   3509:     }
                   3510: 
                   3511:   /* Secondaly, duplicate the right.  */
                   3512:   if (src->right != NULL)
                   3513:     {
                   3514:       right = duplicate_tree (src->right, dfa);
                   3515:       if (right == NULL)
                   3516:        {
                   3517:          free_bin_tree (left);
                   3518:          return NULL;
                   3519:        }
                   3520:     }
                   3521: 
                   3522:   /* At last, duplicate itself.  */
                   3523:   if (src->type == NON_TYPE)
                   3524:     {
                   3525:       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
                   3526:       dfa->nodes[new_node_idx].duplicated = 1;
                   3527:       if (BE (new_node_idx == -1, 0))
                   3528:        {
                   3529:          free_bin_tree (left);
                   3530:          free_bin_tree (right);
                   3531:          return NULL;
                   3532:        }
                   3533:     }
                   3534:   else
                   3535:     new_node_idx = src->type;
                   3536: 
                   3537:   new_tree = create_tree (left, right, src->type, new_node_idx);
                   3538:   if (BE (new_tree == NULL, 0))
                   3539:     {
                   3540:       free_bin_tree (left);
                   3541:       free_bin_tree (right);
                   3542:     }
                   3543:   return new_tree;
                   3544: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>