File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / smartmontools / regex / regex_internal.h
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 01:17:36 2013 UTC (11 years, 8 months ago) by misho
Branches: smartmontools, elwix, MAIN
CVS tags: v6_2, v6_1p0, v6_1, HEAD
6.1

    1: /* Extended regular expression matching and search library.
    2:    Copyright (C) 2002, 2003 Free Software Foundation, Inc.
    3:    This file is part of the GNU C Library.
    4:    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
    5: 
    6:    The GNU C Library is free software; you can redistribute it and/or
    7:    modify it under the terms of the GNU Lesser General Public
    8:    License as published by the Free Software Foundation; either
    9:    version 2.1 of the License, or (at your option) any later version.
   10: 
   11:    The GNU C Library is distributed in the hope that it will be useful,
   12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14:    Lesser General Public License for more details.
   15: 
   16:    You should have received a copy of the GNU Lesser General Public
   17:    License along with the GNU C Library; if not, write to the Free
   18:    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
   19:    MA 02110-1301 USA.  */
   20: 
   21: #ifndef _REGEX_INTERNAL_H
   22: #define _REGEX_INTERNAL_H 1
   23: 
   24: #ifdef HAVE_CONFIG_H
   25: #include "config.h"
   26: #endif
   27: 
   28: #include <assert.h>
   29: #include <ctype.h>
   30: #include <limits.h>
   31: #include <stdio.h>
   32: #include <stdlib.h>
   33: #include <string.h>
   34: 
   35: #if defined HAVE_LOCALE_H || defined _LIBC
   36: # include <locale.h>
   37: #endif
   38: #if defined HAVE_WCHAR_H || defined _LIBC
   39: # include <wchar.h>
   40: #endif /* HAVE_WCHAR_H || _LIBC */
   41: #if defined HAVE_WCTYPE_H || defined _LIBC
   42: # include <wctype.h>
   43: #endif /* HAVE_WCTYPE_H || _LIBC */
   44: 
   45: /* In case that the system doesn't have isblank().  */
   46: #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
   47: # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
   48: #endif
   49: 
   50: #ifdef _LIBC
   51: # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
   52: #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
   53: #   include <locale/localeinfo.h>
   54: #   include <locale/elem-hash.h>
   55: #   include <locale/coll-lookup.h>
   56: # endif
   57: #endif
   58: 
   59: /* This is for other GNU distributions with internationalized messages.  */
   60: #if HAVE_LIBINTL_H || defined _LIBC
   61: # include <libintl.h>
   62: # ifdef _LIBC
   63: #  undef gettext
   64: #  define gettext(msgid) \
   65:   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
   66: # endif
   67: #else
   68: # define gettext(msgid) (msgid)
   69: #endif
   70: 
   71: #ifndef gettext_noop
   72: /* This define is so xgettext can find the internationalizable
   73:    strings.  */
   74: # define gettext_noop(String) String
   75: #endif
   76: 
   77: #if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC
   78: # define RE_ENABLE_I18N
   79: #endif
   80: 
   81: #if __GNUC__ >= 3
   82: # define BE(expr, val) __builtin_expect (expr, val)
   83: #else
   84: # define BE(expr, val) (expr)
   85: # define inline
   86: #endif
   87: 
   88: /* Number of bits in a byte.  */
   89: #define BYTE_BITS 8
   90: /* Number of single byte character.  */
   91: #define SBC_MAX 256
   92: 
   93: #define COLL_ELEM_LEN_MAX 8
   94: 
   95: /* The character which represents newline.  */
   96: #define NEWLINE_CHAR '\n'
   97: #define WIDE_NEWLINE_CHAR L'\n'
   98: 
   99: /* Rename to standard API for using out of glibc.  */
  100: #ifndef _LIBC
  101: # define __wctype wctype
  102: # define __iswctype iswctype
  103: # define __btowc btowc
  104: # define __mempcpy mempcpy
  105: # define __wcrtomb wcrtomb
  106: # define attribute_hidden
  107: #endif /* not _LIBC */
  108: 
  109: extern const char __re_error_msgid[] attribute_hidden;
  110: extern const size_t __re_error_msgid_idx[] attribute_hidden;
  111: 
  112: /* Number of bits in an unsinged int.  */
  113: #define UINT_BITS (sizeof (unsigned int) * BYTE_BITS)
  114: /* Number of unsigned int in an bit_set.  */
  115: #define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
  116: typedef unsigned int bitset[BITSET_UINTS];
  117: typedef unsigned int *re_bitset_ptr_t;
  118: 
  119: #define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS)
  120: #define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS))
  121: #define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS))
  122: #define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)
  123: #define bitset_set_all(set) \
  124:   memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)
  125: #define bitset_copy(dest,src) \
  126:   memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)
  127: static inline void bitset_not (bitset set);
  128: static inline void bitset_merge (bitset dest, const bitset src);
  129: static inline void bitset_not_merge (bitset dest, const bitset src);
  130: 
  131: #define PREV_WORD_CONSTRAINT 0x0001
  132: #define PREV_NOTWORD_CONSTRAINT 0x0002
  133: #define NEXT_WORD_CONSTRAINT 0x0004
  134: #define NEXT_NOTWORD_CONSTRAINT 0x0008
  135: #define PREV_NEWLINE_CONSTRAINT 0x0010
  136: #define NEXT_NEWLINE_CONSTRAINT 0x0020
  137: #define PREV_BEGBUF_CONSTRAINT 0x0040
  138: #define NEXT_ENDBUF_CONSTRAINT 0x0080
  139: #define DUMMY_CONSTRAINT 0x0100
  140: 
  141: typedef enum
  142: {
  143:   INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  144:   WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT,
  145:   WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT,
  146:   LINE_FIRST = PREV_NEWLINE_CONSTRAINT,
  147:   LINE_LAST = NEXT_NEWLINE_CONSTRAINT,
  148:   BUF_FIRST = PREV_BEGBUF_CONSTRAINT,
  149:   BUF_LAST = NEXT_ENDBUF_CONSTRAINT,
  150:   WORD_DELIM = DUMMY_CONSTRAINT
  151: } re_context_type;
  152: 
  153: typedef struct
  154: {
  155:   int alloc;
  156:   int nelem;
  157:   int *elems;
  158: } re_node_set;
  159: 
  160: typedef enum
  161: {
  162:   NON_TYPE = 0,
  163: 
  164:   /* Token type, these are used only by token.  */
  165:   OP_OPEN_BRACKET,
  166:   OP_CLOSE_BRACKET,
  167:   OP_CHARSET_RANGE,
  168:   OP_OPEN_DUP_NUM,
  169:   OP_CLOSE_DUP_NUM,
  170:   OP_NON_MATCH_LIST,
  171:   OP_OPEN_COLL_ELEM,
  172:   OP_CLOSE_COLL_ELEM,
  173:   OP_OPEN_EQUIV_CLASS,
  174:   OP_CLOSE_EQUIV_CLASS,
  175:   OP_OPEN_CHAR_CLASS,
  176:   OP_CLOSE_CHAR_CLASS,
  177:   OP_WORD,
  178:   OP_NOTWORD,
  179:   BACK_SLASH,
  180: 
  181:   /* Tree type, these are used only by tree. */
  182:   CONCAT,
  183:   ALT,
  184:   SUBEXP,
  185:   SIMPLE_BRACKET,
  186: #ifdef RE_ENABLE_I18N
  187:   COMPLEX_BRACKET,
  188: #endif /* RE_ENABLE_I18N */
  189: 
  190:   /* Node type, These are used by token, node, tree.  */
  191:   OP_OPEN_SUBEXP,
  192:   OP_CLOSE_SUBEXP,
  193:   OP_PERIOD,
  194:   CHARACTER,
  195:   END_OF_RE,
  196:   OP_ALT,
  197:   OP_DUP_ASTERISK,
  198:   OP_DUP_PLUS,
  199:   OP_DUP_QUESTION,
  200:   OP_BACK_REF,
  201:   ANCHOR,
  202: 
  203:   /* Dummy marker.  */
  204:   END_OF_RE_TOKEN_T
  205: } re_token_type_t;
  206: 
  207: #ifdef RE_ENABLE_I18N
  208: typedef struct
  209: {
  210:   /* Multibyte characters.  */
  211:   wchar_t *mbchars;
  212: 
  213:   /* Collating symbols.  */
  214: # ifdef _LIBC
  215:   int32_t *coll_syms;
  216: # endif
  217: 
  218:   /* Equivalence classes. */
  219: # ifdef _LIBC
  220:   int32_t *equiv_classes;
  221: # endif
  222: 
  223:   /* Range expressions. */
  224: # ifdef _LIBC
  225:   uint32_t *range_starts;
  226:   uint32_t *range_ends;
  227: # else /* not _LIBC */
  228:   wchar_t *range_starts;
  229:   wchar_t *range_ends;
  230: # endif /* not _LIBC */
  231: 
  232:   /* Character classes. */
  233:   wctype_t *char_classes;
  234: 
  235:   /* If this character set is the non-matching list.  */
  236:   unsigned int non_match : 1;
  237: 
  238:   /* # of multibyte characters.  */
  239:   int nmbchars;
  240: 
  241:   /* # of collating symbols.  */
  242:   int ncoll_syms;
  243: 
  244:   /* # of equivalence classes. */
  245:   int nequiv_classes;
  246: 
  247:   /* # of range expressions. */
  248:   int nranges;
  249: 
  250:   /* # of character classes. */
  251:   int nchar_classes;
  252: } re_charset_t;
  253: #endif /* RE_ENABLE_I18N */
  254: 
  255: typedef struct
  256: {
  257:   union
  258:   {
  259:     unsigned char c;		/* for CHARACTER */
  260:     re_bitset_ptr_t sbcset;	/* for SIMPLE_BRACKET */
  261: #ifdef RE_ENABLE_I18N
  262:     re_charset_t *mbcset;	/* for COMPLEX_BRACKET */
  263: #endif /* RE_ENABLE_I18N */
  264:     int idx;			/* for BACK_REF */
  265:     re_context_type ctx_type;	/* for ANCHOR */
  266:   } opr;
  267: #if __GNUC__ >= 2
  268:   re_token_type_t type : 8;
  269: #else
  270:   re_token_type_t type;
  271: #endif
  272:   unsigned int constraint : 10;	/* context constraint */
  273:   unsigned int duplicated : 1;
  274: #ifdef RE_ENABLE_I18N
  275:   unsigned int mb_partial : 1;
  276: #endif
  277: } re_token_t;
  278: 
  279: #define IS_EPSILON_NODE(type) \
  280:   ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \
  281:    || (type) == OP_DUP_QUESTION || (type) == ANCHOR \
  282:    || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP)
  283: 
  284: #define ACCEPT_MB_NODE(type) \
  285:   ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD)
  286: 
  287: struct re_string_t
  288: {
  289:   /* Indicate the raw buffer which is the original string passed as an
  290:      argument of regexec(), re_search(), etc..  */
  291:   const unsigned char *raw_mbs;
  292:   /* Store the multibyte string.  In case of "case insensitive mode" like
  293:      REG_ICASE, upper cases of the string are stored, otherwise MBS points
  294:      the same address that RAW_MBS points.  */
  295:   unsigned char *mbs;
  296:   /* Store the case sensitive multibyte string.  In case of
  297:      "case insensitive mode", the original string are stored,
  298:      otherwise MBS_CASE points the same address that MBS points.  */
  299:   unsigned char *mbs_case;
  300: #ifdef RE_ENABLE_I18N
  301:   /* Store the wide character string which is corresponding to MBS.  */
  302:   wint_t *wcs;
  303:   mbstate_t cur_state;
  304: #endif
  305:   /* Index in RAW_MBS.  Each character mbs[i] corresponds to
  306:      raw_mbs[raw_mbs_idx + i].  */
  307:   int raw_mbs_idx;
  308:   /* The length of the valid characters in the buffers.  */
  309:   int valid_len;
  310:   /* The length of the buffers MBS, MBS_CASE, and WCS.  */
  311:   int bufs_len;
  312:   /* The index in MBS, which is updated by re_string_fetch_byte.  */
  313:   int cur_idx;
  314:   /* This is length_of_RAW_MBS - RAW_MBS_IDX.  */
  315:   int len;
  316:   /* End of the buffer may be shorter than its length in the cases such
  317:      as re_match_2, re_search_2.  Then, we use STOP for end of the buffer
  318:      instead of LEN.  */
  319:   int stop;
  320: 
  321:   /* The context of mbs[0].  We store the context independently, since
  322:      the context of mbs[0] may be different from raw_mbs[0], which is
  323:      the beginning of the input string.  */
  324:   unsigned int tip_context;
  325:   /* The translation passed as a part of an argument of re_compile_pattern.  */
  326:   RE_TRANSLATE_TYPE trans;
  327:   /* 1 if REG_ICASE.  */
  328:   unsigned int icase : 1;
  329: };
  330: typedef struct re_string_t re_string_t;
  331: /* In case of REG_ICASE, we allocate the buffer dynamically for mbs.  */
  332: #define MBS_ALLOCATED(pstr) (pstr->icase)
  333: /* In case that we need translation, we allocate the buffer dynamically
  334:    for mbs_case.  Note that mbs == mbs_case if not REG_ICASE.  */
  335: #define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
  336: 
  337: 
  338: static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
  339: 					 int len, int init_len,
  340: 					 RE_TRANSLATE_TYPE trans, int icase);
  341: static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str,
  342: 					  int len, RE_TRANSLATE_TYPE trans,
  343: 					  int icase);
  344: static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
  345: 					    int eflags, int newline);
  346: static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  347: 						int new_buf_len);
  348: #ifdef RE_ENABLE_I18N
  349: static void build_wcs_buffer (re_string_t *pstr);
  350: static void build_wcs_upper_buffer (re_string_t *pstr);
  351: #endif /* RE_ENABLE_I18N */
  352: static void build_upper_buffer (re_string_t *pstr);
  353: static void re_string_translate_buffer (re_string_t *pstr);
  354: static void re_string_destruct (re_string_t *pstr);
  355: #ifdef RE_ENABLE_I18N
  356: static int re_string_elem_size_at (const re_string_t *pstr, int idx);
  357: static inline int re_string_char_size_at (const re_string_t *pstr, int idx);
  358: static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx);
  359: #endif /* RE_ENABLE_I18N */
  360: static unsigned int re_string_context_at (const re_string_t *input, int idx,
  361: 					  int eflags, int newline_anchor);
  362: #define re_string_peek_byte(pstr, offset) \
  363:   ((pstr)->mbs[(pstr)->cur_idx + offset])
  364: #define re_string_peek_byte_case(pstr, offset) \
  365:   ((pstr)->mbs_case[(pstr)->cur_idx + offset])
  366: #define re_string_fetch_byte(pstr) \
  367:   ((pstr)->mbs[(pstr)->cur_idx++])
  368: #define re_string_fetch_byte_case(pstr) \
  369:   ((pstr)->mbs_case[(pstr)->cur_idx++])
  370: #define re_string_first_byte(pstr, idx) \
  371:   ((idx) == (pstr)->len || (pstr)->wcs[idx] != WEOF)
  372: #define re_string_is_single_byte_char(pstr, idx) \
  373:   ((pstr)->wcs[idx] != WEOF && ((pstr)->len == (idx) \
  374: 				|| (pstr)->wcs[(idx) + 1] != WEOF))
  375: #define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx)
  376: #define re_string_cur_idx(pstr) ((pstr)->cur_idx)
  377: #define re_string_get_buffer(pstr) ((pstr)->mbs)
  378: #define re_string_length(pstr) ((pstr)->len)
  379: #define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx])
  380: #define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx))
  381: #define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx))
  382: 
  383: #define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
  384: #define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
  385: #define re_free(p) free (p)
  386: 
  387: struct bin_tree_t
  388: {
  389:   struct bin_tree_t *parent;
  390:   struct bin_tree_t *left;
  391:   struct bin_tree_t *right;
  392: 
  393:   /* `node_idx' is the index in dfa->nodes, if `type' == 0.
  394:      Otherwise `type' indicate the type of this node.  */
  395:   re_token_type_t type;
  396:   int node_idx;
  397: 
  398:   int first;
  399:   int next;
  400:   re_node_set eclosure;
  401: };
  402: typedef struct bin_tree_t bin_tree_t;
  403: 
  404: 
  405: #define CONTEXT_WORD 1
  406: #define CONTEXT_NEWLINE (CONTEXT_WORD << 1)
  407: #define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1)
  408: #define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1)
  409: 
  410: #define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD)
  411: #define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE)
  412: #define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF)
  413: #define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF)
  414: #define IS_ORDINARY_CONTEXT(c) ((c) == 0)
  415: 
  416: #define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_')
  417: #define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR)
  418: #define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_')
  419: #define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR)
  420: 
  421: #define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \
  422:  ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  423:   || ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  424:   || ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\
  425:   || ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context)))
  426: 
  427: #define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \
  428:  ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \
  429:   || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \
  430:   || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \
  431:   || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context)))
  432: 
  433: struct re_dfastate_t
  434: {
  435:   unsigned int hash;
  436:   re_node_set nodes;
  437:   re_node_set *entrance_nodes;
  438:   struct re_dfastate_t **trtable;
  439:   struct re_dfastate_t **trtable_search;
  440:   /* If this state is a special state.
  441:      A state is a special state if the state is the halt state, or
  442:      a anchor.  */
  443:   unsigned int context : 2;
  444:   unsigned int halt : 1;
  445:   /* If this state can accept `multi byte'.
  446:      Note that we refer to multibyte characters, and multi character
  447:      collating elements as `multi byte'.  */
  448:   unsigned int accept_mb : 1;
  449:   /* If this state has backreference node(s).  */
  450:   unsigned int has_backref : 1;
  451:   unsigned int has_constraint : 1;
  452: };
  453: typedef struct re_dfastate_t re_dfastate_t;
  454: 
  455: typedef struct
  456: {
  457:   /* start <= node < end  */
  458:   int start;
  459:   int end;
  460: } re_subexp_t;
  461: 
  462: struct re_state_table_entry
  463: {
  464:   int num;
  465:   int alloc;
  466:   re_dfastate_t **array;
  467: };
  468: 
  469: /* Array type used in re_sub_match_last_t and re_sub_match_top_t.  */
  470: 
  471: typedef struct
  472: {
  473:   int next_idx;
  474:   int alloc;
  475:   re_dfastate_t **array;
  476: } state_array_t;
  477: 
  478: /* Store information about the node NODE whose type is OP_CLOSE_SUBEXP.  */
  479: 
  480: typedef struct
  481: {
  482:   int node;
  483:   int str_idx; /* The position NODE match at.  */
  484:   state_array_t path;
  485: } re_sub_match_last_t;
  486: 
  487: /* Store information about the node NODE whose type is OP_OPEN_SUBEXP.
  488:    And information about the node, whose type is OP_CLOSE_SUBEXP,
  489:    corresponding to NODE is stored in LASTS.  */
  490: 
  491: typedef struct
  492: {
  493:   int str_idx;
  494:   int node;
  495:   int next_last_offset;
  496:   state_array_t *path;
  497:   int alasts; /* Allocation size of LASTS.  */
  498:   int nlasts; /* The number of LASTS.  */
  499:   re_sub_match_last_t **lasts;
  500: } re_sub_match_top_t;
  501: 
  502: struct re_backref_cache_entry
  503: {
  504:   int node;
  505:   int str_idx;
  506:   int subexp_from;
  507:   int subexp_to;
  508:   int flag;
  509: };
  510: 
  511: typedef struct
  512: {
  513:   /* EFLAGS of the argument of regexec.  */
  514:   int eflags;
  515:   /* Where the matching ends.  */
  516:   int match_last;
  517:   int last_node;
  518:   /* The string object corresponding to the input string.  */
  519:   re_string_t *input;
  520:   /* The state log used by the matcher.  */
  521:   re_dfastate_t **state_log;
  522:   int state_log_top;
  523:   /* Back reference cache.  */
  524:   int nbkref_ents;
  525:   int abkref_ents;
  526:   struct re_backref_cache_entry *bkref_ents;
  527:   int max_mb_elem_len;
  528:   int nsub_tops;
  529:   int asub_tops;
  530:   re_sub_match_top_t **sub_tops;
  531: } re_match_context_t;
  532: 
  533: typedef struct
  534: {
  535:   int cur_bkref;
  536:   int cls_subexp_idx;
  537: 
  538:   re_dfastate_t **sifted_states;
  539:   re_dfastate_t **limited_states;
  540: 
  541:   re_node_set limits;
  542: 
  543:   int last_node;
  544:   int last_str_idx;
  545:   int check_subexp;
  546: } re_sift_context_t;
  547: 
  548: struct re_fail_stack_ent_t
  549: {
  550:   int idx;
  551:   int node;
  552:   regmatch_t *regs;
  553:   re_node_set eps_via_nodes;
  554: };
  555: 
  556: struct re_fail_stack_t
  557: {
  558:   int num;
  559:   int alloc;
  560:   struct re_fail_stack_ent_t *stack;
  561: };
  562: 
  563: struct re_dfa_t
  564: {
  565:   re_bitset_ptr_t word_char;
  566: 
  567:   /* number of subexpressions `re_nsub' is in regex_t.  */
  568:   int subexps_alloc;
  569:   re_subexp_t *subexps;
  570: 
  571:   re_token_t *nodes;
  572:   int nodes_alloc;
  573:   int nodes_len;
  574:   bin_tree_t *str_tree;
  575:   int *nexts;
  576:   int *org_indices;
  577:   re_node_set *edests;
  578:   re_node_set *eclosures;
  579:   re_node_set *inveclosures;
  580:   struct re_state_table_entry *state_table;
  581:   unsigned int state_hash_mask;
  582:   re_dfastate_t *init_state;
  583:   re_dfastate_t *init_state_word;
  584:   re_dfastate_t *init_state_nl;
  585:   re_dfastate_t *init_state_begbuf;
  586:   int states_alloc;
  587:   int init_node;
  588:   int nbackref; /* The number of backreference in this dfa.  */
  589:   /* Bitmap expressing which backreference is used.  */
  590:   unsigned int used_bkref_map;
  591: #ifdef DEBUG
  592:   char* re_str;
  593: #endif
  594:   unsigned int has_plural_match : 1;
  595:   /* If this dfa has "multibyte node", which is a backreference or
  596:      a node which can accept multibyte character or multi character
  597:      collating element.  */
  598:   unsigned int has_mb_node : 1;
  599: };
  600: typedef struct re_dfa_t re_dfa_t;
  601: 
  602: static reg_errcode_t re_node_set_alloc (re_node_set *set, int size);
  603: static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem);
  604: static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1,
  605: 					 int elem2);
  606: #define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
  607: static reg_errcode_t re_node_set_init_copy (re_node_set *dest,
  608: 					    const re_node_set *src);
  609: static reg_errcode_t re_node_set_add_intersect (re_node_set *dest,
  610: 						const re_node_set *src1,
  611: 						const re_node_set *src2);
  612: static reg_errcode_t re_node_set_init_union (re_node_set *dest,
  613: 					     const re_node_set *src1,
  614: 					     const re_node_set *src2);
  615: static reg_errcode_t re_node_set_merge (re_node_set *dest,
  616: 					const re_node_set *src);
  617: static int re_node_set_insert (re_node_set *set, int elem);
  618: static int re_node_set_compare (const re_node_set *set1,
  619: 				const re_node_set *set2);
  620: static int re_node_set_contains (const re_node_set *set, int elem);
  621: static void re_node_set_remove_at (re_node_set *set, int idx);
  622: #define re_node_set_remove(set,id) \
  623:   (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1))
  624: #define re_node_set_empty(p) ((p)->nelem = 0)
  625: #define re_node_set_free(set) re_free ((set)->elems)
  626: static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token, int mode);
  627: static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa,
  628: 					const re_node_set *nodes);
  629: static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err,
  630: 						re_dfa_t *dfa,
  631: 						const re_node_set *nodes,
  632: 						unsigned int context);
  633: static void free_state (re_dfastate_t *state);
  634: 
  635: 
  636: typedef enum
  637: {
  638:   SB_CHAR,
  639:   MB_CHAR,
  640:   EQUIV_CLASS,
  641:   COLL_SYM,
  642:   CHAR_CLASS
  643: } bracket_elem_type;
  644: 
  645: typedef struct
  646: {
  647:   bracket_elem_type type;
  648:   union
  649:   {
  650:     unsigned char ch;
  651:     unsigned char *name;
  652:     wchar_t wch;
  653:   } opr;
  654: } bracket_elem_t;
  655: 
  656: 
  657: /* Inline functions for bitset operation.  */
  658: static inline void
  659: bitset_not (set)
  660:      bitset set;
  661: {
  662:   int bitset_i;
  663:   for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
  664:     set[bitset_i] = ~set[bitset_i];
  665: }
  666: 
  667: static inline void
  668: bitset_merge (dest, src)
  669:      bitset dest;
  670:      const bitset src;
  671: {
  672:   int bitset_i;
  673:   for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i)
  674:     dest[bitset_i] |= src[bitset_i];
  675: }
  676: 
  677: static inline void
  678: bitset_not_merge (dest, src)
  679:      bitset dest;
  680:      const bitset src;
  681: {
  682:   int i;
  683:   for (i = 0; i < BITSET_UINTS; ++i)
  684:     dest[i] |= ~src[i];
  685: }
  686: 
  687: #ifdef RE_ENABLE_I18N
  688: /* Inline functions for re_string.  */
  689: static inline int
  690: re_string_char_size_at (pstr, idx)
  691:      const re_string_t *pstr;
  692:      int idx;
  693: {
  694:   int byte_idx;
  695:   if (MB_CUR_MAX == 1)
  696:     return 1;
  697:   for (byte_idx = 1; idx + byte_idx < pstr->len; ++byte_idx)
  698:     if (pstr->wcs[idx + byte_idx] != WEOF)
  699:       break;
  700:   return byte_idx;
  701: }
  702: 
  703: static inline wint_t
  704: re_string_wchar_at (pstr, idx)
  705:      const re_string_t *pstr;
  706:      int idx;
  707: {
  708:   if (MB_CUR_MAX == 1)
  709:     return (wint_t) pstr->mbs[idx];
  710:   return (wint_t) pstr->wcs[idx];
  711: }
  712: 
  713: static int
  714: re_string_elem_size_at (pstr, idx)
  715:      const re_string_t *pstr;
  716:      int idx;
  717: {
  718: #ifdef _LIBC
  719:   const unsigned char *p, *extra;
  720:   const int32_t *table, *indirect;
  721:   int32_t tmp;
  722: # include <locale/weight.h>
  723:   uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
  724: 
  725:   if (nrules != 0)
  726:     {
  727:       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
  728:       extra = (const unsigned char *)
  729: 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
  730:       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
  731: 						_NL_COLLATE_INDIRECTMB);
  732:       p = pstr->mbs + idx;
  733:       tmp = findidx (&p);
  734:       return p - pstr->mbs - idx;
  735:     }
  736:   else
  737: #endif /* _LIBC */
  738:     return 1;
  739: }
  740: #endif /* RE_ENABLE_I18N */
  741: 
  742: #endif /*  _REGEX_INTERNAL_H */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>