Diff for /embedaddon/pcre/pcre_compile.c between versions 1.1.1.2 and 1.1.1.3

version 1.1.1.2, 2012/02/21 23:50:25 version 1.1.1.3, 2012/10/09 09:19:17
Line 489  static const char error_texts[] = Line 489  static const char error_texts[] =
   "too many forward references\0"    "too many forward references\0"
   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"    "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
   "invalid UTF-16 string\0"    "invalid UTF-16 string\0"
     /* 75 */
     "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
     "character value in \\u.... sequence is too large\0"
   ;    ;
   
 /* Table to identify digits and hex digits. This is used when compiling  /* Table to identify digits and hex digits. This is used when compiling
Line 829  else Line 832  else
           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));            c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif  #endif
           }            }
   
   #ifdef COMPILE_PCRE8
           if (c > (utf ? 0x10ffff : 0xff))
   #else
   #ifdef COMPILE_PCRE16
           if (c > (utf ? 0x10ffff : 0xffff))
   #endif
   #endif
             {
             *errorcodeptr = ERR76;
             }
           else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
         }          }
       }        }
     else      else
Line 2225  for (;;) Line 2240  for (;;)
       {        {
       case OP_CHAR:        case OP_CHAR:
       case OP_CHARI:        case OP_CHARI:
         case OP_NOT:
         case OP_NOTI:
       case OP_EXACT:        case OP_EXACT:
       case OP_EXACTI:        case OP_EXACTI:
         case OP_NOTEXACT:
         case OP_NOTEXACTI:
       case OP_UPTO:        case OP_UPTO:
       case OP_UPTOI:        case OP_UPTOI:
         case OP_NOTUPTO:
         case OP_NOTUPTOI:
       case OP_MINUPTO:        case OP_MINUPTO:
       case OP_MINUPTOI:        case OP_MINUPTOI:
         case OP_NOTMINUPTO:
         case OP_NOTMINUPTOI:
       case OP_POSUPTO:        case OP_POSUPTO:
       case OP_POSUPTOI:        case OP_POSUPTOI:
         case OP_NOTPOSUPTO:
         case OP_NOTPOSUPTOI:
       case OP_STAR:        case OP_STAR:
       case OP_STARI:        case OP_STARI:
         case OP_NOTSTAR:
         case OP_NOTSTARI:
       case OP_MINSTAR:        case OP_MINSTAR:
       case OP_MINSTARI:        case OP_MINSTARI:
         case OP_NOTMINSTAR:
         case OP_NOTMINSTARI:
       case OP_POSSTAR:        case OP_POSSTAR:
       case OP_POSSTARI:        case OP_POSSTARI:
         case OP_NOTPOSSTAR:
         case OP_NOTPOSSTARI:
       case OP_PLUS:        case OP_PLUS:
       case OP_PLUSI:        case OP_PLUSI:
         case OP_NOTPLUS:
         case OP_NOTPLUSI:
       case OP_MINPLUS:        case OP_MINPLUS:
       case OP_MINPLUSI:        case OP_MINPLUSI:
         case OP_NOTMINPLUS:
         case OP_NOTMINPLUSI:
       case OP_POSPLUS:        case OP_POSPLUS:
       case OP_POSPLUSI:        case OP_POSPLUSI:
         case OP_NOTPOSPLUS:
         case OP_NOTPOSPLUSI:
       case OP_QUERY:        case OP_QUERY:
       case OP_QUERYI:        case OP_QUERYI:
         case OP_NOTQUERY:
         case OP_NOTQUERYI:
       case OP_MINQUERY:        case OP_MINQUERY:
       case OP_MINQUERYI:        case OP_MINQUERYI:
         case OP_NOTMINQUERY:
         case OP_NOTMINQUERYI:
       case OP_POSQUERY:        case OP_POSQUERY:
       case OP_POSQUERYI:        case OP_POSQUERYI:
         case OP_NOTPOSQUERY:
         case OP_NOTPOSQUERYI:
       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);        if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;        break;
       }        }
Line 3069  if (next >= 0) switch(op_code) Line 3112  if (next >= 0) switch(op_code)
 #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */    return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These  
   opcodes are not used for multi-byte characters, because they are coded using  
   an XCLASS instead. */  
   
   case OP_NOT:    case OP_NOT:
  return (c = *previous) == next;  #ifdef SUPPORT_UTF
   GETCHARTEST(c, previous);
 #else
   c = *previous;
 #endif
   return c == next;
   
   case OP_NOTI:    case OP_NOTI:
   if ((c = *previous) == next) return TRUE;  
 #ifdef SUPPORT_UTF  #ifdef SUPPORT_UTF
     GETCHARTEST(c, previous);
   #else
     c = *previous;
   #endif
     if (c == next) return TRUE;
   #ifdef SUPPORT_UTF
   if (utf)    if (utf)
     {      {
     unsigned int othercase;      unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else      if (next < 128) othercase = cd->fcc[next]; else
 #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
    othercase = UCD_OTHERCASE(next);    othercase = UCD_OTHERCASE((unsigned int)next);
 #else  #else
     othercase = NOTACHAR;      othercase = NOTACHAR;
 #endif  #endif
Line 3092  if (next >= 0) switch(op_code) Line 3141  if (next >= 0) switch(op_code)
     }      }
   else    else
 #endif  /* SUPPORT_UTF */  #endif  /* SUPPORT_UTF */
  return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next)));  /* Non-UTF-8 mode */  return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
   
   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.    /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
   
   case OP_DIGIT:    case OP_DIGIT:
  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;  return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
   
   case OP_NOT_DIGIT:    case OP_NOT_DIGIT:
  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;  return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
   
   case OP_WHITESPACE:    case OP_WHITESPACE:
  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;  return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
   
   case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;  return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
   
   case OP_WORDCHAR:    case OP_WORDCHAR:
  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;  return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
   
   case OP_NOT_WORDCHAR:    case OP_NOT_WORDCHAR:
  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;  return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
   
   case OP_HSPACE:    case OP_HSPACE:
   case OP_NOT_HSPACE:    case OP_NOT_HSPACE:
Line 3191  switch(op_code) Line 3240  switch(op_code)
   switch(-next)    switch(-next)
     {      {
     case ESC_d:      case ESC_d:
    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;    return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
   
     case ESC_D:      case ESC_D:
    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;    return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
   
     case ESC_s:      case ESC_s:
    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;    return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
   
     case ESC_S:      case ESC_S:
    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;    return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
   
     case ESC_w:      case ESC_w:
    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;    return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
   
     case ESC_W:      case ESC_W:
    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;    return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
   
     case ESC_h:      case ESC_h:
     case ESC_H:      case ESC_H:
Line 3315  switch(op_code) Line 3364  switch(op_code)
   return next == -ESC_d;    return next == -ESC_d;
   
   case OP_WHITESPACE:    case OP_WHITESPACE:
  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
   
   case OP_NOT_WHITESPACE:    case OP_NOT_WHITESPACE:
  return next == -ESC_s || next == -ESC_h || next == -ESC_v;  return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
   
   case OP_HSPACE:    case OP_HSPACE:
   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||    return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
Line 4482  for (;; ptr++) Line 4531  for (;; ptr++)
       LONE_SINGLE_CHARACTER:        LONE_SINGLE_CHARACTER:
   
       /* Only the value of 1 matters for class_single_char. */        /* Only the value of 1 matters for class_single_char. */
   
       if (class_single_char < 2) class_single_char++;        if (class_single_char < 2) class_single_char++;
   
       /* If class_charcount is 1, we saw precisely one character. As long as        /* If class_charcount is 1, we saw precisely one character. As long as
      there were no negated characters >= 128 and there was no use of \p or \P,      there was no use of \p or \P, in other words, no use of any XCLASS
      in other words, no use of any XCLASS features, we can optimize.      features, we can optimize.
   
       In UTF-8 mode, we can optimize the negative case only if there were no  
       characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR  
       operate on single-bytes characters only. This is an historical hangover.  
       Maybe one day we can tidy these opcodes to handle multi-byte characters.  
   
       The optimization throws away the bit map. We turn the item into a        The optimization throws away the bit map. We turn the item into a
       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.        1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
      Note that OP_NOT[I] does not support multibyte characters. In the positive      In the positive case, it can cause firstchar to be set. Otherwise, there
      case, it can cause firstchar to be set. Otherwise, there can be no first      can be no first char if this item is first, whatever repeat count may
      char if this item is first, whatever repeat count may follow. In the case      follow. In the case of reqchar, save the previous value for reinstating. */
      of reqchar, save the previous value for reinstating. */ 
   
 #ifdef SUPPORT_UTF  
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET  
         && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))  
 #else  
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)        if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
 #endif  
         {          {
         ptr++;          ptr++;
         zeroreqchar = reqchar;          zeroreqchar = reqchar;
   
         /* The OP_NOT[I] opcodes work on single characters only. */  
   
         if (negate_class)          if (negate_class)
           {            {
           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
           zerofirstchar = firstchar;            zerofirstchar = firstchar;
           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
          *code++ = c;  #ifdef SUPPORT_UTF
           if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
             code += PRIV(ord2utf)(c, code);
           else
 #endif
             *code++ = c;
           goto NOT_CHAR;            goto NOT_CHAR;
           }            }
   
Line 4775  for (;; ptr++) Line 4817  for (;; ptr++)
   
     /* Now handle repetition for the different types of item. */      /* Now handle repetition for the different types of item. */
   
    /* If previous was a character match, abolish the item and generate a    /* If previous was a character or negated character match, abolish the item
    repeat item instead. If a char item has a minumum of more than one, ensure    and generate a repeat item instead. If a char item has a minimum of more
    that it is set in reqchar - it might not be if a sequence such as x{3} is    than one, ensure that it is set in reqchar - it might not be if a sequence
    the first thing in a branch because the x will have gone into firstchar    such as x{3} is the first thing in a branch because the x will have gone
    instead.  */    into firstchar instead.  */
   
    if (*previous == OP_CHAR || *previous == OP_CHARI)    if (*previous == OP_CHAR || *previous == OP_CHARI
         || *previous == OP_NOT || *previous == OP_NOTI)
       {        {
      op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;      switch (*previous)
         {
         default: /* Make compiler happy. */
         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
         }
   
       /* Deal with UTF characters that take up more than one character. It's        /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to        easier to write this out separately than try to macrify it. Use c to
Line 4806  for (;; ptr++) Line 4856  for (;; ptr++)
       with UTF disabled, or for a single character UTF character. */        with UTF disabled, or for a single character UTF character. */
         {          {
         c = code[-1];          c = code[-1];
        if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;        if (*previous <= OP_CHARI && repeat_min > 1)
           reqchar = c | req_caseopt | cd->req_varyopt;
         }          }
   
       /* If the repetition is unlimited, it pays to see if the next thing on        /* If the repetition is unlimited, it pays to see if the next thing on
Line 4825  for (;; ptr++) Line 4876  for (;; ptr++)
       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */        goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
       }        }
   
     /* If previous was a single negated character ([^a] or similar), we use  
     one of the special opcodes, replacing it. The code is shared with single-  
     character repeats by setting opt_type to add a suitable offset into  
     repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI  
     are currently used only for single-byte chars. */  
   
     else if (*previous == OP_NOT || *previous == OP_NOTI)  
       {  
       op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;  
       c = previous[1];  
       if (!possessive_quantifier &&  
           repeat_max < 0 &&  
           check_auto_possessive(previous, utf, ptr + 1, options, cd))  
         {  
         repeat_type = 0;    /* Force greedy */  
         possessive_quantifier = TRUE;  
         }  
       goto OUTPUT_SINGLE_REPEAT;  
       }  
   
     /* If previous was a character type match (\d or similar), abolish it and      /* If previous was a character type match (\d or similar), abolish it and
     create a suitable repeat item. The code is shared with single-character      create a suitable repeat item. The code is shared with single-character
     repeats by setting op_type to add a suitable offset into repeat_type. Note      repeats by setting op_type to add a suitable offset into repeat_type. Note
Line 5585  for (;; ptr++) Line 5616  for (;; ptr++)
         arg = ++ptr;          arg = ++ptr;
         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;          while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
         arglen = (int)(ptr - arg);          arglen = (int)(ptr - arg);
           if (arglen > (int)MAX_MARK)
             {
             *errorcodeptr = ERR75;
             goto FAILED;
             }
         }          }
   
       if (*ptr != CHAR_RIGHT_PARENTHESIS)        if (*ptr != CHAR_RIGHT_PARENTHESIS)
Line 6836  for (;; ptr++) Line 6872  for (;; ptr++)
       /* For the rest (including \X when Unicode properties are supported), we        /* For the rest (including \X when Unicode properties are supported), we
       can obtain the OP value by negating the escape value in the default        can obtain the OP value by negating the escape value in the default
       situation when PCRE_UCP is not set. When it *is* set, we substitute        situation when PCRE_UCP is not set. When it *is* set, we substitute
      Unicode property tests. */      Unicode property tests. Note that \b and \B do a one-character
       lookbehind. */
   
       else        else
         {          {
           if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
             cd->max_lookbehind = 1;
 #ifdef SUPPORT_UCP  #ifdef SUPPORT_UCP
         if (-c >= ESC_DU && -c <= ESC_wu)          if (-c >= ESC_DU && -c <= ESC_wu)
           {            {
Line 7147  for (;;) Line 7186  for (;;)
         *ptrptr = ptr;          *ptrptr = ptr;
         return FALSE;          return FALSE;
         }          }
      else { PUT(reverse_count, 0, fixed_length); }      else
         {
         if (fixed_length > cd->max_lookbehind)
           cd->max_lookbehind = fixed_length;
         PUT(reverse_count, 0, fixed_length);
         }
       }        }
     }      }
   
Line 7817  cd->start_pattern = (const pcre_uchar *)pattern; Line 7861  cd->start_pattern = (const pcre_uchar *)pattern;
 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));  cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
 cd->req_varyopt = 0;  cd->req_varyopt = 0;
 cd->assert_depth = 0;  cd->assert_depth = 0;
   cd->max_lookbehind = 0;
 cd->external_options = options;  cd->external_options = options;
 cd->external_flags = 0;  cd->external_flags = 0;
 cd->open_caps = NULL;  cd->open_caps = NULL;
Line 7867  re->magic_number = MAGIC_NUMBER; Line 7912  re->magic_number = MAGIC_NUMBER;
 re->size = (int)size;  re->size = (int)size;
 re->options = cd->external_options;  re->options = cd->external_options;
 re->flags = cd->external_flags;  re->flags = cd->external_flags;
 re->dummy1 = 0;  
 re->first_char = 0;  re->first_char = 0;
 re->req_char = 0;  re->req_char = 0;
 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);  re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
Line 7887  field; this time it's used for remembering forward ref Line 7931  field; this time it's used for remembering forward ref
 cd->final_bracount = cd->bracount;  /* Save for checking forward references */  cd->final_bracount = cd->bracount;  /* Save for checking forward references */
 cd->assert_depth = 0;  cd->assert_depth = 0;
 cd->bracount = 0;  cd->bracount = 0;
   cd->max_lookbehind = 0;
 cd->names_found = 0;  cd->names_found = 0;
 cd->name_table = (pcre_uchar *)re + re->name_table_offset;  cd->name_table = (pcre_uchar *)re + re->name_table_offset;
 codestart = cd->name_table + re->name_entry_size * re->name_count;  codestart = cd->name_table + re->name_entry_size * re->name_count;
Line 7908  code = (pcre_uchar *)codestart; Line 7953  code = (pcre_uchar *)codestart;
   &firstchar, &reqchar, NULL, cd, NULL);    &firstchar, &reqchar, NULL, cd, NULL);
 re->top_bracket = cd->bracount;  re->top_bracket = cd->bracount;
 re->top_backref = cd->top_backref;  re->top_backref = cd->top_backref;
   re->max_lookbehind = cd->max_lookbehind;
 re->flags = cd->external_flags | PCRE_MODE;  re->flags = cd->external_flags | PCRE_MODE;
   
 if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */  if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
Line 7995  if (cd->check_lookbehind) Line 8041  if (cd->check_lookbehind)
                     (fixed_length == -4)? ERR70 : ERR25;                      (fixed_length == -4)? ERR70 : ERR25;
         break;          break;
         }          }
         if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
       PUT(cc, 1, fixed_length);        PUT(cc, 1, fixed_length);
       }        }
     cc += 1 + LINK_SIZE;      cc += 1 + LINK_SIZE;

Removed from v.1.1.1.2  
changed lines
  Added in v.1.1.1.3


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>