Annotation of embedaddon/pcre/pcrecpp.cc, revision 1.1
1.1 ! misho 1: // Copyright (c) 2010, Google Inc.
! 2: // All rights reserved.
! 3: //
! 4: // Redistribution and use in source and binary forms, with or without
! 5: // modification, are permitted provided that the following conditions are
! 6: // met:
! 7: //
! 8: // * Redistributions of source code must retain the above copyright
! 9: // notice, this list of conditions and the following disclaimer.
! 10: // * Redistributions in binary form must reproduce the above
! 11: // copyright notice, this list of conditions and the following disclaimer
! 12: // in the documentation and/or other materials provided with the
! 13: // distribution.
! 14: // * Neither the name of Google Inc. nor the names of its
! 15: // contributors may be used to endorse or promote products derived from
! 16: // this software without specific prior written permission.
! 17: //
! 18: // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! 19: // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! 20: // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! 21: // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! 22: // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! 23: // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
! 24: // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
! 25: // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! 26: // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! 27: // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
! 28: // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! 29: //
! 30: // Author: Sanjay Ghemawat
! 31:
! 32: #ifdef HAVE_CONFIG_H
! 33: #include "config.h"
! 34: #endif
! 35:
! 36: #include <stdlib.h>
! 37: #include <stdio.h>
! 38: #include <ctype.h>
! 39: #include <limits.h> /* for SHRT_MIN, USHRT_MAX, etc */
! 40: #include <string.h> /* for memcpy */
! 41: #include <assert.h>
! 42: #include <errno.h>
! 43: #include <string>
! 44: #include <algorithm>
! 45:
! 46: #include "pcrecpp_internal.h"
! 47: #include "pcre.h"
! 48: #include "pcrecpp.h"
! 49: #include "pcre_stringpiece.h"
! 50:
! 51:
! 52: namespace pcrecpp {
! 53:
! 54: // Maximum number of args we can set
! 55: static const int kMaxArgs = 16;
! 56: static const int kVecSize = (1 + kMaxArgs) * 3; // results + PCRE workspace
! 57:
! 58: // Special object that stands-in for no argument
! 59: Arg RE::no_arg((void*)NULL);
! 60:
! 61: // This is for ABI compatibility with old versions of pcre (pre-7.6),
! 62: // which defined a global no_arg variable instead of putting it in the
! 63: // RE class. This works on GCC >= 3, at least. It definitely works
! 64: // for ELF, but may not for other object formats (Mach-O, for
! 65: // instance, does not support aliases.) We could probably have a more
! 66: // inclusive test if we ever needed it. (Note that not only the
! 67: // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
! 68: // gnu-specific.)
! 69: #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
! 70: # define ULP_AS_STRING(x) ULP_AS_STRING_INTERNAL(x)
! 71: # define ULP_AS_STRING_INTERNAL(x) #x
! 72: # define USER_LABEL_PREFIX_STR ULP_AS_STRING(__USER_LABEL_PREFIX__)
! 73: extern Arg no_arg
! 74: __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
! 75: #endif
! 76:
! 77: // If a regular expression has no error, its error_ field points here
! 78: static const string empty_string;
! 79:
! 80: // If the user doesn't ask for any options, we just use this one
! 81: static RE_Options default_options;
! 82:
! 83: void RE::Init(const string& pat, const RE_Options* options) {
! 84: pattern_ = pat;
! 85: if (options == NULL) {
! 86: options_ = default_options;
! 87: } else {
! 88: options_ = *options;
! 89: }
! 90: error_ = &empty_string;
! 91: re_full_ = NULL;
! 92: re_partial_ = NULL;
! 93:
! 94: re_partial_ = Compile(UNANCHORED);
! 95: if (re_partial_ != NULL) {
! 96: re_full_ = Compile(ANCHOR_BOTH);
! 97: }
! 98: }
! 99:
! 100: void RE::Cleanup() {
! 101: if (re_full_ != NULL) (*pcre_free)(re_full_);
! 102: if (re_partial_ != NULL) (*pcre_free)(re_partial_);
! 103: if (error_ != &empty_string) delete error_;
! 104: }
! 105:
! 106:
! 107: RE::~RE() {
! 108: Cleanup();
! 109: }
! 110:
! 111:
! 112: pcre* RE::Compile(Anchor anchor) {
! 113: // First, convert RE_Options into pcre options
! 114: int pcre_options = 0;
! 115: pcre_options = options_.all_options();
! 116:
! 117: // Special treatment for anchoring. This is needed because at
! 118: // runtime pcre only provides an option for anchoring at the
! 119: // beginning of a string (unless you use offset).
! 120: //
! 121: // There are three types of anchoring we want:
! 122: // UNANCHORED Compile the original pattern, and use
! 123: // a pcre unanchored match.
! 124: // ANCHOR_START Compile the original pattern, and use
! 125: // a pcre anchored match.
! 126: // ANCHOR_BOTH Tack a "\z" to the end of the original pattern
! 127: // and use a pcre anchored match.
! 128:
! 129: const char* compile_error;
! 130: int eoffset;
! 131: pcre* re;
! 132: if (anchor != ANCHOR_BOTH) {
! 133: re = pcre_compile(pattern_.c_str(), pcre_options,
! 134: &compile_error, &eoffset, NULL);
! 135: } else {
! 136: // Tack a '\z' at the end of RE. Parenthesize it first so that
! 137: // the '\z' applies to all top-level alternatives in the regexp.
! 138: string wrapped = "(?:"; // A non-counting grouping operator
! 139: wrapped += pattern_;
! 140: wrapped += ")\\z";
! 141: re = pcre_compile(wrapped.c_str(), pcre_options,
! 142: &compile_error, &eoffset, NULL);
! 143: }
! 144: if (re == NULL) {
! 145: if (error_ == &empty_string) error_ = new string(compile_error);
! 146: }
! 147: return re;
! 148: }
! 149:
! 150: /***** Matching interfaces *****/
! 151:
! 152: bool RE::FullMatch(const StringPiece& text,
! 153: const Arg& ptr1,
! 154: const Arg& ptr2,
! 155: const Arg& ptr3,
! 156: const Arg& ptr4,
! 157: const Arg& ptr5,
! 158: const Arg& ptr6,
! 159: const Arg& ptr7,
! 160: const Arg& ptr8,
! 161: const Arg& ptr9,
! 162: const Arg& ptr10,
! 163: const Arg& ptr11,
! 164: const Arg& ptr12,
! 165: const Arg& ptr13,
! 166: const Arg& ptr14,
! 167: const Arg& ptr15,
! 168: const Arg& ptr16) const {
! 169: const Arg* args[kMaxArgs];
! 170: int n = 0;
! 171: if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
! 172: if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
! 173: if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
! 174: if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
! 175: if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
! 176: if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
! 177: if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
! 178: if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
! 179: if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
! 180: if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
! 181: if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
! 182: if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
! 183: if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
! 184: if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
! 185: if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
! 186: if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
! 187: done:
! 188:
! 189: int consumed;
! 190: int vec[kVecSize];
! 191: return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
! 192: }
! 193:
! 194: bool RE::PartialMatch(const StringPiece& text,
! 195: const Arg& ptr1,
! 196: const Arg& ptr2,
! 197: const Arg& ptr3,
! 198: const Arg& ptr4,
! 199: const Arg& ptr5,
! 200: const Arg& ptr6,
! 201: const Arg& ptr7,
! 202: const Arg& ptr8,
! 203: const Arg& ptr9,
! 204: const Arg& ptr10,
! 205: const Arg& ptr11,
! 206: const Arg& ptr12,
! 207: const Arg& ptr13,
! 208: const Arg& ptr14,
! 209: const Arg& ptr15,
! 210: const Arg& ptr16) const {
! 211: const Arg* args[kMaxArgs];
! 212: int n = 0;
! 213: if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
! 214: if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
! 215: if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
! 216: if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
! 217: if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
! 218: if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
! 219: if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
! 220: if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
! 221: if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
! 222: if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
! 223: if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
! 224: if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
! 225: if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
! 226: if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
! 227: if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
! 228: if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
! 229: done:
! 230:
! 231: int consumed;
! 232: int vec[kVecSize];
! 233: return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
! 234: }
! 235:
! 236: bool RE::Consume(StringPiece* input,
! 237: const Arg& ptr1,
! 238: const Arg& ptr2,
! 239: const Arg& ptr3,
! 240: const Arg& ptr4,
! 241: const Arg& ptr5,
! 242: const Arg& ptr6,
! 243: const Arg& ptr7,
! 244: const Arg& ptr8,
! 245: const Arg& ptr9,
! 246: const Arg& ptr10,
! 247: const Arg& ptr11,
! 248: const Arg& ptr12,
! 249: const Arg& ptr13,
! 250: const Arg& ptr14,
! 251: const Arg& ptr15,
! 252: const Arg& ptr16) const {
! 253: const Arg* args[kMaxArgs];
! 254: int n = 0;
! 255: if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
! 256: if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
! 257: if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
! 258: if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
! 259: if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
! 260: if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
! 261: if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
! 262: if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
! 263: if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
! 264: if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
! 265: if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
! 266: if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
! 267: if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
! 268: if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
! 269: if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
! 270: if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
! 271: done:
! 272:
! 273: int consumed;
! 274: int vec[kVecSize];
! 275: if (DoMatchImpl(*input, ANCHOR_START, &consumed,
! 276: args, n, vec, kVecSize)) {
! 277: input->remove_prefix(consumed);
! 278: return true;
! 279: } else {
! 280: return false;
! 281: }
! 282: }
! 283:
! 284: bool RE::FindAndConsume(StringPiece* input,
! 285: const Arg& ptr1,
! 286: const Arg& ptr2,
! 287: const Arg& ptr3,
! 288: const Arg& ptr4,
! 289: const Arg& ptr5,
! 290: const Arg& ptr6,
! 291: const Arg& ptr7,
! 292: const Arg& ptr8,
! 293: const Arg& ptr9,
! 294: const Arg& ptr10,
! 295: const Arg& ptr11,
! 296: const Arg& ptr12,
! 297: const Arg& ptr13,
! 298: const Arg& ptr14,
! 299: const Arg& ptr15,
! 300: const Arg& ptr16) const {
! 301: const Arg* args[kMaxArgs];
! 302: int n = 0;
! 303: if (&ptr1 == &no_arg) goto done; args[n++] = &ptr1;
! 304: if (&ptr2 == &no_arg) goto done; args[n++] = &ptr2;
! 305: if (&ptr3 == &no_arg) goto done; args[n++] = &ptr3;
! 306: if (&ptr4 == &no_arg) goto done; args[n++] = &ptr4;
! 307: if (&ptr5 == &no_arg) goto done; args[n++] = &ptr5;
! 308: if (&ptr6 == &no_arg) goto done; args[n++] = &ptr6;
! 309: if (&ptr7 == &no_arg) goto done; args[n++] = &ptr7;
! 310: if (&ptr8 == &no_arg) goto done; args[n++] = &ptr8;
! 311: if (&ptr9 == &no_arg) goto done; args[n++] = &ptr9;
! 312: if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
! 313: if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
! 314: if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
! 315: if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
! 316: if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
! 317: if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
! 318: if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
! 319: done:
! 320:
! 321: int consumed;
! 322: int vec[kVecSize];
! 323: if (DoMatchImpl(*input, UNANCHORED, &consumed,
! 324: args, n, vec, kVecSize)) {
! 325: input->remove_prefix(consumed);
! 326: return true;
! 327: } else {
! 328: return false;
! 329: }
! 330: }
! 331:
! 332: bool RE::Replace(const StringPiece& rewrite,
! 333: string *str) const {
! 334: int vec[kVecSize];
! 335: int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
! 336: if (matches == 0)
! 337: return false;
! 338:
! 339: string s;
! 340: if (!Rewrite(&s, rewrite, *str, vec, matches))
! 341: return false;
! 342:
! 343: assert(vec[0] >= 0);
! 344: assert(vec[1] >= 0);
! 345: str->replace(vec[0], vec[1] - vec[0], s);
! 346: return true;
! 347: }
! 348:
! 349: // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
! 350: // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
! 351: // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
! 352:
! 353: static int NewlineMode(int pcre_options) {
! 354: // TODO: if we can make it threadsafe, cache this var
! 355: int newline_mode = 0;
! 356: /* if (newline_mode) return newline_mode; */ // do this once it's cached
! 357: if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
! 358: PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
! 359: newline_mode = (pcre_options &
! 360: (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
! 361: PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
! 362: } else {
! 363: int newline;
! 364: pcre_config(PCRE_CONFIG_NEWLINE, &newline);
! 365: if (newline == 10)
! 366: newline_mode = PCRE_NEWLINE_LF;
! 367: else if (newline == 13)
! 368: newline_mode = PCRE_NEWLINE_CR;
! 369: else if (newline == 3338)
! 370: newline_mode = PCRE_NEWLINE_CRLF;
! 371: else if (newline == -1)
! 372: newline_mode = PCRE_NEWLINE_ANY;
! 373: else if (newline == -2)
! 374: newline_mode = PCRE_NEWLINE_ANYCRLF;
! 375: else
! 376: assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
! 377: }
! 378: return newline_mode;
! 379: }
! 380:
! 381: int RE::GlobalReplace(const StringPiece& rewrite,
! 382: string *str) const {
! 383: int count = 0;
! 384: int vec[kVecSize];
! 385: string out;
! 386: int start = 0;
! 387: int lastend = -1;
! 388: bool last_match_was_empty_string = false;
! 389:
! 390: while (start <= static_cast<int>(str->length())) {
! 391: // If the previous match was for the empty string, we shouldn't
! 392: // just match again: we'll match in the same way and get an
! 393: // infinite loop. Instead, we do the match in a special way:
! 394: // anchored -- to force another try at the same position --
! 395: // and with a flag saying that this time, ignore empty matches.
! 396: // If this special match returns, that means there's a non-empty
! 397: // match at this position as well, and we can continue. If not,
! 398: // we do what perl does, and just advance by one.
! 399: // Notice that perl prints '@@@' for this;
! 400: // perl -le '$_ = "aa"; s/b*|aa/@/g; print'
! 401: int matches;
! 402: if (last_match_was_empty_string) {
! 403: matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
! 404: if (matches <= 0) {
! 405: int matchend = start + 1; // advance one character.
! 406: // If the current char is CR and we're in CRLF mode, skip LF too.
! 407: // Note it's better to call pcre_fullinfo() than to examine
! 408: // all_options(), since options_ could have changed bewteen
! 409: // compile-time and now, but this is simpler and safe enough.
! 410: // Modified by PH to add ANY and ANYCRLF.
! 411: if (matchend < static_cast<int>(str->length()) &&
! 412: (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
! 413: (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
! 414: NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
! 415: NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
! 416: matchend++;
! 417: }
! 418: // We also need to advance more than one char if we're in utf8 mode.
! 419: #ifdef SUPPORT_UTF8
! 420: if (options_.utf8()) {
! 421: while (matchend < static_cast<int>(str->length()) &&
! 422: ((*str)[matchend] & 0xc0) == 0x80)
! 423: matchend++;
! 424: }
! 425: #endif
! 426: if (start < static_cast<int>(str->length()))
! 427: out.append(*str, start, matchend - start);
! 428: start = matchend;
! 429: last_match_was_empty_string = false;
! 430: continue;
! 431: }
! 432: } else {
! 433: matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
! 434: if (matches <= 0)
! 435: break;
! 436: }
! 437: int matchstart = vec[0], matchend = vec[1];
! 438: assert(matchstart >= start);
! 439: assert(matchend >= matchstart);
! 440: out.append(*str, start, matchstart - start);
! 441: Rewrite(&out, rewrite, *str, vec, matches);
! 442: start = matchend;
! 443: lastend = matchend;
! 444: count++;
! 445: last_match_was_empty_string = (matchstart == matchend);
! 446: }
! 447:
! 448: if (count == 0)
! 449: return 0;
! 450:
! 451: if (start < static_cast<int>(str->length()))
! 452: out.append(*str, start, str->length() - start);
! 453: swap(out, *str);
! 454: return count;
! 455: }
! 456:
! 457: bool RE::Extract(const StringPiece& rewrite,
! 458: const StringPiece& text,
! 459: string *out) const {
! 460: int vec[kVecSize];
! 461: int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
! 462: if (matches == 0)
! 463: return false;
! 464: out->erase();
! 465: return Rewrite(out, rewrite, text, vec, matches);
! 466: }
! 467:
! 468: /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
! 469: string result;
! 470:
! 471: // Escape any ascii character not in [A-Za-z_0-9].
! 472: //
! 473: // Note that it's legal to escape a character even if it has no
! 474: // special meaning in a regular expression -- so this function does
! 475: // that. (This also makes it identical to the perl function of the
! 476: // same name; see `perldoc -f quotemeta`.) The one exception is
! 477: // escaping NUL: rather than doing backslash + NUL, like perl does,
! 478: // we do '\0', because pcre itself doesn't take embedded NUL chars.
! 479: for (int ii = 0; ii < unquoted.size(); ++ii) {
! 480: // Note that using 'isalnum' here raises the benchmark time from
! 481: // 32ns to 58ns:
! 482: if (unquoted[ii] == '\0') {
! 483: result += "\\0";
! 484: } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
! 485: (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
! 486: (unquoted[ii] < '0' || unquoted[ii] > '9') &&
! 487: unquoted[ii] != '_' &&
! 488: // If this is the part of a UTF8 or Latin1 character, we need
! 489: // to copy this byte without escaping. Experimentally this is
! 490: // what works correctly with the regexp library.
! 491: !(unquoted[ii] & 128)) {
! 492: result += '\\';
! 493: result += unquoted[ii];
! 494: } else {
! 495: result += unquoted[ii];
! 496: }
! 497: }
! 498:
! 499: return result;
! 500: }
! 501:
! 502: /***** Actual matching and rewriting code *****/
! 503:
! 504: int RE::TryMatch(const StringPiece& text,
! 505: int startpos,
! 506: Anchor anchor,
! 507: bool empty_ok,
! 508: int *vec,
! 509: int vecsize) const {
! 510: pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
! 511: if (re == NULL) {
! 512: //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
! 513: return 0;
! 514: }
! 515:
! 516: pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
! 517: if (options_.match_limit() > 0) {
! 518: extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
! 519: extra.match_limit = options_.match_limit();
! 520: }
! 521: if (options_.match_limit_recursion() > 0) {
! 522: extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
! 523: extra.match_limit_recursion = options_.match_limit_recursion();
! 524: }
! 525:
! 526: int options = 0;
! 527: if (anchor != UNANCHORED)
! 528: options |= PCRE_ANCHORED;
! 529: if (!empty_ok)
! 530: options |= PCRE_NOTEMPTY;
! 531:
! 532: int rc = pcre_exec(re, // The regular expression object
! 533: &extra,
! 534: (text.data() == NULL) ? "" : text.data(),
! 535: text.size(),
! 536: startpos,
! 537: options,
! 538: vec,
! 539: vecsize);
! 540:
! 541: // Handle errors
! 542: if (rc == PCRE_ERROR_NOMATCH) {
! 543: return 0;
! 544: } else if (rc < 0) {
! 545: //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
! 546: // re, pattern_.c_str());
! 547: return 0;
! 548: } else if (rc == 0) {
! 549: // pcre_exec() returns 0 as a special case when the number of
! 550: // capturing subpatterns exceeds the size of the vector.
! 551: // When this happens, there is a match and the output vector
! 552: // is filled, but we miss out on the positions of the extra subpatterns.
! 553: rc = vecsize / 2;
! 554: }
! 555:
! 556: return rc;
! 557: }
! 558:
! 559: bool RE::DoMatchImpl(const StringPiece& text,
! 560: Anchor anchor,
! 561: int* consumed,
! 562: const Arg* const* args,
! 563: int n,
! 564: int* vec,
! 565: int vecsize) const {
! 566: assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
! 567: int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
! 568: assert(matches >= 0); // TryMatch never returns negatives
! 569: if (matches == 0)
! 570: return false;
! 571:
! 572: *consumed = vec[1];
! 573:
! 574: if (n == 0 || args == NULL) {
! 575: // We are not interested in results
! 576: return true;
! 577: }
! 578:
! 579: if (NumberOfCapturingGroups() < n) {
! 580: // RE has fewer capturing groups than number of arg pointers passed in
! 581: return false;
! 582: }
! 583:
! 584: // If we got here, we must have matched the whole pattern.
! 585: // We do not need (can not do) any more checks on the value of 'matches' here
! 586: // -- see the comment for TryMatch.
! 587: for (int i = 0; i < n; i++) {
! 588: const int start = vec[2*(i+1)];
! 589: const int limit = vec[2*(i+1)+1];
! 590: if (!args[i]->Parse(text.data() + start, limit-start)) {
! 591: // TODO: Should we indicate what the error was?
! 592: return false;
! 593: }
! 594: }
! 595:
! 596: return true;
! 597: }
! 598:
! 599: bool RE::DoMatch(const StringPiece& text,
! 600: Anchor anchor,
! 601: int* consumed,
! 602: const Arg* const args[],
! 603: int n) const {
! 604: assert(n >= 0);
! 605: size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
! 606: // (as for kVecSize)
! 607: int space[21]; // use stack allocation for small vecsize (common case)
! 608: int* vec = vecsize <= 21 ? space : new int[vecsize];
! 609: bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
! 610: if (vec != space) delete [] vec;
! 611: return retval;
! 612: }
! 613:
! 614: bool RE::Rewrite(string *out, const StringPiece &rewrite,
! 615: const StringPiece &text, int *vec, int veclen) const {
! 616: for (const char *s = rewrite.data(), *end = s + rewrite.size();
! 617: s < end; s++) {
! 618: int c = *s;
! 619: if (c == '\\') {
! 620: c = *++s;
! 621: if (isdigit(c)) {
! 622: int n = (c - '0');
! 623: if (n >= veclen) {
! 624: //fprintf(stderr, requested group %d in regexp %.*s\n",
! 625: // n, rewrite.size(), rewrite.data());
! 626: return false;
! 627: }
! 628: int start = vec[2 * n];
! 629: if (start >= 0)
! 630: out->append(text.data() + start, vec[2 * n + 1] - start);
! 631: } else if (c == '\\') {
! 632: *out += '\\';
! 633: } else {
! 634: //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
! 635: // rewrite.size(), rewrite.data());
! 636: return false;
! 637: }
! 638: } else {
! 639: *out += c;
! 640: }
! 641: }
! 642: return true;
! 643: }
! 644:
! 645: // Return the number of capturing subpatterns, or -1 if the
! 646: // regexp wasn't valid on construction.
! 647: int RE::NumberOfCapturingGroups() const {
! 648: if (re_partial_ == NULL) return -1;
! 649:
! 650: int result;
! 651: int pcre_retval = pcre_fullinfo(re_partial_, // The regular expression object
! 652: NULL, // We did not study the pattern
! 653: PCRE_INFO_CAPTURECOUNT,
! 654: &result);
! 655: assert(pcre_retval == 0);
! 656: return result;
! 657: }
! 658:
! 659: /***** Parsers for various types *****/
! 660:
! 661: bool Arg::parse_null(const char* str, int n, void* dest) {
! 662: // We fail if somebody asked us to store into a non-NULL void* pointer
! 663: return (dest == NULL);
! 664: }
! 665:
! 666: bool Arg::parse_string(const char* str, int n, void* dest) {
! 667: if (dest == NULL) return true;
! 668: reinterpret_cast<string*>(dest)->assign(str, n);
! 669: return true;
! 670: }
! 671:
! 672: bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
! 673: if (dest == NULL) return true;
! 674: reinterpret_cast<StringPiece*>(dest)->set(str, n);
! 675: return true;
! 676: }
! 677:
! 678: bool Arg::parse_char(const char* str, int n, void* dest) {
! 679: if (n != 1) return false;
! 680: if (dest == NULL) return true;
! 681: *(reinterpret_cast<char*>(dest)) = str[0];
! 682: return true;
! 683: }
! 684:
! 685: bool Arg::parse_uchar(const char* str, int n, void* dest) {
! 686: if (n != 1) return false;
! 687: if (dest == NULL) return true;
! 688: *(reinterpret_cast<unsigned char*>(dest)) = str[0];
! 689: return true;
! 690: }
! 691:
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Prepares the n-byte number text at "str" for a strtoxxx() call.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// NOTE: the byte right after the text, str[n], is read.
//
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. ""    if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // <ctype.h> functions need a value representable as unsigned char, so
  // every byte is converted before being classified.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text could be taken by
  // strtoxxx() as part of the number: a decimal or hex digit, or the 'x' of
  // a "0x" prefix (otherwise a captured "0" followed by "x5" would be read
  // as "0x5" when the radix is 0 or 16).
  const unsigned char next = static_cast<unsigned char>(str[n]);
  const bool next_is_number_like =
      isdigit(next) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F')) ||
      (next == 'x') || (next == 'X');
  if (!next_is_number_like) {
    // We can parse right out of the supplied string, so return it.
    return str;
  }

  if (n > kMaxNumberLength) return "";  // Input too big to be a valid number
  memcpy(buf, str, n);
  buf[n] = '\0';
  return buf;
}
! 723:
! 724: bool Arg::parse_long_radix(const char* str,
! 725: int n,
! 726: void* dest,
! 727: int radix) {
! 728: if (n == 0) return false;
! 729: char buf[kMaxNumberLength+1];
! 730: str = TerminateNumber(buf, str, n);
! 731: char* end;
! 732: errno = 0;
! 733: long r = strtol(str, &end, radix);
! 734: if (end != str + n) return false; // Leftover junk
! 735: if (errno) return false;
! 736: if (dest == NULL) return true;
! 737: *(reinterpret_cast<long*>(dest)) = r;
! 738: return true;
! 739: }
! 740:
! 741: bool Arg::parse_ulong_radix(const char* str,
! 742: int n,
! 743: void* dest,
! 744: int radix) {
! 745: if (n == 0) return false;
! 746: char buf[kMaxNumberLength+1];
! 747: str = TerminateNumber(buf, str, n);
! 748: if (str[0] == '-') return false; // strtoul() on a negative number?!
! 749: char* end;
! 750: errno = 0;
! 751: unsigned long r = strtoul(str, &end, radix);
! 752: if (end != str + n) return false; // Leftover junk
! 753: if (errno) return false;
! 754: if (dest == NULL) return true;
! 755: *(reinterpret_cast<unsigned long*>(dest)) = r;
! 756: return true;
! 757: }
! 758:
! 759: bool Arg::parse_short_radix(const char* str,
! 760: int n,
! 761: void* dest,
! 762: int radix) {
! 763: long r;
! 764: if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
! 765: if (r < SHRT_MIN || r > SHRT_MAX) return false; // Out of range
! 766: if (dest == NULL) return true;
! 767: *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
! 768: return true;
! 769: }
! 770:
! 771: bool Arg::parse_ushort_radix(const char* str,
! 772: int n,
! 773: void* dest,
! 774: int radix) {
! 775: unsigned long r;
! 776: if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
! 777: if (r > USHRT_MAX) return false; // Out of range
! 778: if (dest == NULL) return true;
! 779: *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
! 780: return true;
! 781: }
! 782:
! 783: bool Arg::parse_int_radix(const char* str,
! 784: int n,
! 785: void* dest,
! 786: int radix) {
! 787: long r;
! 788: if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
! 789: if (r < INT_MIN || r > INT_MAX) return false; // Out of range
! 790: if (dest == NULL) return true;
! 791: *(reinterpret_cast<int*>(dest)) = r;
! 792: return true;
! 793: }
! 794:
! 795: bool Arg::parse_uint_radix(const char* str,
! 796: int n,
! 797: void* dest,
! 798: int radix) {
! 799: unsigned long r;
! 800: if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
! 801: if (r > UINT_MAX) return false; // Out of range
! 802: if (dest == NULL) return true;
! 803: *(reinterpret_cast<unsigned int*>(dest)) = r;
! 804: return true;
! 805: }
! 806:
! 807: bool Arg::parse_longlong_radix(const char* str,
! 808: int n,
! 809: void* dest,
! 810: int radix) {
! 811: #ifndef HAVE_LONG_LONG
! 812: return false;
! 813: #else
! 814: if (n == 0) return false;
! 815: char buf[kMaxNumberLength+1];
! 816: str = TerminateNumber(buf, str, n);
! 817: char* end;
! 818: errno = 0;
! 819: #if defined HAVE_STRTOQ
! 820: long long r = strtoq(str, &end, radix);
! 821: #elif defined HAVE_STRTOLL
! 822: long long r = strtoll(str, &end, radix);
! 823: #elif defined HAVE__STRTOI64
! 824: long long r = _strtoi64(str, &end, radix);
! 825: #elif defined HAVE_STRTOIMAX
! 826: long long r = strtoimax(str, &end, radix);
! 827: #else
! 828: #error parse_longlong_radix: cannot convert input to a long-long
! 829: #endif
! 830: if (end != str + n) return false; // Leftover junk
! 831: if (errno) return false;
! 832: if (dest == NULL) return true;
! 833: *(reinterpret_cast<long long*>(dest)) = r;
! 834: return true;
! 835: #endif /* HAVE_LONG_LONG */
! 836: }
! 837:
! 838: bool Arg::parse_ulonglong_radix(const char* str,
! 839: int n,
! 840: void* dest,
! 841: int radix) {
! 842: #ifndef HAVE_UNSIGNED_LONG_LONG
! 843: return false;
! 844: #else
! 845: if (n == 0) return false;
! 846: char buf[kMaxNumberLength+1];
! 847: str = TerminateNumber(buf, str, n);
! 848: if (str[0] == '-') return false; // strtoull() on a negative number?!
! 849: char* end;
! 850: errno = 0;
! 851: #if defined HAVE_STRTOQ
! 852: unsigned long long r = strtouq(str, &end, radix);
! 853: #elif defined HAVE_STRTOLL
! 854: unsigned long long r = strtoull(str, &end, radix);
! 855: #elif defined HAVE__STRTOI64
! 856: unsigned long long r = _strtoui64(str, &end, radix);
! 857: #elif defined HAVE_STRTOIMAX
! 858: unsigned long long r = strtoumax(str, &end, radix);
! 859: #else
! 860: #error parse_ulonglong_radix: cannot convert input to a long-long
! 861: #endif
! 862: if (end != str + n) return false; // Leftover junk
! 863: if (errno) return false;
! 864: if (dest == NULL) return true;
! 865: *(reinterpret_cast<unsigned long long*>(dest)) = r;
! 866: return true;
! 867: #endif /* HAVE_UNSIGNED_LONG_LONG */
! 868: }
! 869:
! 870: bool Arg::parse_double(const char* str, int n, void* dest) {
! 871: if (n == 0) return false;
! 872: static const int kMaxLength = 200;
! 873: char buf[kMaxLength];
! 874: if (n >= kMaxLength) return false;
! 875: memcpy(buf, str, n);
! 876: buf[n] = '\0';
! 877: errno = 0;
! 878: char* end;
! 879: double r = strtod(buf, &end);
! 880: if (end != buf + n) return false; // Leftover junk
! 881: if (errno) return false;
! 882: if (dest == NULL) return true;
! 883: *(reinterpret_cast<double*>(dest)) = r;
! 884: return true;
! 885: }
! 886:
! 887: bool Arg::parse_float(const char* str, int n, void* dest) {
! 888: double r;
! 889: if (!parse_double(str, n, &r)) return false;
! 890: if (dest == NULL) return true;
! 891: *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
! 892: return true;
! 893: }
! 894:
! 895:
! 896: #define DEFINE_INTEGER_PARSERS(name) \
! 897: bool Arg::parse_##name(const char* str, int n, void* dest) { \
! 898: return parse_##name##_radix(str, n, dest, 10); \
! 899: } \
! 900: bool Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
! 901: return parse_##name##_radix(str, n, dest, 16); \
! 902: } \
! 903: bool Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
! 904: return parse_##name##_radix(str, n, dest, 8); \
! 905: } \
! 906: bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
! 907: return parse_##name##_radix(str, n, dest, 0); \
! 908: }
! 909:
! 910: DEFINE_INTEGER_PARSERS(short) /* */
! 911: DEFINE_INTEGER_PARSERS(ushort) /* */
! 912: DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
! 913: DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
! 914: DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
! 915: DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
! 916: DEFINE_INTEGER_PARSERS(longlong) /* */
! 917: DEFINE_INTEGER_PARSERS(ulonglong) /* */
! 918:
! 919: #undef DEFINE_INTEGER_PARSERS
! 920:
! 921: } // namespace pcrecpp
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>