File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / pcre_scanner.h
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 08:25:55 2013 UTC (10 years, 10 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_34, v8_33, v8_31, v8_30, v8_21, HEAD
8.33

    1: // Copyright (c) 2005, Google Inc.
    2: // All rights reserved.
    3: //
    4: // Redistribution and use in source and binary forms, with or without
    5: // modification, are permitted provided that the following conditions are
    6: // met:
    7: //
    8: //     * Redistributions of source code must retain the above copyright
    9: // notice, this list of conditions and the following disclaimer.
   10: //     * Redistributions in binary form must reproduce the above
   11: // copyright notice, this list of conditions and the following disclaimer
   12: // in the documentation and/or other materials provided with the
   13: // distribution.
   14: //     * Neither the name of Google Inc. nor the names of its
   15: // contributors may be used to endorse or promote products derived from
   16: // this software without specific prior written permission.
   17: //
   18: // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   19: // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   20: // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   21: // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   22: // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   23: // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   24: // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   25: // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   26: // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   27: // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   28: // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   29: //
   30: // Author: Sanjay Ghemawat
   31: //
   32: // Regular-expression based scanner for parsing an input stream.
   33: //
   34: // Example: parse a sequence of "var = number" entries from input:
   35: //
   36: //      Scanner scanner(input);
   37: //      string var;
   38: //      int number;
   39: //      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
   40: //      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
   41: //        ...;
   42: //      }
   43: 
   44: #ifndef _PCRE_SCANNER_H
   45: #define _PCRE_SCANNER_H
   46: 
   47: #include <assert.h>
   48: #include <string>
   49: #include <vector>
   50: 
   51: #include <pcrecpp.h>
   52: #include <pcre_stringpiece.h>
   53: 
   54: namespace pcrecpp {
   55: 
   56: class PCRECPP_EXP_DEFN Scanner {  // Regular-expression based scanner over an input string
   57:  public:
   58:   Scanner();  // No input supplied -- presumably scans empty input; TODO confirm
   59:   explicit Scanner(const std::string& input);  // Scan the text given in "input"
   60:   ~Scanner();  // NOTE(review): copy operations are implicit while skip_/comments_ are raw pointers -- confirm copying is safe
   61: 
   62:   // Return current line number.  The returned line-number is
   63:   // one-based.  I.e. it returns 1 + the number of consumed newlines.
   64:   //
   65:   // Note: this method may be slow.  It may take time proportional to
   66:   // the size of the input.
   67:   int LineNumber() const;
   68: 
   69:   // Return the byte offset within the input data at which the
   70:   // scanner is currently positioned.
   71:   int Offset() const;
   72: 
   73:   // Return true iff the start of the remaining input matches "re".
   74:   bool LookingAt(const RE& re) const;
   75: 
   76:   // Return true iff all of the following are true:
   77:   //    a. the start of the remaining input matches "re",
   78:   //    b. if any arguments are supplied, matched sub-patterns can be
   79:   //       parsed and stored into the arguments.
   80:   // If it returns true, it skips over the matched input and any
   81:   // following input that matches the "skip" regular expression.
   82:   bool Consume(const RE& re,
   83:                const Arg& arg0 = RE::no_arg,
   84:                const Arg& arg1 = RE::no_arg,
   85:                const Arg& arg2 = RE::no_arg
   86:                // TODO: Allow more arguments?
   87:                );
   88: 
   89:   // Set the "skip" regular expression.  If, after consuming some data,
   90:   // a prefix of the input matches this RE, it is automatically
   91:   // skipped.  For example, a programming language scanner would use
   92:   // a skip RE that matches white space and comments.
   93:   //
   94:   //    scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
   95:   //
   96:   // Skipping repeats as long as it succeeds.  We used to let people do
   97:   // this by writing "(...)*" in the regular expression, but that added
   98:   // up to lots of recursive calls within the pcre library, so now we
   99:   // control repetition explicitly via the function call API.
  100:   //
  101:   // You can pass NULL for "re" if you do not want any data to be skipped.
  102:   void Skip(const char* re);   // DEPRECATED; does *not* repeat
  103:   void SetSkipExpression(const char* re);  // Preferred; skipping repeats while it succeeds
  104: 
  105:   // Temporarily pause skipping.  The sequence
  106:   //   Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
  107:   // is similar to
  108:   //   Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
  109:   // but avoids creating/deleting new RE objects.
  110:   void DisableSkip();
  111: 
  112:   // Reenable previously paused skipping.  Any prefix of the input
  113:   // that matches the skip pattern is immediately dropped.
  114:   void EnableSkip();
  115: 
  116:   /***** Special wrappers around SetSkipExpression() for common idioms *****/
  117: 
  118:   // Arranges to skip whitespace, C comments, C++ comments.
  119:   // The overall RE is a disjunction of the following REs:
  120:   //    \s                      whitespace
  121:   //    //.*\n                  C++ comment (requires the terminating newline)
  122:   //    /[*](?:\n|.)*?[*]/      C comment (x*? means minimal repetitions of x)
  123:   // We get repetition via the semantics of SetSkipExpression, not by using *
  124:   void SkipCXXComments() {
  125:     SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
  126:   }
  127:   // Set the save_comments_ flag, which ConsumeSkip() honours when skipping.
  128:   void set_save_comments(bool comments) {
  129:     save_comments_ = comments;
  130:   }
  131:   // Return the save_comments_ flag.  NOTE(review): reads state only; could be const.
  132:   bool save_comments() {
  133:     return save_comments_;
  134:   }
  135: 
  136:   // Append to the vector "ranges" the comments found in the
  137:   // byte range [start,end] (inclusive) of the input data.
  138:   // Only comments that were extracted entirely within that
  139:   // range are returned: no range splitting of atomically-extracted
  140:   // comments is performed.
  141:   void GetComments(int start, int end, std::vector<StringPiece> *ranges);
  142: 
  143:   // Append to the vector "ranges" the comments added
  144:   // since the last time this was called. This
  145:   // functionality is provided for efficiency when
  146:   // interleaving scanning with parsing.
  147:   void GetNextComments(std::vector<StringPiece> *ranges);
  148: 
  149:  private:
  150:   std::string   data_;          // All the input data
  151:   StringPiece   input_;         // Unprocessed input
  152:   RE*           skip_;          // If non-NULL, RE for skipping input
  153:   bool          should_skip_;   // If true, use skip_
  154:   bool          skip_repeat_;   // If true, repeat skip_ as long as it works
  155:   bool          save_comments_; // If true, aggregate the skipped text (see comments_)
  156: 
  157:   // The skipped comments collected so far.
  158:   // TODO: later consider requiring that the StringPieces be added
  159:   // in order by their start position
  160:   std::vector<StringPiece> *comments_;
  161: 
  162:   // The offset into comments_ up to which GetNextComments has returned entries
  163:   int           comments_offset_;
  164: 
  165:   // Helper function to consume input matching *skip_ while honouring
  166:   // save_comments_
  167:   void ConsumeSkip();
  168: };
  169: 
  170: }   // namespace pcrecpp
  171: 
  172: #endif /* _PCRE_SCANNER_H */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>