Annotation of embedaddon/php/ext/mbstring/oniguruma/doc/RE, revision 1.1

1.1     ! misho       1: Oniguruma Regular Expressions Version 4.3.0    2006/08/17
        !             2: 
        !             3: syntax: ONIG_SYNTAX_RUBY (default)
        !             4: 
        !             5: 
        !             6: 1. Syntax elements
        !             7: 
        !             8:   \       escape (enable or disable meta character meaning)
        !             9:   |       alternation
        !            10:   (...)   group
        !            11:   [...]   character class  
        !            12: 
        !            13: 
        !            14: 2. Characters
        !            15: 
        !            16:   \t           horizontal tab (0x09)
        !            17:   \v           vertical tab   (0x0B)
        !            18:   \n           newline        (0x0A)
        !            19:   \r           return         (0x0D)
        !            20:   \b           back space     (0x08)
        !            21:   \f           form feed      (0x0C)
        !            22:   \a           bell           (0x07)
        !            23:   \e           escape         (0x1B)
        !            24:   \nnn         octal char            (encoded byte value)
        !            25:   \xHH         hexadecimal char      (encoded byte value)
        !            26:   \x{7HHHHHHH} wide hexadecimal char (character code point value)
        !            27:   \cx          control char          (character code point value)
        !            28:   \C-x         control char          (character code point value)
        !            29:   \M-x         meta  (x|0x80)        (character code point value)
        !            30:   \M-\C-x      meta control char     (character code point value)
        !            31: 
        !            32:  (* \b is effective in character class [...] only)
        !            33: 
        !            34: 
        !            35: 3. Character types
        !            36: 
        !            37:   .        any character (except newline)
        !            38: 
        !            39:   \w       word character
        !            40: 
        !            41:            Not Unicode:
        !            42:              alphanumeric, "_" and multibyte char. 
        !            43: 
        !            44:            Unicode:
        !            45:              General_Category -- (Letter|Mark|Number|Connector_Punctuation)
        !            46: 
        !            47:   \W       non word char
        !            48: 
        !            49:   \s       whitespace char
        !            50: 
        !            51:            Not Unicode:
        !            52:              \t, \n, \v, \f, \r, \x20
        !            53: 
        !            54:            Unicode:
        !            55:              0009, 000A, 000B, 000C, 000D, 0085(NEL), 
        !            56:              General_Category -- Line_Separator
        !            57:                               -- Paragraph_Separator
        !            58:                               -- Space_Separator
        !            59: 
        !            60:   \S       non whitespace char
        !            61: 
        !            62:   \d       decimal digit char
        !            63: 
        !            64:            Unicode: General_Category -- Decimal_Number
        !            65: 
        !            66:   \D       non decimal digit char
        !            67: 
        !            68:   \h       hexadecimal digit char   [0-9a-fA-F]
        !            69: 
        !            70:   \H       non hexadecimal digit char
        !            71: 
        !            72: 
        !            73: 4. Quantifier
        !            74: 
        !            75:   greedy
        !            76: 
        !            77:     ?       1 or 0 times
        !            78:     *       0 or more times
        !            79:     +       1 or more times
        !            80:     {n,m}   at least n but not more than m times
        !            81:     {n,}    at least n times
        !            82:     {,n}    at least 0 but not more than n times ({0,n})
        !            83:     {n}     n times
        !            84: 
        !            85:   reluctant
        !            86: 
        !            87:     ??      1 or 0 times
        !            88:     *?      0 or more times
        !            89:     +?      1 or more times
        !            90:     {n,m}?  at least n but not more than m times  
        !            91:     {n,}?   at least n times
        !            92:     {,n}?   at least 0 but not more than n times (== {0,n}?)
        !            93: 
        !            94:   possessive (greedy and does not backtrack once matched)
        !            95: 
        !            96:     ?+      1 or 0 times
        !            97:     *+      0 or more times
        !            98:     ++      1 or more times
        !            99: 
        !           100:     ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
        !           101: 
        !           102:     ex. /a*+/ === /(?>a*)/
        !           103: 
        !           104: 
        !           105: 5. Anchors
        !           106: 
        !           107:   ^       beginning of the line
        !           108:   $       end of the line
        !           109:   \b      word boundary
        !           110:   \B      not word boundary
        !           111:   \A      beginning of string
        !           112:   \Z      end of string, or before newline at the end
        !           113:   \z      end of string
        !           114:   \G      matching start position (*)
        !           115: 
        !           116:           * Ruby Regexp:
        !           117:                  previous end-of-match position
        !           118:                 (This specification is not related to this library.)
        !           119: 
        !           120: 
        !           121: 6. Character class
        !           122: 
        !           123:   ^...    negative class (lowest precedence operator)
        !           124:   x-y     range from x to y
        !           125:   [...]   set (character class in character class)
        !           126:   ..&&..  intersection (low precedence, only higher than ^)
        !           127:           
        !           128:     ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w]
        !           129: 
        !           130:   * If you want to use '[', '-', ']' as a normal character
        !           131:     in a character class, you should escape these characters by '\'.
        !           132: 
        !           133: 
        !           134:   POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
        !           135: 
        !           136:     Not Unicode Case:
        !           137: 
        !           138:     alnum    alphabet or digit char
        !           139:     alpha    alphabet
        !           140:     ascii    code value: [0 - 127]
        !           141:     blank    \t, \x20
        !           142:     cntrl
        !           143:     digit    0-9
        !           144:     graph    include all of multibyte encoded characters
        !           145:     lower
        !           146:     print    include all of multibyte encoded characters
        !           147:     punct
        !           148:     space    \t, \n, \v, \f, \r, \x20
        !           149:     upper
        !           150:     xdigit   0-9, a-f, A-F
        !           151: 
        !           152: 
        !           153:     Unicode Case:
        !           154: 
        !           155:     alnum    Letter | Mark | Decimal_Number
        !           156:     alpha    Letter | Mark
        !           157:     ascii    0000 - 007F
        !           158:     blank    Space_Separator | 0009
        !           159:     cntrl    Control | Format | Unassigned | Private_Use | Surrogate
        !           160:     digit    Decimal_Number
        !           161:     graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
        !           162:     lower    Lowercase_Letter
        !           163:     print    [[:graph:]] | [[:space:]]
        !           164:     punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
        !           165:              Final_Punctuation | Initial_Punctuation | Other_Punctuation |
        !           166:              Open_Punctuation
        !           167:     space    Space_Separator | Line_Separator | Paragraph_Separator |
        !           168:              0009 | 000A | 000B | 000C | 000D | 0085
        !           169:     upper    Uppercase_Letter
        !           170:     xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
        !           171:              (0-9, a-f, A-F)
        !           172: 
        !           173: 
        !           174: 7. Extended groups
        !           175: 
        !           176:   (?#...)            comment
        !           177: 
        !           178:   (?imx-imx)         option on/off
        !           179:                          i: ignore case
        !           180:                          m: multi-line (dot(.) match newline)
        !           181:                          x: extended form
        !           182:   (?imx-imx:subexp)  option on/off for subexp
        !           183: 
        !           184:   (?:subexp)         not captured group
        !           185:   (subexp)           captured group
        !           186: 
        !           187:   (?=subexp)         look-ahead
        !           188:   (?!subexp)         negative look-ahead
        !           189:   (?<=subexp)        look-behind
        !           190:   (?<!subexp)        negative look-behind
        !           191: 
        !           192:                      Subexp of look-behind must be fixed character length.
        !           193:                      But different character length is allowed in top level
        !           194:                      alternatives only.
        !           195:                      ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
        !           196: 
        !           197:                      In negative-look-behind, captured group isn't allowed, 
        !           198:                      but shy group(?:) is allowed.
        !           199: 
        !           200:   (?>subexp)         atomic group
        !           201:                      don't backtrack in subexp.
        !           202: 
        !           203:   (?<name>subexp)    define named group
        !           204:                      (All characters of the name must be a word character.
        !           205:                      And first character must not be a digit or upper case)
        !           206: 
        !           207:                      Not only a name but a number is assigned like a captured
        !           208:                      group.
        !           209: 
        !           210:                      Assigning the same name to two or more subexps is allowed.
        !           211:                      In this case, a subexp call can not be performed although
        !           212:                      the back reference is possible.
        !           213: 
        !           214: 
        !           215: 8. Back reference
        !           216: 
        !           217:   \n          back reference by group number (n >= 1)
        !           218:   \k<name>    back reference by group name
        !           219: 
        !           220:   In a back reference by a name that is defined more than once,
        !           221:   the subexp with the largest group number is referred to preferentially.
        !           222:   (When it has not matched, a group with a smaller number is referred to.)
        !           223: 
        !           224:   * Back reference by group number is forbidden if named group is defined 
        !           225:     in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set.
        !           226: 
        !           227: 
        !           228:   back reference with nest level
        !           229: 
        !           230:     (This function is disabled in Ruby 1.9.)
        !           231: 
        !           232:     \k<name+n>     n: 0, 1, 2, ...
        !           233:     \k<name-n>     n: 0, 1, 2, ...
        !           234: 
        !           235:     Designates the nest level relative to the back reference position.
        !           236: 
        !           237:     ex 1.
        !           238: 
        !           239:       /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
        !           240: 
        !           241:     ex 2.
        !           242: 
        !           243:       r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED)
        !           244:       (?<element> \g<stag> \g<content>* \g<etag> ){0}
        !           245:       (?<stag> < \g<name> \s* > ){0}
        !           246:       (?<name> [a-zA-Z_:]+ ){0}
        !           247:       (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
        !           248:       (?<etag> </ \k<name+1> >){0}
        !           249:       \g<element>
        !           250:       __REGEXP__
        !           251: 
        !           252:       p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
        !           253: 
        !           254: 
        !           255: 
        !           256: 9. Subexp call ("Tanaka Akira special")
        !           257: 
        !           258:   \g<name>    call by group name
        !           259:   \g<n>       call by group number (n >= 1)
        !           260: 
        !           261:   * left-most recursive call is not allowed.
        !           262:      ex. (?<name>a|\g<name>b)   => error
        !           263:          (?<name>a|b\g<name>c)  => OK
        !           264: 
        !           265:   * Call by group number is forbidden if named group is defined in the pattern
        !           266:     and ONIG_OPTION_CAPTURE_GROUP is not set.
        !           267: 
        !           268:   * If the option status of called group is different from calling position
        !           269:     then the group's option is effective.
        !           270: 
        !           271:     ex. (?-i:\g<name>)(?i:(?<name>a)){0}  matches "A"
        !           272: 
        !           273: 
        !           274: 10. Captured group
        !           275: 
        !           276:   Behavior of the no-named group (...) changes with the following conditions.
        !           277:   (But named group is not changed.)
        !           278: 
        !           279:   case 1. /.../     (named group is not used, no option)
        !           280: 
        !           281:      (...) is treated as a captured group.
        !           282: 
        !           283:   case 2. /.../g    (named group is not used, 'g' option)
        !           284: 
        !           285:      (...) is treated as a no-captured group (?:...).
        !           286: 
        !           287:   case 3. /..(?<name>..)../   (named group is used, no option)
        !           288: 
        !           289:      (...) is treated as a no-captured group (?:...).
        !           290:      numbered-backref/call is not allowed.
        !           291: 
        !           292:   case 4. /..(?<name>..)../G  (named group is used, 'G' option)
        !           293: 
        !           294:      (...) is treated as a captured group.
        !           295:      numbered-backref/call is allowed.
        !           296: 
        !           297:   where
        !           298:     g: ONIG_OPTION_DONT_CAPTURE_GROUP
        !           299:     G: ONIG_OPTION_CAPTURE_GROUP
        !           300: 
        !           301:   ('g' and 'G' options are discussed in the ruby-dev ML)
        !           302: 
        !           303:   These options are not implemented at the Ruby level.
        !           304: 
        !           305: 
        !           306: -----------------------------
        !           307: A-1. Syntax-dependent options
        !           308: 
        !           309:    + ONIG_SYNTAX_RUBY
        !           310:      (?m): dot(.) match newline
        !           311: 
        !           312:    + ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
        !           313:      (?s): dot(.) match newline
        !           314:      (?m): ^ match after newline, $ match before newline
        !           315: 
        !           316: 
        !           317: A-2. Original extensions
        !           318: 
        !           319:    + hexadecimal digit char type  \h, \H
        !           320:    + named group                  (?<name>...)
        !           321:    + named backref                \k<name>
        !           322:    + subexp call                  \g<name>, \g<group-num>
        !           323: 
        !           324: 
        !           325: A-3. Features missing compared with Perl 5.8.0
        !           326: 
        !           327:    + [:word:]
        !           328:    + \N{name}
        !           329:    + \l,\u,\L,\U, \X, \C
        !           330:    + (?{code})
        !           331:    + (??{code})
        !           332:    + (?(condition)yes-pat|no-pat)
        !           333: 
        !           334:    * \Q...\E
        !           335:      This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
        !           336: 
        !           337:    * \p{property}, \P{property}
        !           338:      This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
        !           339:      Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
        !           340:      Print, Punct, Space, Upper, XDigit, ASCII are supported.
        !           341: 
        !           342:      Prefix 'Is' of property name is allowed in ONIG_SYNTAX_PERL only.
        !           343:      ex. \p{IsXDigit}.
        !           344: 
        !           345:      Negation operator of property is supported in ONIG_SYNTAX_PERL only.
        !           346:      \p{^...}, \P{^...}
        !           347: 
        !           348: 
        !           349: A-4. Differences from Japanized GNU regex(version 0.12) of Ruby
        !           350: 
        !           351:    + add hexadecimal digit char type (\h, \H)
        !           352:    + add look-behind
        !           353:      (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
        !           354:    + add possessive quantifier. ?+, *+, ++
        !           355:    + add operations in character class. [], &&
        !           356:      ('[' must be escaped to be used as a usual char in a character class.)
        !           357:    + add named group and subexp call.
        !           358:    + octal or hexadecimal number sequence can be treated as 
        !           359:      a multibyte code char in character class if multibyte encoding
        !           360:      is specified.
        !           361:      (ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1])
        !           362:    + allow the range of single byte char and multibyte char in character
        !           363:      class.
        !           364:      ex. /[a-<<any EUC-JP character>>]/ in EUC-JP encoding.
        !           365:    + effect range of isolated option is to next ')'.
        !           366:      ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
        !           367:    + isolated option is not transparent to previous pattern.
        !           368:      ex. a(?i)* is a syntax error pattern.
        !           369:    + allow an incomplete left brace as a usual string.
        !           370:      ex. /{/, /({)/, /a{2,3/ etc...
        !           371:    + negative POSIX bracket [:^xxxx:] is supported.
        !           372:    + POSIX bracket [:ascii:] is added.
        !           373:    + repeat of look-ahead is not allowed.
        !           374:      ex. /(?=a)*/, /(?!b){5}/
        !           375:    + Ignore case option is effective to numbered character.
        !           376:      ex. /\x61/i =~ "A"
        !           377:    + In the range quantifier, the minimum count may be omitted.
        !           378:      /a{,n}/ == /a{0,n}/
        !           379:      Omitting both the minimum and the maximum count at the same time
        !           380:      is not allowed. (/a{,}/)
        !           381:    + /a{n}?/ is not a non-greedy operator.
        !           382:      /a{n}?/ == /(?:a{n})?/
        !           383:    + invalid back reference is checked and causes an error.
        !           384:      /\1/, /(a)\2/
        !           385:    + Zero-length match in infinite repeat stops the repeat,
        !           386:      then changes of the capture group status are checked as stop condition.
        !           387:      /(?:()|())*\1\2/ =~ ""
        !           388:      /(?:\1a|())*/ =~ "a"
        !           389: 
        !           390: 
        !           391: A-5. Disabled functions by default syntax
        !           392: 
        !           393:    + capture history
        !           394: 
        !           395:      (?@...) and (?@<name>...)
        !           396: 
        !           397:      ex. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>]
        !           398: 
        !           399:      see sample/listcap.c file.
        !           400: 
        !           401: 
        !           402: A-6. Problems
        !           403: 
        !           404:    + Invalid encoding byte sequence is not checked in UTF-8.
        !           405: 
        !           406:      * Invalid first byte is treated as a character.
        !           407:        /./u =~ "\xa3"
        !           408: 
        !           409:      * Incomplete byte sequence is not checked.
        !           410:        /\w+/ =~ "a\xf3\x8ec"
        !           411: 
        !           412: // END

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>