Annotation of embedaddon/php/ext/mbstring/oniguruma/doc/RE, revision 1.1.1.1

1.1       misho       1: Oniguruma Regular Expressions Version 4.3.0    2006/08/17
                      2: 
                      3: syntax: ONIG_SYNTAX_RUBY (default)
                      4: 
                      5: 
                      6: 1. Syntax elements
                      7: 
                      8:   \       escape (enable or disable meta character meaning)
                      9:   |       alternation
                     10:   (...)   group
                     11:   [...]   character class  
                     12: 
                     13: 
                     14: 2. Characters
                     15: 
                     16:   \t           horizontal tab (0x09)
                     17:   \v           vertical tab   (0x0B)
                     18:   \n           newline        (0x0A)
                     19:   \r           return         (0x0D)
                     20:   \b           back space     (0x08)
                     21:   \f           form feed      (0x0C)
                     22:   \a           bell           (0x07)
                     23:   \e           escape         (0x1B)
                     24:   \nnn         octal char            (encoded byte value)
                     25:   \xHH         hexadecimal char      (encoded byte value)
                     26:   \x{7HHHHHHH} wide hexadecimal char (character code point value)
                     27:   \cx          control char          (character code point value)
                     28:   \C-x         control char          (character code point value)
                     29:   \M-x         meta  (x|0x80)        (character code point value)
                     30:   \M-\C-x      meta control char     (character code point value)
                     31: 
                     32:  (* \b is effective in character class [...] only)
                     33: 
                     34: 
                     35: 3. Character types
                     36: 
                     37:   .        any character (except newline)
                     38: 
                     39:   \w       word character
                     40: 
                     41:            Not Unicode:
                     42:              alphanumeric, "_" and multibyte char. 
                     43: 
                     44:            Unicode:
                     45:              General_Category -- (Letter|Mark|Number|Connector_Punctuation)
                     46: 
                     47:   \W       non word char
                     48: 
                     49:   \s       whitespace char
                     50: 
                     51:            Not Unicode:
                     52:              \t, \n, \v, \f, \r, \x20
                     53: 
                     54:            Unicode:
                     55:              0009, 000A, 000B, 000C, 000D, 0085(NEL), 
                     56:              General_Category -- Line_Separator
                     57:                               -- Paragraph_Separator
                     58:                               -- Space_Separator
                     59: 
                     60:   \S       non whitespace char
                     61: 
                     62:   \d       decimal digit char
                     63: 
                     64:            Unicode: General_Category -- Decimal_Number
                     65: 
                     66:   \D       non decimal digit char
                     67: 
                     68:   \h       hexadecimal digit char   [0-9a-fA-F]
                     69: 
                     70:   \H       non hexadecimal digit char
                     71: 
                     72: 
                     73: 4. Quantifier
                     74: 
                     75:   greedy
                     76: 
                     77:     ?       1 or 0 times
                     78:     *       0 or more times
                     79:     +       1 or more times
                     80:     {n,m}   at least n but not more than m times
                     81:     {n,}    at least n times
                     82:     {,n}    at least 0 but not more than n times ({0,n})
                     83:     {n}     n times
                     84: 
                     85:   reluctant
                     86: 
                     87:     ??      1 or 0 times
                     88:     *?      0 or more times
                     89:     +?      1 or more times
                     90:     {n,m}?  at least n but not more than m times  
                     91:     {n,}?   at least n times
                     92:     {,n}?   at least 0 but not more than n times (== {0,n}?)
                     93: 
                     94:   possessive (greedy and does not backtrack once repeated)
                     95: 
                     96:     ?+      1 or 0 times
                     97:     *+      0 or more times
                     98:     ++      1 or more times
                     99: 
                    100:     ({n,m}+, {n,}+, {n}+ are possessive op. in ONIG_SYNTAX_JAVA only)
                    101: 
                    102:     ex. /a*+/ === /(?>a*)/
                    103: 
                    104: 
                    105: 5. Anchors
                    106: 
                    107:   ^       beginning of the line
                    108:   $       end of the line
                    109:   \b      word boundary
                    110:   \B      not word boundary
                    111:   \A      beginning of string
                    112:   \Z      end of string, or before newline at the end
                    113:   \z      end of string
                    114:   \G      matching start position (*)
                    115: 
                    116:           * Ruby Regexp:
                    117:                  previous end-of-match position
                    118:                 (This specification is not related to this library.)
                    119: 
                    120: 
                    121: 6. Character class
                    122: 
                    123:   ^...    negative class (lowest precedence operator)
                    124:   x-y     range from x to y
                    125:   [...]   set (character class in character class)
                    126:   ..&&..  intersection (low precedence, next after ^)
                    127:           
                    128:     ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w]
                    129: 
                    130:   * If you want to use '[', '-', ']' as a normal character
                    131:     in a character class, you should escape these characters by '\'.
                    132: 
                    133: 
                    134:   POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
                    135: 
                    136:     Not Unicode Case:
                    137: 
                    138:     alnum    alphabet or digit char
                    139:     alpha    alphabet
                    140:     ascii    code value: [0 - 127]
                    141:     blank    \t, \x20
                    142:     cntrl
                    143:     digit    0-9
                    144:     graph    include all of multibyte encoded characters
                    145:     lower
                    146:     print    include all of multibyte encoded characters
                    147:     punct
                    148:     space    \t, \n, \v, \f, \r, \x20
                    149:     upper
                    150:     xdigit   0-9, a-f, A-F
                    151: 
                    152: 
                    153:     Unicode Case:
                    154: 
                    155:     alnum    Letter | Mark | Decimal_Number
                    156:     alpha    Letter | Mark
                    157:     ascii    0000 - 007F
                    158:     blank    Space_Separator | 0009
                    159:     cntrl    Control | Format | Unassigned | Private_Use | Surrogate
                    160:     digit    Decimal_Number
                    161:     graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
                    162:     lower    Lowercase_Letter
                    163:     print    [[:graph:]] | [[:space:]]
                    164:     punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
                    165:              Final_Punctuation | Initial_Punctuation | Other_Punctuation |
                    166:              Open_Punctuation
                    167:     space    Space_Separator | Line_Separator | Paragraph_Separator |
                    168:              0009 | 000A | 000B | 000C | 000D | 0085
                    169:     upper    Uppercase_Letter
                    170:     xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
                    171:              (0-9, a-f, A-F)
                    172: 
                    173: 
                    174: 7. Extended groups
                    175: 
                    176:   (?#...)            comment
                    177: 
                    178:   (?imx-imx)         option on/off
                    179:                          i: ignore case
                    180:                          m: multi-line (dot(.) match newline)
                    181:                          x: extended form
                    182:   (?imx-imx:subexp)  option on/off for subexp
                    183: 
                    184:   (?:subexp)         not captured group
                    185:   (subexp)           captured group
                    186: 
                    187:   (?=subexp)         look-ahead
                    188:   (?!subexp)         negative look-ahead
                    189:   (?<=subexp)        look-behind
                    190:   (?<!subexp)        negative look-behind
                    191: 
                    192:                      Subexp of look-behind must be fixed character length.
                    193:                      But different character length is allowed in top level
                    194:                      alternatives only.
                    195:                      ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
                    196: 
                    197:                      In negative-look-behind, captured group isn't allowed, 
                    198:                      but shy group(?:) is allowed.
                    199: 
                    200:   (?>subexp)         atomic group
                    201:                      don't backtrack in subexp.
                    202: 
                    203:   (?<name>subexp)    define named group
                    204:                      (All characters of the name must be a word character.
                    205:                      And first character must not be a digit or upper case)
                    206: 
                    207:                      Not only a name but a number is assigned like a captured
                    208:                      group.
                    209: 
                    210:                      Assigning the same name to two or more subexps is allowed.
                    211:                      In this case, a subexp call can not be performed although
                    212:                      the back reference is possible.
                    213: 
                    214: 
                    215: 8. Back reference
                    216: 
                    217:   \n          back reference by group number (n >= 1)
                    218:   \k<name>    back reference by group name
                    219: 
                    220:   When a back reference uses a name that is defined more than once,
                    221:   the subexp with the largest number is referred to preferentially.
                    222:   (If it has not matched, a group with a smaller number is referred to.)
                    223: 
                    224:   * Back reference by group number is forbidden if a named group is defined
                    225:     in the pattern and ONIG_OPTION_CAPTURE_GROUP is not set.
                    226: 
                    227: 
                    228:   back reference with nest level
                    229: 
                    230:     (This function is disabled in Ruby 1.9.)
                    231: 
                    232:     \k<name+n>     n: 0, 1, 2, ...
                    233:     \k<name-n>     n: 0, 1, 2, ...
                    234: 
                    235:     Designates the nest level relative to the back reference position.
                    236: 
                    237:     ex 1.
                    238: 
                    239:       /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
                    240: 
                    241:     ex 2.
                    242: 
                    243:       r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED)
                    244:       (?<element> \g<stag> \g<content>* \g<etag> ){0}
                    245:       (?<stag> < \g<name> \s* > ){0}
                    246:       (?<name> [a-zA-Z_:]+ ){0}
                    247:       (?<content> [^<&]+ (\g<element> | [^<&]+)* ){0}
                    248:       (?<etag> </ \k<name+1> >){0}
                    249:       \g<element>
                    250:       __REGEXP__
                    251: 
                    252:       p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
                    253: 
                    254: 
                    255: 
                    256: 9. Subexp call ("Tanaka Akira special")
                    257: 
                    258:   \g<name>    call by group name
                    259:   \g<n>       call by group number (n >= 1)
                    260: 
                    261:   * left-most recursive call is not allowed.
                    262:      ex. (?<name>a|\g<name>b)   => error
                    263:          (?<name>a|b\g<name>c)  => OK
                    264: 
                    265:   * Call by group number is forbidden if a named group is defined in the pattern
                    266:     and ONIG_OPTION_CAPTURE_GROUP is not set.
                    267: 
                    268:   * If the option status of called group is different from calling position
                    269:     then the group's option is effective.
                    270: 
                    271:     ex. (?-i:\g<name>)(?i:(?<name>a)){0}  matches "A"
                    272: 
                    273: 
                    274: 10. Captured group
                    275: 
                    276:   Behavior of the unnamed group (...) changes with the following conditions.
                    277:   (But the behavior of a named group does not change.)
                    278: 
                    279:   case 1. /.../     (named group is not used, no option)
                    280: 
                    281:      (...) is treated as a captured group.
                    282: 
                    283:   case 2. /.../g    (named group is not used, 'g' option)
                    284: 
                    285:      (...) is treated as a non-captured group (?:...).
                    286: 
                    287:   case 3. /..(?<name>..)../   (named group is used, no option)
                    288: 
                    289:      (...) is treated as a non-captured group (?:...).
                    290:      numbered-backref/call is not allowed.
                    291: 
                    292:   case 4. /..(?<name>..)../G  (named group is used, 'G' option)
                    293: 
                    294:      (...) is treated as a captured group.
                    295:      numbered-backref/call is allowed.
                    296: 
                    297:   where
                    298:     g: ONIG_OPTION_DONT_CAPTURE_GROUP
                    299:     G: ONIG_OPTION_CAPTURE_GROUP
                    300: 
                    301:   ('g' and 'G' options were discussed on the ruby-dev ML)
                    302: 
                    303:   These options are not implemented in Ruby level.
                    304: 
                    305: 
                    306: -----------------------------
                    307: A-1. Syntax-dependent options
                    308: 
                    309:    + ONIG_SYNTAX_RUBY
                    310:      (?m): dot(.) match newline
                    311: 
                    312:    + ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
                    313:      (?s): dot(.) match newline
                    314:      (?m): ^ match after newline, $ match before newline
                    315: 
                    316: 
                    317: A-2. Original extensions
                    318: 
                    319:    + hexadecimal digit char type  \h, \H
                    320:    + named group                  (?<name>...)
                    321:    + named backref                \k<name>
                    322:    + subexp call                  \g<name>, \g<group-num>
                    323: 
                    324: 
                    325: A-3. Missing features compared with Perl 5.8.0
                    326: 
                    327:    + [:word:]
                    328:    + \N{name}
                    329:    + \l,\u,\L,\U, \X, \C
                    330:    + (?{code})
                    331:    + (??{code})
                    332:    + (?(condition)yes-pat|no-pat)
                    333: 
                    334:    * \Q...\E
                    335:      This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
                    336: 
                    337:    * \p{property}, \P{property}
                    338:      This is effective on ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA.
                    339:      Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
                    340:      Print, Punct, Space, Upper, XDigit, ASCII are supported.
                    341: 
                    342:      Prefix 'Is' of property name is allowed in ONIG_SYNTAX_PERL only.
                    343:      ex. \p{IsXDigit}.
                    344: 
                    345:      Negation operator of property is supported in ONIG_SYNTAX_PERL only.
                    346:      \p{^...}, \P{^...}
                    347: 
                    348: 
                    349: A-4. Differences with Japanized GNU regex(version 0.12) of Ruby
                    350: 
                    351:    + add hexadecimal digit char type (\h, \H)
                    352:    + add look-behind
                    353:      (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
                    354:    + add possessive quantifier. ?+, *+, ++
                    355:    + add operations in character class. [], &&
                    356:      ('[' must be escaped to be used as a normal char in a character class.)
                    357:    + add named group and subexp call.
                    358:    + octal or hexadecimal number sequence can be treated as 
                    359:      a multibyte code char in character class if multibyte encoding
                    360:      is specified.
                    361:      (ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1])
                    362:    + allow the range of single byte char and multibyte char in character
                    363:      class.
                    364:      ex. /[a-<<any EUC-JP character>>]/ in EUC-JP encoding.
                    365:    + effect range of isolated option is to next ')'.
                    366:      ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
                    367:    + isolated option is not transparent to previous pattern.
                    368:      ex. a(?i)* is a syntax error pattern.
                    369:    + allow an incomplete left brace as a usual string.
                    370:      ex. /{/, /({)/, /a{2,3/ etc...
                    371:    + negative POSIX bracket [:^xxxx:] is supported.
                    372:    + POSIX bracket [:ascii:] is added.
                    373:    + repeat of look-ahead is not allowed.
                    374:      ex. /(?=a)*/, /(?!b){5}/
                    375:    + Ignore case option is effective for characters given by numeric code.
                    376:      ex. /\x61/i =~ "A"
                    377:    + In the range quantifier, the minimum count may be omitted.
                    378:      /a{,n}/ == /a{0,n}/
                    379:      Omitting both the minimum and the maximum counts at the same time
                    380:      is not allowed. (/a{,}/)
                    381:    + /a{n}?/ is not a non-greedy operator.
                    382:      /a{n}?/ == /(?:a{n})?/
                    383:    + invalid back reference is checked and causes an error.
                    384:      /\1/, /(a)\2/
                    385:    + Zero-length match in infinite repeat stops the repeat,
                    386:      then changes of the capture group status are checked as stop condition.
                    387:      /(?:()|())*\1\2/ =~ ""
                    388:      /(?:\1a|())*/ =~ "a"
                    389: 
                    390: 
                    391: A-5. Disabled functions by default syntax
                    392: 
                    393:    + capture history
                    394: 
                    395:      (?@...) and (?@<name>...)
                    396: 
                    397:      ex. /(?@a)*/.match("aaa") ==> [<0-1>, <1-2>, <2-3>]
                    398: 
                    399:      see sample/listcap.c file.
                    400: 
                    401: 
                    402: A-6. Problems
                    403: 
                    404:    + Invalid encoding byte sequence is not checked in UTF-8.
                    405: 
                    406:      * Invalid first byte is treated as a character.
                    407:        /./u =~ "\xa3"
                    408: 
                    409:      * Incomplete byte sequence is not checked.
                    410:        /\w+/ =~ "a\xf3\x8ec"
                    411: 
                    412: // END

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>