Annotation of embedaddon/php/ext/ereg/regex/tests, revision 1.1.1.1
1.1 misho 1: # regular expression test set
2: # Each line has at least three fields, separated by one or more tabs. "" stands
3: # for an empty field. The first field is an RE. The second field is flags. If
4: # the C flag is given, regcomp() is expected to fail, and the third field is the
5: # error name (minus the leading REG_).
6: #
7: # Otherwise it is expected to succeed, and the third field is the string to
8: # try matching it against. If there is no fourth field, the match is
9: # expected to fail. If there is a fourth field, it is the substring that
10: # the RE is expected to match. If there is a fifth field, it is a comma-
11: # separated list of what the subexpressions should match, with - indicating
12: # no match for that one. In both the fourth and fifth fields, a (sub)field
13: # starting with @ indicates that the (sub)expression is expected to match
14: # a null string followed by the stuff after the @; this provides a way to
15: # test where null strings match. The character `N' in REs and strings
16: # is newline, `S' is space, `T' is tab, `Z' is NUL.
17: #
18: # The full list of flags:
19: # - placeholder, does nothing
20: # b RE is a BRE, not an ERE
21: # & try it as both an ERE and a BRE
22: # C regcomp() error expected, third field is error name
23: # i REG_ICASE
24: # m ("mundane") REG_NOSPEC
25: # s REG_NOSUB (not really testable)
26: # n REG_NEWLINE
27: # ^ REG_NOTBOL
28: # $ REG_NOTEOL
29: # # REG_STARTEND (see below)
30: # p REG_PEND
31: #
32: # For REG_STARTEND, the start/end offsets are those of the substring of the
33: # test string (third field) that is enclosed in ().
34:
35: # basics
36: a & a a
37: abc & abc abc
38: abc|de - abc abc
39: a|b|c - abc a
40:
41: # parentheses and perversions thereof
42: a(b)c - abc abc
43: a\(b\)c b abc abc
44: a( C EPAREN
45: a( b a( a(
46: a\( - a( a(
47: a\( bC EPAREN
48: a\(b bC EPAREN
49: a(b C EPAREN
50: a(b b a(b a(b
51: # gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
52: a) - a) a)
53: ) - ) )
54: # end gagging (in a just world, those *should* give EPAREN)
55: a) b a) a)
56: a\) bC EPAREN
57: \) bC EPAREN
58: a()b - ab ab
59: a\(\)b b ab ab
60:
61: # anchoring and REG_NEWLINE
62: ^abc$ & abc abc
63: a^b - a^b
64: a^b b a^b a^b
65: a$b - a$b
66: a$b b a$b a$b
67: ^ & abc @abc
68: $ & abc @
69: ^$ & "" @
70: $^ - "" @
71: \($\)\(^\) b "" @
72: # stop retching, those are legitimate (although disgusting)
73: ^^ - "" @
74: $$ - "" @
75: b$ & abNc
76: b$ &n abNc b
77: ^b$ & aNbNc
78: ^b$ &n aNbNc b
79: ^$ &n aNNb @Nb
80: ^$ n abc
81: ^$ n abcN @
82: $^ n aNNb @Nb
83: \($\)\(^\) bn aNNb @Nb
84: ^^ n^ aNNb @Nb
85: $$ n aNNb @NN
86: ^a ^ a
87: a$ $ a
88: ^a ^n aNb
89: ^b ^n aNb b
90: a$ $n bNa
91: b$ $n bNa b
92: a*(^b$)c* - b b
93: a*\(^b$\)c* b b b
94:
95: # certain syntax errors and non-errors
96: | C EMPTY
97: | b | |
98: * C BADRPT
99: * b * *
100: + C BADRPT
101: ? C BADRPT
102: "" &C EMPTY
103: () - abc @abc
104: \(\) b abc @abc
105: a||b C EMPTY
106: |ab C EMPTY
107: ab| C EMPTY
108: (|a)b C EMPTY
109: (a|)b C EMPTY
110: (*a) C BADRPT
111: (+a) C BADRPT
112: (?a) C BADRPT
113: ({1}a) C BADRPT
114: \(\{1\}a\) bC BADRPT
115: (a|*b) C BADRPT
116: (a|+b) C BADRPT
117: (a|?b) C BADRPT
118: (a|{1}b) C BADRPT
119: ^* C BADRPT
120: ^* b * *
121: ^+ C BADRPT
122: ^? C BADRPT
123: ^{1} C BADRPT
124: ^\{1\} bC BADRPT
125:
126: # metacharacters, backslashes
127: a.c & abc abc
128: a[bc]d & abd abd
129: a\*c & a*c a*c
130: a\\b & a\b a\b
131: a\\\*b & a\*b a\*b
132: a\bc & abc abc
133: a\ &C EESCAPE
134: a\\bc & a\bc a\bc
135: \{ bC BADRPT
136: a\[b & a[b a[b
137: a[b &C EBRACK
138: # trailing $ is a peculiar special case for the BRE code
139: a$ & a a
140: a$ & a$
141: a\$ & a
142: a\$ & a$ a$
143: a\\$ & a
144: a\\$ & a$
145: a\\$ & a\$
146: a\\$ & a\ a\
147:
148: # back references, ugh
149: a\(b\)\2c bC ESUBREG
150: a\(b\1\)c bC ESUBREG
151: a\(b*\)c\1d b abbcbbd abbcbbd bb
152: a\(b*\)c\1d b abbcbd
153: a\(b*\)c\1d b abbcbbbd
154: ^\(.\)\1 b abc
155: a\([bc]\)\1d b abcdabbd abbd b
156: a\(\([bc]\)\2\)*d b abbccd abbccd
157: a\(\([bc]\)\2\)*d b abbcbd
158: # actually, this next one probably ought to fail, but the spec is unclear
159: a\(\(b\)*\2\)*d b abbbd abbbd
160: # here is a case that no NFA implementation does right
161: \(ab*\)[ab]*\1 b ababaaa ababaaa a
162: # check out normal matching in the presence of back refs
163: \(a\)\1bcd b aabcd aabcd
164: \(a\)\1bc*d b aabcd aabcd
165: \(a\)\1bc*d b aabd aabd
166: \(a\)\1bc*d b aabcccd aabcccd
167: \(a\)\1bc*[ce]d b aabcccd aabcccd
168: ^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd
169:
170: # ordinary repetitions
171: ab*c & abc abc
172: ab+c - abc abc
173: ab?c - abc abc
174: a\(*\)b b a*b a*b
175: a\(**\)b b ab ab
176: a\(***\)b bC BADRPT
177: *a b *a *a
178: **a b a a
179: ***a bC BADRPT
180:
181: # the dreaded bounded repetitions
182: { & { {
183: {abc & {abc {abc
184: {1 C BADRPT
185: {1} C BADRPT
186: a{b & a{b a{b
187: a{1}b - ab ab
188: a\{1\}b b ab ab
189: a{1,}b - ab ab
190: a\{1,\}b b ab ab
191: a{1,2}b - aab aab
192: a\{1,2\}b b aab aab
193: a{1 C EBRACE
194: a\{1 bC EBRACE
195: a{1a C EBRACE
196: a\{1a bC EBRACE
197: a{1a} C BADBR
198: a\{1a\} bC BADBR
199: a{,2} - a{,2} a{,2}
200: a\{,2\} bC BADBR
201: a{,} - a{,} a{,}
202: a\{,\} bC BADBR
203: a{1,x} C BADBR
204: a\{1,x\} bC BADBR
205: a{1,x C EBRACE
206: a\{1,x bC EBRACE
207: a{300} C BADBR
208: a\{300\} bC BADBR
209: a{1,0} C BADBR
210: a\{1,0\} bC BADBR
211: ab{0,0}c - abcac ac
212: ab\{0,0\}c b abcac ac
213: ab{0,1}c - abcac abc
214: ab\{0,1\}c b abcac abc
215: ab{0,3}c - abbcac abbc
216: ab\{0,3\}c b abbcac abbc
217: ab{1,1}c - acabc abc
218: ab\{1,1\}c b acabc abc
219: ab{1,3}c - acabc abc
220: ab\{1,3\}c b acabc abc
221: ab{2,2}c - abcabbc abbc
222: ab\{2,2\}c b abcabbc abbc
223: ab{2,4}c - abcabbc abbc
224: ab\{2,4\}c b abcabbc abbc
225: ((a{1,10}){1,10}){1,10} - a a a,a
226:
227: # multiple repetitions
228: a** &C BADRPT
229: a++ C BADRPT
230: a?? C BADRPT
231: a*+ C BADRPT
232: a*? C BADRPT
233: a+* C BADRPT
234: a+? C BADRPT
235: a?* C BADRPT
236: a?+ C BADRPT
237: a{1}{1} C BADRPT
238: a*{1} C BADRPT
239: a+{1} C BADRPT
240: a?{1} C BADRPT
241: a{1}* C BADRPT
242: a{1}+ C BADRPT
243: a{1}? C BADRPT
244: a*{b} - a{b} a{b}
245: a\{1\}\{1\} bC BADRPT
246: a*\{1\} bC BADRPT
247: a\{1\}* bC BADRPT
248:
249: # brackets, and numerous perversions thereof
250: a[b]c & abc abc
251: a[ab]c & abc abc
252: a[^ab]c & adc adc
253: a[]b]c & a]c a]c
254: a[[b]c & a[c a[c
255: a[-b]c & a-c a-c
256: a[^]b]c & adc adc
257: a[^-b]c & adc adc
258: a[b-]c & a-c a-c
259: a[b &C EBRACK
260: a[] &C EBRACK
261: a[1-3]c & a2c a2c
262: a[3-1]c &C ERANGE
263: a[1-3-5]c &C ERANGE
264: a[[.-.]--]c & a-c a-c
265: a[1- &C ERANGE
266: a[[. &C EBRACK
267: a[[.x &C EBRACK
268: a[[.x. &C EBRACK
269: a[[.x.] &C EBRACK
270: a[[.x.]] & ax ax
271: a[[.x,.]] &C ECOLLATE
272: a[[.one.]]b & a1b a1b
273: a[[.notdef.]]b &C ECOLLATE
274: a[[.].]]b & a]b a]b
275: a[[:alpha:]]c & abc abc
276: a[[:notdef:]]c &C ECTYPE
277: a[[: &C EBRACK
278: a[[:alpha &C EBRACK
279: a[[:alpha:] &C EBRACK
280: a[[:alpha,:] &C ECTYPE
281: a[[:]:]]b &C ECTYPE
282: a[[:-:]]b &C ECTYPE
283: a[[:alph:]] &C ECTYPE
284: a[[:alphabet:]] &C ECTYPE
285: [[:alnum:]]+ - -%@a0X- a0X
286: [[:alpha:]]+ - -%@aX0- aX
287: [[:blank:]]+ - aSSTb SST
288: [[:cntrl:]]+ - aNTb NT
289: [[:digit:]]+ - a019b 019
290: [[:graph:]]+ - Sa%bS a%b
291: [[:lower:]]+ - AabC ab
292: [[:print:]]+ - NaSbN aSb
293: [[:punct:]]+ - S%-&T %-&
294: [[:space:]]+ - aSNTb SNT
295: [[:upper:]]+ - aBCd BC
296: [[:xdigit:]]+ - p0f3Cq 0f3C
297: a[[=b=]]c & abc abc
298: a[[= &C EBRACK
299: a[[=b &C EBRACK
300: a[[=b= &C EBRACK
301: a[[=b=] &C EBRACK
302: a[[=b,=]] &C ECOLLATE
303: a[[=one=]]b & a1b a1b
304:
305: # complexities
306: a(((b)))c - abc abc
307: a(b|(c))d - abd abd
308: a(b*|c)d - abbd abbd
309: # just gotta have one DFA-buster, of course
310: a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
311: # and an inline expansion in case somebody gets tricky
312: a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
313: # and in case somebody just slips in an NFA...
314: a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
315: # fish for anomalies as the number of states passes 32
316: 12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
317: 123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
318: 1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
319: 12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
320: 123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
321: # and one really big one, beyond any plausible word width
322: 1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
323: # fish for problems as brackets go past 8
324: [ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
325: [ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
326: [ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
327: [ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
328:
329: # subtleties of matching
330: abc & xabcy abc
331: a\(b\)?c\1d b acd
332: aBc i Abc Abc
333: a[Bc]*d i abBCcd abBCcd
334: 0[[:upper:]]1 &i 0a1 0a1
335: 0[[:lower:]]1 &i 0A1 0A1
336: a[^b]c &i abc
337: a[^b]c &i aBc
338: a[^b]c &i adc adc
339: [a]b[c] - abc abc
340: [a]b[a] - aba aba
341: [abc]b[abc] - abc abc
342: [abc]b[abd] - abd abd
343: a(b?c)+d - accd accd
344: (wee|week)(knights|night) - weeknights weeknights
345: (we|wee|week|frob)(knights|night|day) - weeknights weeknights
346: a[bc]d - xyzaaabcaababdacd abd
347: a[ab]c - aaabc abc
348: abc s abc abc
349: a* & b @b
350:
351: # Let's have some fun -- try to match a C comment.
352: # first the obvious, which looks okay at first glance...
353: /\*.*\*/ - /*x*/ /*x*/
354: # but...
355: /\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/
356: # okay, we must not match */ inside; try to do that...
357: /\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/
358: /\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/
359: # but...
360: /\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/
361: # and a still fancier version, which does it right (I think)...
362: /\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/
363: /\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/
364: /\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/
365: /\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/
366: /\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/
367: /\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/
368:
369: # subexpressions
370: a(b)(c)d - abcd abcd b,c
371: a(((b)))c - abc abc b,b,b
372: a(b|(c))d - abd abd b,-
373: a(b*|c|e)d - abbd abbd bb
374: a(b*|c|e)d - acd acd c
375: a(b*|c|e)d - ad ad @d
376: a(b?)c - abc abc b
377: a(b?)c - ac ac @c
378: a(b+)c - abc abc b
379: a(b+)c - abbbc abbbc bbb
380: a(b*)c - ac ac @c
381: (a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
382: # the regression tester only asks for 9 subexpressions
383: a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
384: a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
385: a([bc]?)c - abc abc b
386: a([bc]?)c - ac ac @c
387: a([bc]+)c - abc abc b
388: a([bc]+)c - abcc abcc bc
389: a([bc]+)bc - abcbc abcbc bc
390: a(bb+|b)b - abb abb b
391: a(bbb+|bb+|b)b - abb abb b
392: a(bbb+|bb+|b)b - abbb abbb bb
393: a(bbb+|bb+|b)bb - abbb abbb b
394: (.*).* - abcdef abcdef abcdef
395: (a*)* - bc @b @b
396:
397: # do we get the right subexpression when it is used more than once?
398: a(b|c)*d - ad ad -
399: a(b|c)*d - abcd abcd c
400: a(b|c)+d - abd abd b
401: a(b|c)+d - abcd abcd c
402: a(b|c?)+d - ad ad @d
403: a(b|c?)+d - abcd abcd @d
404: a(b|c){0,0}d - ad ad -
405: a(b|c){0,1}d - ad ad -
406: a(b|c){0,1}d - abd abd b
407: a(b|c){0,2}d - ad ad -
408: a(b|c){0,2}d - abcd abcd c
409: a(b|c){0,}d - ad ad -
410: a(b|c){0,}d - abcd abcd c
411: a(b|c){1,1}d - abd abd b
412: a(b|c){1,1}d - acd acd c
413: a(b|c){1,2}d - abd abd b
414: a(b|c){1,2}d - abcd abcd c
415: a(b|c){1,}d - abd abd b
416: a(b|c){1,}d - abcd abcd c
417: a(b|c){2,2}d - acbd acbd b
418: a(b|c){2,2}d - abcd abcd c
419: a(b|c){2,4}d - abcd abcd c
420: a(b|c){2,4}d - abcbd abcbd b
421: a(b|c){2,4}d - abcbcd abcbcd c
422: a(b|c){2,}d - abcd abcd c
423: a(b|c){2,}d - abcbd abcbd b
424: a(b+|((c)*))+d - abd abd @d,@d,-
425: a(b+|((c)*))+d - abcd abcd @d,@d,-
426:
427: # check out the STARTEND option
428: [abc] &# a(b)c b
429: [abc] &# a(d)c
430: [abc] &# a(bc)d b
431: [abc] &# a(dc)d c
432: . &# a()c
433: b.*c &# b(bc)c bc
434: b.* &# b(bc)c bc
435: .*c &# b(bc)c bc
436:
437: # plain strings, with the NOSPEC flag
438: abc m abc abc
439: abc m xabcy abc
440: abc m xyz
441: a*b m aba*b a*b
442: a*b m ab
443: "" mC EMPTY
444:
445: # cases involving NULs
446: aZb & a a
447: aZb &p a
448: aZb &p# (aZb) aZb
449: aZ*b &p# (ab) ab
450: a.b &# (aZb) aZb
451: a.* &# (aZb)c aZb
452:
453: # word boundaries (ick)
454: [[:<:]]a & a a
455: [[:<:]]a & ba
456: [[:<:]]a & -a a
457: a[[:>:]] & a a
458: a[[:>:]] & ab
459: a[[:>:]] & a- a
460: [[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
461: [[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
462: [[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
463: [[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
464: [[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
465: [[:<:]]a_b[[:>:]] & x_a_b
466:
467: # past problems, and suspected problems
468: (A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1
469: abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
470: abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
471: (ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11
472: CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
473: Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
474: a?b - ab ab
475: -\{0,1\}[0-9]*$ b -5 -5
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>