embedaddon/pcre/perltest.pl - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / pcre / perltest.pl
Revision 1.1.1.3 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 08:25:55 2013 UTC (10 years, 11 months ago) by misho
Branches: pcre, MAIN
CVS tags: v8_33, HEAD

8.33

1: #! /usr/bin/env perl 2: 3: # Program for testing regular expressions with perl to check that PCRE handles 4: # them the same. This version supports /8 for UTF-8 testing. However, it needs 5: # to have "use utf8" at the start for running the UTF-8 tests, but *not* for 6: # the other tests. The only way I've found for doing this is to cat this line 7: # in explicitly in the RunPerlTest script. I've also used this method to supply 8: # "require Encode" for the UTF-8 tests, so that the main test will still run 9: # where Encode is not installed. 10: 11: # use locale; # With this included, \x0b matches \s! 12: 13: # Function for turning a string into a string of printing chars. 14: 15: #use utf8; 16: #require Encode; 17: 18: sub pchars { 19: my($t) = ""; 20: 21: if ($utf8) 22: { 23: @p = unpack('U*', $_[0]); 24: foreach $c (@p) 25: { 26: if ($c >= 32 && $c < 127) { $t .= chr $c; } 27: else { $t .= sprintf("\\x{%02x}", $c); 28: } 29: } 30: } 31: else 32: { 33: foreach $c (split(//, $_[0])) 34: { 35: if (ord $c >= 32 && ord $c < 127) { $t .= $c; } 36: else { $t .= sprintf("\\x%02x", ord $c); } 37: } 38: } 39: 40: $t; 41: } 42: 43: 44: # Read lines from named file or stdin and write to named file or stdout; lines 45: # consist of a regular expression, in delimiters and optionally followed by 46: # options, followed by a set of test data, terminated by an empty line. 47: 48: # Sort out the input and output files 49: 50: if (@ARGV > 0) 51: { 52: open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n"; 53: $infile = "INFILE"; 54: } 55: else { $infile = "STDIN"; } 56: 57: if (@ARGV > 1) 58: { 59: open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n"; 60: $outfile = "OUTFILE"; 61: } 62: else { $outfile = "STDOUT"; } 63: 64: printf($outfile "Perl $] Regular Expressions\n\n"); 65: 66: # Main loop 67: 68: NEXT_RE: 69: for (;;) 70: { 71: printf " re> " if $infile eq "STDIN"; 72: last if ! ($_ = <$infile>); 73: printf $outfile "$_" if $infile ne "STDIN"; 74: next if ($_ eq ""); 75: 76: $pattern = $_; 77: 78: while ($pattern !~ /^\s*(.).*\1/s) 79: { 80: printf " > " if $infile eq "STDIN"; 81: last if ! ($_ = <$infile>); 82: printf $outfile "$_" if $infile ne "STDIN"; 83: $pattern .= $_; 84: } 85: 86: chomp($pattern); 87: $pattern =~ s/\s+$//; 88: 89: # The private /+ modifier means "print $' afterwards". 90: 91: $showrest = ($pattern =~ s/\+(?=[a-zA-Z]*$)//); 92: 93: # A doubled version is used by pcretest to print remainders after captures 94: 95: $pattern =~ s/\+(?=[a-zA-Z]*$)//; 96: 97: # Remove /8 from a UTF-8 pattern. 98: 99: $utf8 = $pattern =~ s/8(?=[a-zA-Z]*$)//; 100: 101: # Remove /J from a pattern with duplicate names. 102: 103: $pattern =~ s/J(?=[a-zA-Z]*$)//; 104: 105: # Remove /K from a pattern (asks pcretest to check MARK data) */ 106: 107: $pattern =~ s/K(?=[a-zA-Z]*$)//; 108: 109: # Remove /W from a pattern (asks pcretest to set PCRE_UCP) 110: 111: $pattern =~ s/W(?=[a-zA-Z]*$)//; 112: 113: # Remove /S or /SS from a pattern (asks pcretest to study or not to study) 114: 115: $pattern =~ s/S(?=[a-zA-Z]*$)//g; 116: 117: # Remove /Y from a pattern (asks pcretest to disable PCRE optimization) 118: 119: $pattern =~ s/Y(?=[a-zA-Z]*$)//; 120: 121: # Check that the pattern is valid 122: 123: eval "\$_ =~ ${pattern}"; 124: if ($@) 125: { 126: printf $outfile "Error: $@"; 127: next NEXT_RE; 128: } 129: 130: # If the /g modifier is present, we want to put a loop round the matching; 131: # otherwise just a single "if". 132: 133: $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if"; 134: 135: # If the pattern is actually the null string, Perl uses the most recently 136: # executed (and successfully compiled) regex is used instead. This is a 137: # nasty trap for the unwary! The PCRE test suite does contain null strings 138: # in places - if they are allowed through here all sorts of weird and 139: # unexpected effects happen. To avoid this, we replace such patterns with 140: # a non-null pattern that has the same effect. 141: 142: $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/); 143: 144: # Read data lines and test them 145: 146: for (;;) 147: { 148: printf "data> " if $infile eq "STDIN"; 149: last NEXT_RE if ! ($_ = <$infile>); 150: chomp; 151: printf $outfile "$_\n" if $infile ne "STDIN"; 152: 153: s/\s+$//; # Remove trailing space 154: s/^\s+//; # Remove leading space 155: s/\\Y//g; # Remove \Y (pcretest flag to set PCRE_NO_START_OPTIMIZE) 156: 157: last if ($_ eq ""); 158: $x = eval "\"$_\""; # To get escapes processed 159: 160: # Empty array for holding results, ensure $REGERROR and $REGMARK are 161: # unset, then do the matching. 162: 163: @subs = (); 164: 165: $pushes = "push \@subs,\$&;" . 166: "push \@subs,\$1;" . 167: "push \@subs,\$2;" . 168: "push \@subs,\$3;" . 169: "push \@subs,\$4;" . 170: "push \@subs,\$5;" . 171: "push \@subs,\$6;" . 172: "push \@subs,\$7;" . 173: "push \@subs,\$8;" . 174: "push \@subs,\$9;" . 175: "push \@subs,\$10;" . 176: "push \@subs,\$11;" . 177: "push \@subs,\$12;" . 178: "push \@subs,\$13;" . 179: "push \@subs,\$14;" . 180: "push \@subs,\$15;" . 181: "push \@subs,\$16;" . 182: "push \@subs,\$'; }"; 183: 184: undef $REGERROR; 185: undef $REGMARK; 186: 187: eval "${cmd} (\$x =~ ${pattern}) {" . $pushes; 188: 189: if ($@) 190: { 191: printf $outfile "Error: $@\n"; 192: next NEXT_RE; 193: } 194: elsif (scalar(@subs) == 0) 195: { 196: printf $outfile "No match"; 197: if (defined $REGERROR && $REGERROR != 1) 198: { printf $outfile (", mark = %s", &pchars($REGERROR)); } 199: printf $outfile "\n"; 200: } 201: else 202: { 203: while (scalar(@subs) != 0) 204: { 205: printf $outfile (" 0: %s\n", &pchars($subs[0])); 206: printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest; 207: $last_printed = 0; 208: for ($i = 1; $i <= 16; $i++) 209: { 210: if (defined $subs[$i]) 211: { 212: while ($last_printed++ < $i-1) 213: { printf $outfile ("%2d: <unset>\n", $last_printed); } 214: printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i])); 215: $last_printed = $i; 216: } 217: } 218: splice(@subs, 0, 18); 219: } 220: 221: # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is 222: # set and the input pattern was a UTF-8 string. We can, however, force 223: # it to be so marked. 224: 225: if (defined $REGMARK && $REGMARK != 1) 226: { 227: $xx = $REGMARK; 228: $xx = Encode::decode_utf8($xx) if $utf8; 229: printf $outfile ("MK: %s\n", &pchars($xx)); 230: } 231: } 232: } 233: } 234: 235: # printf $outfile "\n"; 236: 237: # End 238: