--- embedaddon/pcre/perltest.pl 2012/02/21 23:05:51 1.1.1.1 +++ embedaddon/pcre/perltest.pl 2014/06/15 19:46:04 1.1.1.4 @@ -1,16 +1,16 @@ #! /usr/bin/env perl # Program for testing regular expressions with perl to check that PCRE handles -# them the same. This is the version that supports /8 for UTF-8 testing. As it -# stands, it requires at least Perl 5.8 for UTF-8 support. However, it needs to -# have "use utf8" at the start for running the UTF-8 tests, but *not* for the -# other tests. The only way I've found for doing this is to cat this line in -# explicitly in the RunPerlTest script. +# them the same. This version needs to have "use utf8" at the start for running +# the UTF-8 tests, but *not* for the other tests. The only way I've found for +# doing this is to cat this line in explicitly in the RunPerlTest script. I've +# also used this method to supply "require Encode" for the UTF-8 tests, so that +# the main test will still run where Encode is not installed. -# use locale; # With this included, \x0b matches \s! +#use utf8; +#require Encode; -# Function for turning a string into a string of printing chars. There are -# currently problems with UTF-8 strings; this fudges round them. +# Function for turning a string into a string of printing chars. sub pchars { my($t) = ""; @@ -21,10 +21,10 @@ if ($utf8) foreach $c (@p) { if ($c >= 32 && $c < 127) { $t .= chr $c; } - else { $t .= sprintf("\\x{%02x}", $c); } + else { $t .= sprintf("\\x{%02x}", $c); + } } } - else { foreach $c (split(//, $_[0])) @@ -68,7 +68,7 @@ for (;;) printf " re> " if $infile eq "STDIN"; last if ! ($_ = <$infile>); printf $outfile "$_" if $infile ne "STDIN"; - next if ($_ eq ""); + next if ($_ =~ /^\s*$/ || $_ =~ /^< forbid/); $pattern = $_; @@ -103,17 +103,17 @@ for (;;) $pattern =~ s/K(?=[a-zA-Z]*$)//; - # Remove /W from a pattern (asks pcretest to set PCRE_UCP) + # /W asks pcretest to set PCRE_UCP; change this to /u for Perl - $pattern =~ s/W(?=[a-zA-Z]*$)//; + $pattern =~ s/W(?=[a-zA-Z]*$)/u/; # Remove /S or /SS from a pattern (asks pcretest to study or not to study) $pattern =~ s/S(?=[a-zA-Z]*$)//g; - # Remove /Y from a pattern (asks pcretest to disable PCRE optimization) + # Remove /Y and /O from a pattern (disable PCRE optimizations) - $pattern =~ s/Y(?=[a-zA-Z]*$)//; + $pattern =~ s/[YO](?=[a-zA-Z]*$)//; # Check that the pattern is valid @@ -192,7 +192,7 @@ for (;;) { printf $outfile "No match"; if (defined $REGERROR && $REGERROR != 1) - { print $outfile (", mark = $REGERROR"); } + { printf $outfile (", mark = %s", &pchars($REGERROR)); } printf $outfile "\n"; } else @@ -214,8 +214,17 @@ for (;;) } splice(@subs, 0, 18); } + + # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is + # set and the input pattern was a UTF-8 string. We can, however, force + # it to be so marked. + if (defined $REGMARK && $REGMARK != 1) - { print $outfile ("MK: $REGMARK\n"); } + { + $xx = $REGMARK; + $xx = Encode::decode_utf8($xx) if $utf8; + printf $outfile ("MK: %s\n", &pchars($xx)); + } } } }