From 3f48486a5cd2e66713430885f8826bd2af39412d Mon Sep 17 00:00:00 2001 From: Kornel Benko Date: Sun, 6 Oct 2024 18:04:00 +0200 Subject: [PATCH] Cmake build tests: check urls optimized The original test took about ~50 minutes. Making the checks in parallel, now it takes about 150 seconds. --- development/checkurls/CheckURL.pm | 173 ++++++------ development/checkurls/knownInvalidURLS | 1 + development/checkurls/knownToRegisterURLS | 1 + development/checkurls/search_url.pl | 326 ++++++++++++++-------- 4 files changed, 304 insertions(+), 197 deletions(-) diff --git a/development/checkurls/CheckURL.pm b/development/checkurls/CheckURL.pm index 4ab70abfcb..330f7fe3f7 100755 --- a/development/checkurls/CheckURL.pm +++ b/development/checkurls/CheckURL.pm @@ -1,5 +1,6 @@ # -*- mode: perl; -*- package CheckURL; + # file CheckURL.pm # # This file is part of LyX, the document processor. @@ -13,9 +14,10 @@ package CheckURL; # use strict; -our(@EXPORT, @ISA); +our (@EXPORT, @ISA); + BEGIN { - use Exporter (); + use Exporter (); @ISA = qw(Exporter); @EXPORT = qw(check_url); } @@ -25,16 +27,18 @@ sub check_http_url($$$$); sub check_ftp_dir_entry($$); sub check_ftp_url($$$$); sub check_unknown_url($$$$); -sub check_url($$); +sub check_url($$$$); ################ -sub check_http_url($$$$) -{ +my $fe; +my $fs; + +sub check_http_url($$$$) { require LWP::UserAgent; my ($protocol, $host, $path, $file) = @_; - my $ua = LWP::UserAgent->new; + my $ua = LWP::UserAgent->new(timeout => 20); my $getp = "/"; if ($path ne "") { $getp .= $path; @@ -54,11 +58,11 @@ sub check_http_url($$$$) $buf = $response->decoded_content; } else { - print " " . $response->status_line . ": "; + print $fe " " . $response->status_line . ": "; return 3; } my @title = (); - my $res = 0; + my $res = 0; while ($buf =~ s/\([^\<]*)\<\/title\>//i) { my $title = $1; $title =~ s/[\r\n]/ /g; @@ -66,9 +70,9 @@ sub check_http_url($$$$) $title =~ s/^ //; $title =~ s/ $//; push(@title, $title); - print "title = \"$title\": "; + print $fe "title = \"$title\": "; if ($title =~ /Error 404|Not Found/) { - print " Page reports 'Not Found' from \"$protocol://$host$getp\": "; + print $fe " Page reports 'Not Found' from \"$protocol://$host$getp\": "; $res = 3; } } @@ -79,39 +83,39 @@ sub check_http_url($$$$) # returns 0, x if file does not match entry # 1, x everything OK # 2, x if not accesible (permission) -sub check_ftp_dir_entry($$) -{ +sub check_ftp_dir_entry($$) { my ($file, $e) = @_; my $other = '---'; my $isdir = 0; - #print "Checking '$file' against '$e'\n"; + #print $fe "Checking '$file' against '$e'\n"; $file =~ s/^\///; $isdir = 1 if ($e =~ /^d/); - return(0,$isdir) if ($e !~ /\s$file$/); + return (0, $isdir) if ($e !~ /\s$file$/); if ($e =~ /^.[-rwx]{6}([-rwx]{3})\s/) { $other = $1; } else { - #print "Invalid entry\n"; + #print $fe "Invalid entry\n"; # Invalid entry - return(0,$isdir); + return (0, $isdir); } - return(2,$isdir) if ($other !~ /^r/); # not readable + return (2, $isdir) if ($other !~ /^r/); # not readable if ($isdir) { + #return(2,$isdir) if ($other !~ /x$/); # directory, but not executable } - return(1,$isdir); + return (1, $isdir); } -sub check_ftp2_url($$$$) -{ +sub check_ftp2_url($$$$) { my ($protocol, $host, $path, $file) = @_; my $checkentry = 1; - print "\nhost $host\n"; - print "path $path\n"; - print "file $file\n"; + + #print $fe "\nhost $host\n"; + #print $fe "path $path\n"; + #print $fe "file $file\n"; my $url = "$protocol://$host"; $path =~ s/\/$//; if (defined($file)) { @@ -120,7 +124,8 @@ sub check_ftp2_url($$$$) else { $url = "$url/$path/."; } - print "curl $url, file = $file\n"; + + #print $fe "curl $url, file = $file\n"; my %listfiles = (); if (open(FFTP, "curl --anyauth -l $url|")) { while (my $l = ) { @@ -130,44 +135,44 @@ sub check_ftp2_url($$$$) close(FFTP); } if (%listfiles) { - if (! defined($file)) { - return(0, "OK"); + if (!defined($file)) { + return (0, "OK"); } elsif (defined($listfiles{$file})) { - return(0, "OK"); + return (0, "OK"); } elsif (defined($listfiles{"ftpinfo.txt"})) { - return(0, "Probably a directory"); + return (0, "Probably a directory"); } else { - return(1, "Not found"); + return (1, "Not found"); } } else { - return(1, "Error"); + return (1, "Error"); } } -sub check_ftp_url($$$$) -{ +sub check_ftp_url($$$$) { use Net::FTP; my ($protocol, $host, $path, $file) = @_; - my $res = 0; + my $res = 0; my $message = ""; - my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 120); - if(!$ftp) { - return(3,"Cannot connect to $host"); + my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 20); + if (!$ftp) { + return (3, "Cannot connect to $host"); } - if (! $ftp->login("anonymous",'-anonymous@')) { + if (!$ftp->login("anonymous", '-anonymous@')) { $message = $ftp->message; - $res = 3; + $res = 3; } else { my $rEntries; if ($path ne "") { - #print "Path = $path\n"; + + #print $fe "Path = $path\n"; #if (!$ftp->cwd($path)) { # $message = $ftp->message; # $res = 3; @@ -177,41 +182,43 @@ sub check_ftp_url($$$$) else { $rEntries = $ftp->dir(); } - if (! $rEntries) { - $res = 3; + if (!$rEntries) { + $res = 3; $message = "Could not read directory \"$path\""; } elsif (defined($file)) { - my $found = 0; + my $found = 0; my $found2 = 0; - for my $f ( @{$rEntries}) { - #print "Entry: $path $f\n"; - my ($res1,$isdir) = check_ftp_dir_entry($file,$f); - if ($res1 == 1) { - $found = 1; - last; - } - elsif ($res1 == 2) { - # found, but not accessible - $found2 = 1; - $message = "Permission denied for '$file'"; - } + for my $f (@{$rEntries}) { + + #print $fe "Entry: $path $f\n"; + my ($res1, $isdir) = check_ftp_dir_entry($file, $f); + if ($res1 == 1) { + $found = 1; + last; + } + elsif ($res1 == 2) { + + # found, but not accessible + $found2 = 1; + $message = "Permission denied for '$file'"; + } } - if (! $found) { - $res = 4; - if (! $found2) { - $message = "File or directory '$file' not found"; - } + if (!$found) { + $res = 4; + if (!$found2) { + $message = "File or directory '$file' not found"; + } } } } $ftp->quit; - #print "returning ($res,$message)\n"; - return($res, $message); + + #print $fe "returning ($res,$message)\n"; + return ($res, $message); } -sub check_unknown_url($$$$) -{ +sub check_unknown_url($$$$) { use LWP::Simple; my ($protocol, $host, $path, $file) = @_; @@ -226,49 +233,53 @@ sub check_unknown_url($$$$) $url .= "/$path"; } } - if(defined($file)) { - #print "Trying $url$file\n"; + if (defined($file)) { + + #print $fe "Trying $url$file\n"; $res = head("$url/$file"); - if(! $res) { + if (!$res) { + # try to check for directory '/'; - #print "Trying $url$file/\n"; + #print $fe "Trying $url$file/\n"; $res = head("$url/$file/"); } } else { - #print "Trying $url\n"; + #print $fe "Trying $url\n"; $res = head($url); } - return(! $res); + return (!$res); } # # Main entry -sub check_url($$) -{ - my($url,$use_curl) = @_; +sub check_url($$$$) { + my ($url, $use_curl, $fex, $fsx) = @_; + $fe = $fex; + $fs = $fsx; my $file = undef; - my ($protocol,$host,$path); + my ($protocol, $host, $path); my $res = 0; # Split the url to protocol,host,path if ($url =~ /^([a-z]+):\/\/([^\/]+)(.*)$/) { $protocol = $1; - $host = $2; - $path = $3; + $host = $2; + $path = $3; $path =~ s/^\///; - if($path =~ s/\/([^\/]+)$//) { + if ($path =~ s/\/([^\/]+)$//) { $file = $1; - if($file =~ / /) { - # Filename contains ' ', maybe invalid. Don't check - $file = undef; + if ($file =~ / /) { + + # Filename contains ' ', maybe invalid. Don't check + $file = undef; } $path .= "/"; } } else { - print " Invalid url '$url'"; + print $fe " Invalid url '$url'"; return 2; } if ($protocol =~ /^https?$/) { @@ -286,7 +297,7 @@ sub check_url($$) } else { # it never should reach this point - print " What protocol is '$protocol'?"; + print $fe " What protocol is '$protocol'?"; $res = check_unknown_url($protocol, $host, $path, $file); return $res; } diff --git a/development/checkurls/knownInvalidURLS b/development/checkurls/knownInvalidURLS index 02509713e3..4157c712ef 100644 --- a/development/checkurls/knownInvalidURLS +++ b/development/checkurls/knownInvalidURLS @@ -2,3 +2,4 @@ http://www.uon.edu/doe ftp://www.test.test http://www.test.test #proto://host.xx.ab/abcd +http://example.com/%20foo diff --git a/development/checkurls/knownToRegisterURLS b/development/checkurls/knownToRegisterURLS index 335deae1d8..b6e0681024 100644 --- a/development/checkurls/knownToRegisterURLS +++ b/development/checkurls/knownToRegisterURLS @@ -10,6 +10,7 @@ http://jasa.peerx-press.org/html/jasa/Using_LaTeX http://spie.org/app/Publications/index.cfm?fuseaction=authinfo&type=proceedings http://www.jstatsoft.org/downloads/JSSstyle.zip http://www.photogrammetry.ethz.ch/tarasp_workshop/isprs.cls +https://journals.aps.org/revtex # The following ftp url is correct, but # ftp commands like 'dir', 'get' do not work. diff --git a/development/checkurls/search_url.pl b/development/checkurls/search_url.pl index 8bba11c1fc..04b11bfb1c 100755 --- a/development/checkurls/search_url.pl +++ b/development/checkurls/search_url.pl @@ -31,47 +31,55 @@ use strict; -BEGIN { +BEGIN { use File::Spec; my $p = File::Spec->rel2abs(__FILE__); $p =~ s/[\/\\]?[^\/\\]+$//; unshift(@INC, "$p"); } +use warnings; +use Cwd qw(abs_path); use CheckURL; use Try::Tiny; use locale; use POSIX qw(locale_h); -setlocale(LC_CTYPE, ""); +setlocale(LC_CTYPE, ""); setlocale(LC_MESSAGES, "en_US.UTF-8"); +use File::Temp qw/ tempfile tempdir /; +use File::Spec; +use Fcntl qw(:flock SEEK_END); + # Prototypes sub printNotUsedURLS($\%); sub replaceSpecialChar($); sub readUrls($\%); sub parse_file($ ); sub handle_url($$$ ); +sub printx($$$$); ########## -my %URLS = (); -my %ignoredURLS = (); -my %revertedURLS = (); -my %extraURLS = (); -my %selectedURLS = (); +my %URLS = (); +my %ignoredURLS = (); +my %revertedURLS = (); +my %extraURLS = (); +my %selectedURLS = (); my %knownToRegisterURLS = (); -my $summaryFile = undef; +my $summaryFile = undef; my $checkSelectedOnly = 0; for my $arg (@ARGV) { die("Bad argument \"$arg\"") if ($arg !~ /=/); - my ($type,$val) = split("=", $arg); + my ($type, $val) = split("=", $arg); if ($type eq "filesToScan") { + #The file should be a list of files to search in if (open(FLIST, $val)) { while (my $l = ) { - chomp($l); - parse_file($l); + chomp($l); + parse_file($l); } close(FLIST); } @@ -102,7 +110,9 @@ for my $arg (@ARGV) { } } -my @urls = sort keys %URLS, keys %extraURLS; +my @urls = sort keys %URLS, keys %extraURLS; +my @testvals = (); + # Tests #my @urls = ("ftp://ftp.edpsciences.org/pub/aa/readme.html", "ftp://ftp.springer.de/pub/tex/latex/compsc/proc/author"); my $errorcount = 0; @@ -126,57 +136,146 @@ for my $u (@urls) { if (defined($selectedURLS{$u})) { ${selectedURLS}{$u}->{count} += 1; } - next if ($checkSelectedOnly && ! defined($selectedURLS{$u})); + next if ($checkSelectedOnly && !defined($selectedURLS{$u})); $URLScount++; - print "Checking '$u': "; - my ($res, $prnt, $outSum); - try { - $res = check_url($u, $use_curl); - if ($res) { - print "Failed\n"; - $prnt = ""; - $outSum = 1; - } - else { - $prnt = "OK\n"; - $outSum = 0; + push(@testvals, {u => $u, use_curl => $use_curl,}); +} + +# Ready to go multitasking +my ($vol, $dir, $file) = File::Spec->splitpath($summaryFile); +my $tempdir = tempdir("$dir/CounterXXXXXXX", CLEANUP => 1); +my $countfile = "$tempdir/counter"; +my $counter = 0; +if (open(my $FO, '>', $countfile)) { + print {$FO} $counter; + close($FO); +} +else { + unlink($countfile); + die("Could not write to $countfile"); +} + +print "Using tempdir \"" . abs_path($tempdir) . "\"\n"; + +my @wait = (); +for (my $i = 0; $i < 10; $i++) { # Number of subprocesses + my $pid = fork(); + if ($pid == 0) { + + # I am child + open(my $fe, '>:encoding(UTF-8)', "$tempdir/xxxError$i"); + open(my $fs, '>:encoding(UTF-8)', "$tempdir/xxxSum$i"); + while (1) { + open(my $fh, '+<', $countfile) or die("cannot open $countfile"); + flock($fh, LOCK_EX) or die "$i: Cannot lock $countfile - $!\n"; + my $l = <$fh>; # get actual count number + my $diff = undef; + if (defined($testvals[$l + 150])) { + $diff = 5; + } + elsif (defined($testvals[$l + 50])) { + $diff = 3; + } + elsif (defined($testvals[$l + 20])) { + $diff = 2; + } + elsif (defined($testvals[$l])) { + $diff = 1; + } + else { + close($fs); + print $fe "NumberOfErrors $errorcount\n"; + close($fe); + exit(0); + } + my $next = $l + $diff; + seek($fh, 0, 0); + truncate($fh, 0); + print $fh $next; + close($fh); + for (my $i = 0; $i < $diff; $i++) { + my $entryidx = $l + $i; + my $rentry = $testvals[$entryidx]; + next if (!defined($rentry)); + my $u = $rentry->{u}; + my $use_curl = $rentry->{use_curl}; + + print $fe "Checking($entryidx) '$u': "; + my ($res, $prnt, $outSum); + try { + $res = check_url($u, $use_curl, $fe, $fs); + if ($res) { + print $fe "Failed\n"; + $prnt = ""; + $outSum = 1; + } + else { + $prnt = "OK\n"; + $outSum = 0; + } + } + catch { + $prnt = "Failed, caught error: $_\n"; + $outSum = 1; + $res = 700; + }; + printx("$prnt", $outSum, $fe, $fs); + my $printSourceFiles = 0; + my $err_txt = "Error url:"; + + if ($res || $checkSelectedOnly) { + $printSourceFiles = 1; + } + if ($res && defined($revertedURLS{$u})) { + $err_txt = "Failed url:"; + } + $res = !$res if (defined($revertedURLS{$u})); + if ($res || $checkSelectedOnly) { + printx("$err_txt \"$u\"\n", $outSum, $fe, $fs); + } + if ($printSourceFiles) { + if (defined($URLS{$u})) { + for my $f (sort keys %{$URLS{$u}}) { + my $lines = ":" . join(',', @{$URLS{$u}->{$f}}); + printx(" $f$lines\n", $outSum, $fe, $fs); + } + } + if ($res) { + $errorcount++; + } + } + } } } - catch { - $prnt = "Failed, caught error: $_\n"; - $outSum = 1; - $res = 700; - }; - printx("$prnt", $outSum); - my $printSourceFiles = 0; - my $err_txt = "Error url:"; - - if ($res || $checkSelectedOnly) { - $printSourceFiles = 1; - } - if ($res && defined($revertedURLS{$u})) { - $err_txt = "Failed url:"; - } - $res = ! $res if (defined($revertedURLS{$u})); - if ($res || $checkSelectedOnly) { - printx("$err_txt \"$u\"\n", $outSum); - } - if ($printSourceFiles) { - if (defined($URLS{$u})) { - for my $f(sort keys %{$URLS{$u}}) { - my $lines = ":" . join(',', @{$URLS{$u}->{$f}}); - printx(" $f$lines\n", $outSum); + $wait[$i] = $pid; +} + +for (my $i = 0; $i < 10; $i++) { + my $p = $wait[$i]; + if ($p > 0) { + waitpid($p, 0); + open(my $fe, '<', "$tempdir/xxxError$i"); + while (my $l = <$fe>) { + if ($l =~ /^NumberOfErrors\s(\d+)/) { + $errorcount += $1; + } + else { + print $l; } } - if ($res ) { - $errorcount++; + close($fe); + open(my $fs, '<', "$tempdir/xxxSum$i"); + while (my $l = <$fs>) { + print SFO $l; } + close($fs); } } +unlink($countfile); if (%URLS) { - printNotUsedURLS("Ignored", %ignoredURLS); - printNotUsedURLS("Selected", %selectedURLS); + printNotUsedURLS("Ignored", %ignoredURLS); + printNotUsedURLS("Selected", %selectedURLS); printNotUsedURLS("KnownInvalid", %extraURLS); } @@ -187,124 +286,119 @@ if (defined($summaryFile)) { exit($errorcount); ############################################################################### -sub printx($$) -{ - my ($txt, $outSum) = @_; - print "$txt"; +sub printx($$$$) { + my ($txt, $outSum, $fe, $fs) = @_; + print $fe "$txt"; if ($outSum && defined($summaryFile)) { - print SFO "$txt"; + print $fs "$txt"; } } -sub printNotUsedURLS($\%) -{ +sub printNotUsedURLS($\%) { my ($txt, $rURLS) = @_; my @msg = (); - for my $u ( sort keys %{$rURLS}) { + for my $u (sort keys %{$rURLS}) { if ($rURLS->{$u}->{count} < 2) { my @submsg = (); for my $f (sort keys %{$rURLS->{$u}}) { - next if ($f eq "count"); - push(@submsg, "$f:" . $rURLS->{$u}->{$f}); + next if ($f eq "count"); + push(@submsg, "$f:" . $rURLS->{$u}->{$f}); } push(@msg, "\n $u\n " . join("\n ", @submsg) . "\n"); } } if (@msg) { - print "\n$txt URLs not found in sources: " . join(' ',@msg) . "\n"; + print "\n$txt URLs not found in sources: " . join(' ', @msg) . "\n"; } } -sub replaceSpecialChar($) -{ +sub replaceSpecialChar($) { my ($l) = @_; $l =~ s/\\SpecialChar(NoPassThru)?\s*(TeX|LaTeX|LyX)[\s]?/\2/; - return($l); + return ($l); } -sub readUrls($\%) -{ +sub readUrls($\%) { my ($file, $rUrls) = @_; - die("Could not read file $file") if (! open(ULIST, $file)); + die("Could not read file $file") if (!open(ULIST, $file)); my $line = 0; while (my $l = ) { $line++; - $l =~ s/[\r\n]+$//; # remove eol - $l =~ s/\s*\#.*$//; # remove comment + $l =~ s/[\r\n]+$//; # remove eol + $l =~ s/\s*\#.*$//; # remove comment $l = &replaceSpecialChar($l); next if ($l eq ""); my $use_curl = 0; if ($l =~ s/^\s*UseCurl\s*//) { $use_curl = 1; } - if (! defined($rUrls->{$l} )) { + if (!defined($rUrls->{$l})) { $rUrls->{$l} = {$file => $line, count => 1, use_curl => $use_curl}; } } close(ULIST); } -sub parse_file($) -{ - my($f) = @_; - my $status = "out"; # outside of URL/href +sub parse_file($) { + my ($f) = @_; + my $status = "out"; # outside of URL/href return if ($f =~ /\/attic\//); - if(open(FI, $f)) { + if (open(FI, $f)) { my $line = 0; - while(my $l = ) { + while (my $l = ) { $line++; - $l =~ s/[\r\n]+$//; # Simulate chomp + $l =~ s/[\r\n]+$//; # Simulate chomp if ($status eq "out") { - # searching for "\begin_inset Flex URL" - if($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) { - $status = "inUrlInset"; - } - elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) { - $status = "inHrefInset"; - } - else { - # Outside of url, check also - if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) { - my $url = $1; - handle_url($url, $f, "x$line"); - } - } + + # searching for "\begin_inset Flex URL" + if ($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) { + $status = "inUrlInset"; + } + elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) { + $status = "inHrefInset"; + } + else { + # Outside of url, check also + if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) { + my $url = $1; + handle_url($url, $f, "x$line"); + } + } } else { - if($l =~ /^\s*\\end_(layout|inset)\s*$/) { - $status = "out"; - } - elsif ($status eq "inUrlInset") { - if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) { - my $url = $1; - $status = "out"; - handle_url($url, $f, "u$line"); - } - } - elsif ($status eq "inHrefInset") { - if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) { - my $url = $1; - $status = "out"; - handle_url($url, $f, "h$line"); - } - } + if ($l =~ /^\s*\\end_(layout|inset)\s*$/) { + $status = "out"; + } + elsif ($status eq "inUrlInset") { + if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) { + my $url = $1; + $status = "out"; + handle_url($url, $f, "u$line"); + } + } + elsif ($status eq "inHrefInset") { + if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) { + my $url = $1; + $status = "out"; + handle_url($url, $f, "h$line"); + } + } } } close(FI); } } -sub handle_url($$$) -{ - my($url, $f, $line) = @_; +sub handle_url($$$) { + my ($url, $f, $line) = @_; $url = &replaceSpecialChar($url); - if(!defined($URLS{$url})) { + if (!defined($URLS{$url})) { $URLS{$url} = {}; } - if(!defined($URLS{$url}->{$f})) { + if (!defined($URLS{$url}->{$f})) { $URLS{$url}->{$f} = []; } push(@{$URLS{$url}->{$f}}, $line); -- 2.39.5