development/checkurls/search_url.pl

   1 #! /usr/bin/env perl
   2 # -*- mode: perl; -*-
   3 #
   4 # file search_url.pl
   5 # script to search for url's in lyxfiles
   6 # and testing their validity.
   7 #
   8 # Syntax: search_url.pl [(filesToScan|(ignored|reverted|extra|selected)URLS)={path_to_control]*
   9 # Param value is a path to a file containing list of xxx:
  10 # filesToScan={xxx = lyx-file-names to be scanned for}
  11 # ignoredURLS={xxx = urls that are discarded from test}
  12 # revertedURLS={xxx = urls that should fail, to test the test with invalid urls}
  13 # extraURLS={xxx = urls which should be also checked}
  14 #
  15 # This file is free software; you can redistribute it and/or
  16 # modify it under the terms of the GNU General Public
  17 # License as published by the Free Software Foundation; either
  18 # version 2 of the License, or (at your option) any later version.
  19 #
  20 # This software is distributed in the hope that it will be useful,
  21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23 # General Public License for more details.
  24 #
  25 # You should have received a copy of the GNU General Public
  26 # License along with this software; if not, write to the Free Software
  27 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28 #
  29 # Copyright (c) 2013 Kornel Benko <kornel@lyx.org>
  30 #           (c) 2013 Scott Kostyshak <skotysh@lyx.org>
  31
  32 use strict;
  33
  34 BEGIN  {
  35   use File::Spec;
  36   my $p = File::Spec->rel2abs(__FILE__);
  37   $p =~ s/[\/\\]?[^\/\\]+$//;
  38   unshift(@INC, "$p");
  39 }
  40
  41 use CheckURL;
  42 use Try::Tiny;
  43 use locale;
  44 use POSIX qw(locale_h);
  45
  46 setlocale(LC_CTYPE, "");
  47 setlocale(LC_MESSAGES, "en_US.UTF-8");
  48
  49 # Prototypes
  50 sub printNotUsedURLS($\%);
  51 sub readUrls($\%);
  52 sub parse_file($ );
  53 sub handle_url($$$ );
  54 ##########
  55
  56 my %URLS = ();
  57 my %ignoredURLS = ();
  58 my %revertedURLS = ();
  59 my %extraURLS = ();
  60 my %selectedURLS = ();
  61
  62 my $checkSelectedOnly = 0;
  63 for my $arg (@ARGV) {
  64   die("Bad argument \"$arg\"") if ($arg !~ /=/);
  65   my ($type,$val) = split("=", $arg);
  66   if ($type eq "filesToScan") {
  67     #The file should be a list of files to search in
  68     if (open(FLIST, $val)) {
  69       while (my $l = <FLIST>) {
  70         chomp($l);
  71         parse_file($l);
  72       }
  73       close(FLIST);
  74     }
  75   }
  76   elsif ($type eq "ignoredURLS") {
  77     readUrls($val, %ignoredURLS);
  78   }
  79   elsif ($type eq "revertedURLS") {
  80     readUrls($val, %revertedURLS);
  81   }
  82   elsif ($type eq "extraURLS") {
  83     readUrls($val,  %extraURLS);
  84   }
  85   elsif ($type eq "selectedURLS") {
  86     $checkSelectedOnly = 1;
  87     readUrls($val,  %selectedURLS);
  88   }
  89   else {
  90     die("Invalid argument \"$arg\"");
  91   }
  92 }
  93
  94 my @urls = sort keys %URLS, keys %extraURLS;
  95 my $errorcount = 0;
  96
  97 my $URLScount = 0;
  98
  99 for my $u (@urls) {
 100   if (defined($selectedURLS{$u})) {
 101     ${selectedURLS}{$u}->{count} += 1;
 102   }
 103   if (defined($ignoredURLS{$u})) {
 104     $ignoredURLS{$u}->{count} += 1;
 105     next;
 106   }
 107   next if ($checkSelectedOnly && ! defined($selectedURLS{$u}));
 108   $URLScount++;
 109   print "Checking '$u': ";
 110   my ($res, $prnt);
 111   try {
 112     $res = check_url($u);
 113     if ($res) {
 114      $prnt = "Failed";
 115     }
 116     else {
 117       $prnt = "OK";
 118     }
 119   }
 120   catch {
 121     $prnt = "Failed, caught error: $_";
 122     $res = 700;
 123   };
 124   print "$prnt\n";
 125   my $printSourceFiles = 0;
 126   my $err_txt = "Error url:";
 127
 128   if ($res || $checkSelectedOnly) {
 129     $printSourceFiles = 1;
 130   }
 131   if ($res && defined($revertedURLS{$u})) {
 132     $err_txt = "Failed url:";
 133   }
 134   $res = ! $res if (defined($revertedURLS{$u}));
 135   if ($res || $checkSelectedOnly) {
 136     print "$err_txt \"$u\"\n";
 137   }
 138   if ($printSourceFiles) {
 139     if (defined($URLS{$u})) {
 140       for my $f(sort keys %{$URLS{$u}}) {
 141         my $lines = ":" . join(',', @{$URLS{$u}->{$f}});
 142         print "  $f$lines\n";
 143       }
 144     }
 145     if ($res ) {
 146       $errorcount++;
 147     }
 148   }
 149 }
 150
 151 if (%URLS) {
 152   printNotUsedURLS("Ignored", %ignoredURLS);
 153   printNotUsedURLS("Selected", %selectedURLS);
 154   printNotUsedURLS("KnownInvalid", %extraURLS);
 155 }
 156
 157 print "\n$errorcount URL-tests failed out of $URLScount\n\n";
 158 exit($errorcount);
 159
 160 ###############################################################################
 161
 162 sub printNotUsedURLS($\%)
 163 {
 164   my ($txt, $rURLS) = @_;
 165   my @msg = ();
 166   for my $u ( sort keys %{$rURLS}) {
 167     if ($rURLS->{$u}->{count} < 2) {
 168       my @submsg = ();
 169       for my $f (sort keys %{$rURLS->{$u}}) {
 170         next if ($f eq "count");
 171         push(@submsg, "$f:" . $rURLS->{$u}->{$f});
 172       }
 173       push(@msg, "\n  $u\n    " . join("\n    ", @submsg) . "\n");
 174     }
 175   }
 176   if (@msg) {
 177     print "\n$txt URLs not found in sources: " . join(' ',@msg) . "\n";
 178   }
 179 }
 180
 181 sub readUrls($\%)
 182 {
 183   my ($file, $rUrls) = @_;
 184
 185   die("Could not read file $file") if (! open(ULIST, $file));
 186   my $line = 0;
 187   while (my $l = <ULIST>) {
 188     $line++;
 189     $l =~ s/[\r\n]+$//;         # remove eol
 190     $l =~ s/\s*\#.*$//;         # remove comment
 191     next if ($l eq "");
 192     if (! defined($rUrls->{$l} )) {
 193       $rUrls->{$l} = {$file => $line, count => 1};
 194     }
 195   }
 196   close(ULIST);
 197 }
 198
 199 sub parse_file($)
 200 {
 201   my($f) = @_;
 202   my $status = "out";           # outside of URL/href
 203
 204   return if ($f =~ /\/attic\//);
 205   if(open(FI, $f)) {
 206     my $line = 0;
 207     while(my $l = <FI>) {
 208       $line++;
 209       $l =~ s/[\r\n]+$//;       #  Simulate chomp
 210       if ($status eq "out") {
 211         # searching for "\begin_inset Flex URL"
 212         if($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) {
 213           $status = "inUrlInset";
 214         }
 215         elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) {
 216           $status = "inHrefInset";
 217         }
 218         else {
 219           # Outside of url, check also
 220           if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) {
 221             my $url = $1;
 222             handle_url($url, $f, "x$line");
 223           }
 224         }
 225       }
 226       else {
 227         if($l =~ /^\s*\\end_(layout|inset)\s*$/) {
 228           $status = "out";
 229         }
 230         elsif ($status eq "inUrlInset") {
 231           if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) {
 232             my $url = $1;
 233             $status = "out";
 234             handle_url($url, $f, "u$line");
 235           }
 236         }
 237         elsif ($status eq "inHrefInset") {
 238           if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) {
 239             my $url = $1;
 240             $status = "out";
 241             handle_url($url, $f, "h$line");
 242           }
 243         }
 244       }
 245     }
 246     close(FI);
 247   }
 248 }
 249
 250 sub handle_url($$$)
 251 {
 252   my($url, $f, $line) = @_;
 253
 254   if(!defined($URLS{$url})) {
 255     $URLS{$url} = {};
 256   }
 257   if(!defined($URLS{$url}->{$f})) {
 258     $URLS{$url}->{$f} = [];
 259   }
 260   push(@{$URLS{$url}->{$f}}, $line);
 261 }