#! /usr/bin/env perl use Getopt::Long; $Getopt::Long::autoabbrev = 1; $Getopt::Long::ignorecase = 0; $base = ""; $infile = ""; $outfile = ""; $verb = "F"; $img = ""; $limit = "F"; @limits = (); $exclude = "T"; @excludes = (); ProcessArguments(@ARGV); %hrefs = (); %imgs = () if ($img ne ""); $base .= "/" if ($base !~ m%/$/%); open LFILE, "<$infile" or die "Error opening $infile in Input mode : $!\n"; @flist = ; chomp @flist; close LFILE; open LFILE, ">$outfile" or die "Error opening $outfile in Write mode : $!\n"; if ($img ne "") { open IFILE, ">$img" or die "Error opening $img in Write mode : $!\n"; } print STDERR "Processing files " if ($verb eq "T"); foreach $fname (@flist) { warn "Warning : file $fname inexistant\n", next if (! -e $fname); open INFILE , "<$fname" or die "Error : Could not open \"$fname\" in read mode : $!\n"; $fname =~ s%^\./%%; $dir = $1 if ($fname =~ m%^(.*)/[^/]*$%); print STDERR "$fname : " if ($verb eq "X"); while ($line = ) { # Hrefs while ($line =~ s/href\s*\=\s*\"([^\"]+)\"//i) { $addok = "T"; $href = &makeurl($1); $href = &urlit($href); # First let's check we can add it ... &checklimit($href); # Then let's remove some entries &checkexclude($href); push @{$hrefs{$href}}, $fname if (($href ne "") && ($addok eq "T")); print STDERR "." if ($verb eq "X"); } # Imgs ... } close INFILE; print STDERR "." if ($verb eq "T"); print STDERR " : done\n" if ($verb eq "X"); } print STDERR " done\n" if ($verb eq "T"); print STDERR "\nWritting result : " if ($verb eq "T"); foreach $val (sort keys %hrefs) { print LFILE "$val\t"; foreach $val2 (sort @{$hrefs{$val}}) { print LFILE "$val2 "; } print LFILE "\n"; print STDERR "." if ($verb eq "T"); } print STDERR " : done\n" if ($verb eq "T"); #### Process the arguments sub ProcessArguments { # sub my (@options, $val); local ($opt_base); @options = ( "base=s", # base of the url to be added to every file (Req Option) "verb", # verbose mode "Verb", # Very verbose mode "img=s", # also check images ? (req file for storing results) "limit=s", # strings to be matched to add to result ("," sep list) "exclude=s" # strings to be matched to remove from result ("," sep list) ); die "\n$usage\n" unless GetOptions @options; die "infile and outfile needed\n" if (scalar @ARGV != 2); die "Error : base option required\n" if (! defined $opt_base); $base = $opt_base if (defined $opt_base); die "Error : verb and Verb can not be choosen at the same time\n" if ((defined $opt_verb) && (defined $opt_Verb)); $verb = "X" if (defined $opt_Verb); $verb = "T" if (defined $opt_verb); $img = $opt_img if (defined $opt_img); if (defined $opt_limit) { $limit = "T"; @limits = split /,/, $opt_limit; } if (defined $opt_exclude) { $exclude = "T"; @excludes = split /,/, $opt_exclude; } $infile = shift @ARGV; $outfile = shift @ARGV; } sub makeurl { my $tmp = shift @_; $tmp =~ s%http[s]?\:%%i if ($tmp =~ m%^http[s]?\:[^/][^/]%i); if ($tmp =~ /^\#/) { $href = "$base$fname$tmp"; } elsif ($tmp !~ /^(http|ftp|mailto)/i) { $href = "$base$dir/$tmp"; } elsif ($tmp =~ /^(http|ftp)/i) { $href = "$tmp"; } elsif ($tmp =~ /^mailto/i) { $href = ""; } else { $href = ""; warn "Warning : strange url ($tmp) in current processed file ($fname)\n"; } return $href; } sub urlit { my $tmp = shift @_; while ($tmp =~ s%/\./%/%) {}; while ($tmp =~ s%/[^/]+/\.\./%/%) {}; return $tmp; } sub checklimit { my $url = shift @_; my $val; return if ($limit eq "F"); $addok = "F"; foreach $val (@limits) { $addok = "T", return if (grep /$val/, $url); } } sub checkexclude { my $url = shift @_; my $val; return if ($exclude eq "F"); foreach $val (@excludes) { $addok = "F", return if (grep /$val/, $url); } }