# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momavoid.pl,v 1.7 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momavoid: A package for keeping track of where a World-Wide Web spider
#           has visited and what parts of the Web are off-limits or
#           restricted to being terminal nodes (leafs).
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at .
# ---------------------------------------------------------------------------

require "www.pl";
require "wwwurl.pl";
require "wwwerror.pl";
require "wwwdates.pl";
require "momconfig.pl";

package momavoid;

# ==========================================================================
# Grab any needed defaults from momconfig.pl

$Version = $momconfig'Version;

# The following is the standard name for the URL which defines for
# any site where Robots are not allowed.  See Martijn Koster's proposal
# at  for more info.

$RobotsURL = ($momconfig'RobotsURL || "/robots.txt");

# Number of seconds to wait for a response from a site.

$Timeout = ($momconfig'Timeout || 30);
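
# --------------------------------------------------------------------------
# For illustration only (these values are assumptions, not the distributed
# defaults): the settings above and below are taken from momconfig.pl, so a
# local configuration might override them like this:
#
#     $momconfig'RobotsURL     = "/robots.txt"; # URL path checked per site
#     $momconfig'Timeout       = 60;            # seconds to wait for replies
#     $momconfig'CheckInterval = 30;            # days between RobotsURL checks
#                                               # (values below 2 fall back to
#                                               #  the fifteen-day default)
# --------------------------------------------------------------------------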

# Specify the default interval between checks of a site's RobotsURL file.

if (!defined($momconfig'CheckInterval) || ($momconfig'CheckInterval < 2))
{
    $SitesCheck = 15 * 86400;       # Fifteen days
}
else
{
    $SitesCheck = $momconfig'CheckInterval * 86400;
}

# ==========================================================================
# Initialize the Avoids and Sites tables

$AvoidNum  = 0;
@AvoidType = ();
@AvoidURL  = ();
@AvoidExp  = ();
@AvoidSave = ();

$SitesNum  = 0;
@SitesAddr = ();
@SitesExp  = ();
@SitesSave = ();
%Sites     = ();

# Setup global information about the Avoid and Sites files

$avoidTag = 'Avoid';
$leafTag  = 'Leaf';
$siteTag  = 'Site';

$BackedUpAvoid = 0;             # Used to prevent the overwriting of backups
$BackedUpSites = 0;

$IsaveAvoid = '';               # Filename for interim saves
$IsaveSites = '';

$TopAvoid = <<"EOF";
# $Version Avoid File: Lists URL prefixes to avoid or leaf.
# New URL prefixes can be added when the program is not running.
# The file format is:  EntryType URLprefix [ExpireDate]
# where  EntryType  = "$avoidTag" or "$leafTag"
#        URLprefix  = the full URL prefix for which this entry applies
#        ExpireDate = [*] for never expire
#                     or [date] (see wwwdates.pl for valid date formats)
# This file is automatically generated, so don't bother changing the format.
EOF

$TopSites = <<"EOF";
# $Version Sites File: Lists IPaddress:port locations we've checked
# for a $RobotsURL file and followed its directions.
# New sites can be added when the program is not running.
# The file format is:  EntryType IPaddress:Port [ExpireDate]
# where  EntryType  = "$siteTag"
#        IPaddress  = the full hostname or IP address for the site
#        Port       = the numeric TCP port for the site (write 80 for default)
#        ExpireDate = [*] for never expire (i.e. never check this site)
#                     or [date] (see wwwdates.pl for valid date formats)
# This file is automatically generated, so don't bother changing the format.
EOF

# ==========================================================================
# ==========================================================================
# setCheckInterval($days): Set the default number of days between site
#       checks and thus the default expiration time for new avoid table
#       entries.
#
sub setCheckInterval
{
    $SitesCheck = $_[0] * 86400;
}

# ==========================================================================
# load(): Load the named avoidfile and sitesfile into their respective arrays
#         and tag them as writable according to rwtag.  The rwtag is used so
#         that information loaded from systemwide files is not written to
#         the user's files during momavoid'save.
#         Assumes ($rwtag eq 'R' || $rwtag eq 'W').
#
sub load
{
    local($avoidfile, $sitesfile, $rwtag) = @_;
    local($type, $url, $date, $site);

    if ($rwtag eq 'W')
    {
        $IsaveAvoid = $avoidfile;
        $IsaveSites = $sitesfile;
    }

    if ($avoidfile)
    {
        if (!(-e $avoidfile))
        {
            print "No existing AvoidFile -- starting from scratch\n";
            $BackedUpAvoid = 1;
        }
        elsif (!open(AVOIDF, $avoidfile))
        {
            print STDERR "Failed to open avoid file $avoidfile for load: $!\n";
        }
        else
        {
            while (<AVOIDF>)
            {
                next if ( /^$/ || /^\#/ );  # Ignore blank and comment lines

                if ( /^($avoidTag|$leafTag)\s+(\S+)\s+\[(.+)\]/ )
                {
                    $type = $1;
                    $url  = $2;
                    $date = $3;
                }
                else
                {
                    print STDERR "Invalid line $. of avoid file $avoidfile\n";
                    next;
                }
                if ($date ne '*')       # Unless expiration date is [*]
                {                       # see if this entry is too old
                    next if (time > &wwwdates'get_gmtime($date));
                }
                $url = &wwwurl'absolute('',$url);

                if ($type eq $leafTag) { &leaf($url, $date, $rwtag); }
                else                   { &avoid($url, $date, $rwtag); }
            }
            close AVOIDF;
        }
    }

    if ($sitesfile)
    {
        if (!(-e $sitesfile))
        {
            print "No existing SitesFile -- starting from scratch\n";
            $BackedUpSites = 1;
        }
        elsif (!open(SITESF, $sitesfile))
        {
            print STDERR "Failed to open sites file $sitesfile for load: $!\n";
        }
        else
        {
            while (<SITESF>)
            {
                next if ( /^$/ || /^\#/ );  # Ignore blank and comment lines

                if ( /^$siteTag\s+(\S+)\s+\[(.+)\]/ )
                {
                    $site = &wwwurl'get_site("http://$1/");
                    $date = $2;
                }
                else
                {
                    print STDERR "Invalid line $. of sites file $sitesfile\n";
                    next;
                }
                if ($date ne '*')       # Unless expiration date is [*]
                {                       # see if this entry is too old
                    next if (time > &wwwdates'get_gmtime($date));
                }
                &addsite($site, $date, $rwtag);
            }
            close SITESF;
        }
    }
}

# ==========================================================================
# save(): Save the Avoid and Sites tables into the passed-in avoid and
#         sites files.  Only writes those entries marked as Save
#         (those originated by the user).  This has been set up for multiple
#         calls at any time during processing.
#
sub save
{
    local($avoidfile, $sitesfile) = @_;
    local($idx);

    if ($avoidfile)
    {
        if (!$BackedUpAvoid)
        {
            if (!rename($avoidfile, "$avoidfile".'.bak'))
            {
                print STDERR "Failed to backup avoid file $avoidfile: $!\n";
            }
            $BackedUpAvoid = 1;
        }
        if (!open(AVOIDF, "> $avoidfile"))
        {
            print STDERR "Failed to open avoid file $avoidfile for save: $!\n";
            print STDERR "Here is what should have been written:\n";
            print STDERR $TopAvoid;
            for ($idx = 1; $idx <= $AvoidNum; $idx++)
            {
                if ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'W'))
                {
                    printf(STDERR "%-5s %s [%s]\n", $AvoidType[$idx],
                           $AvoidURL[$idx], $AvoidExp[$idx]);
                }
            }
        }
        else
        {
            print AVOIDF $TopAvoid;
            for ($idx = 1; $idx <= $AvoidNum; $idx++)
            {
                if ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'W'))
                {
                    printf(AVOIDF "%-5s %s [%s]\n", $AvoidType[$idx],
                           $AvoidURL[$idx], $AvoidExp[$idx]);
                }
            }
            close(AVOIDF);
        }
    }

    if ($sitesfile)
    {
        if (!$BackedUpSites)
        {
            if (!rename($sitesfile, "$sitesfile".'.bak'))
            {
                print STDERR "Failed to backup sites file $sitesfile: $!\n";
            }
            $BackedUpSites = 1;
        }
        if (!open(SITESF, "> $sitesfile"))
        {
            print STDERR "Failed to open sites file $sitesfile for save: $!\n";
            print STDERR "Here is what should have been written:\n";
            print STDERR $TopSites;
            for ($idx = 1; $idx <= $SitesNum; $idx++)
            {
                next unless ($SitesSave[$idx]);
                printf(STDERR "$siteTag %s [%s]\n",
                       $SitesAddr[$idx], $SitesExp[$idx]);
            }
        }
        else
        {
            print SITESF $TopSites;
            for ($idx = 1; $idx <= $SitesNum; $idx++)
            {
                next unless ($SitesSave[$idx]);
                printf(SITESF "$siteTag %s [%s]\n",
                       $SitesAddr[$idx], $SitesExp[$idx]);
            }
            close(SITESF);
        }
    }
}
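
# --------------------------------------------------------------------------
# For illustration only (made-up entries, not part of the original file):
# an avoid file read by load() and written by save() holds lines of the form
#
#     Avoid http://www.example.edu/cgi-bin/  [*]
#     Leaf  http://www.example.edu/archive/  [Thu, 01 Sep 1994 00:00:00 GMT]
#
# and a sites file holds lines of the form
#
#     Site  www.example.edu:80               [Thu, 01 Sep 1994 00:00:00 GMT]
#
# where [*] means the entry never expires and a bracketed date may be in any
# format understood by wwwdates.pl (an HTTP-style GMT date is shown here).
# --------------------------------------------------------------------------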

# ==========================================================================
# snapshot(): Print a snapshot of the current contents of the
#             Avoid and Sites Tables.  Used only for debugging.
#
sub snapshot
{
    local($rwtag, $idx);

    print "Snapshot of Avoid Table ($AvoidNum entries):\n";
    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        if ($AvoidURL[$idx])
        {
            printf("%4d %s %-5s %s [%s]\n", $idx, $AvoidSave[$idx],
                   $AvoidType[$idx], $AvoidURL[$idx], $AvoidExp[$idx]);
        }
    }
    print "Snapshot of Sites Table ($SitesNum entries):\n";
    for ($idx = 1; $idx <= $SitesNum; $idx++)
    {
        $rwtag = (($SitesSave[$idx] && 'W') || 'R');
        printf("%4d %s %s [%s]\n", $idx, $rwtag,
               $SitesAddr[$idx], $SitesExp[$idx]);
    }
}

# ==========================================================================
# avoid(): Add the given url to the avoid table with the expiration date
#          given ('' for default, '*' for never) and a read/write tag
#          indicating whether or not to save the entry at process end.
#
sub avoid
{
    local($url, $expdate, $rwtag) = @_;
    local($date, $pos, $old);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url eq $old)                   # URL duplicates an old one?
        {
            if (($AvoidType[$idx] ne $avoidTag) ||
                ($AvoidSave[$idx] ne 'W')       ||
                ($AvoidExp[$idx] ne '*'))
            {
                $pos = $idx;
                last;
            }
            else { return; }
        }
        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return if (($AvoidType[$idx] eq $avoidTag) &&
                       (($rwtag ne 'W') || ($AvoidSave[$idx] eq 'W')));
            next;
        }
        if ($old =~ m#^$url#)               # URL more general than an old one?
        {
            if (($AvoidType[$idx] ne $avoidTag) ||
                ($AvoidSave[$idx] ne 'W')       ||
                ($AvoidExp[$idx] ne '*')        ||
                (($date eq '*') && ($rwtag eq 'W')))
            {
                $pos = $idx;
                last;
            }
            else { next; }
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $avoidTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = $date;
    $AvoidSave[$pos] = $rwtag;
}
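
# --------------------------------------------------------------------------
# For illustration only (hypothetical URLs, not part of the original file):
# after the calling program makes the calls
#
#     &momavoid'avoid('http://www.example.edu/private/', '*', 'W');
#     &momavoid'leaf('http://www.example.edu/archive/',  '',  'R');
#
# checkurl() below would return 2 for any URL beginning with the first
# prefix (no access allowed) and 1 for URLs beginning with the second
# prefix that match no avoid entry (test the URL but don't traverse it).
# --------------------------------------------------------------------------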

# ==========================================================================
# leaf(): Add the given url to the leaf table with the expiration date
#         given ('' for default, '*' for never) and a read/write tag
#         indicating whether or not to save the entry at process end.
#
sub leaf
{
    local($url, $expdate, $rwtag) = @_;
    local($date, $pos, $old);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url eq $old)                   # URL duplicates an old one?
        {
            return if ($rwtag ne 'W');
            return if (($AvoidSave[$idx] eq 'W') && ($AvoidExp[$idx] eq '*'));
            next   if ($AvoidType[$idx] eq $avoidTag);
            $pos = $idx;
            last;
        }
        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return if ($rwtag ne 'W');
            return if (($AvoidSave[$idx] eq 'W') && ($AvoidExp[$idx] eq '*'));
            return if (($AvoidType[$idx] eq $avoidTag) &&
                       (($AvoidSave[$idx] eq 'W') || ($AvoidExp[$idx] eq '*')));
            next;
        }
        if ($old =~ m#^$url#)               # URL more general than an old one?
        {
            next if ($AvoidType[$idx] eq $avoidTag);
            next if (($AvoidSave[$idx] eq 'W') &&
                     (($rwtag ne 'W') || ($AvoidExp[$idx] eq '*')));
            $pos = $idx;
            last;
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $leafTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = $date;
    $AvoidSave[$pos] = $rwtag;
}

# ==========================================================================
# exclude(): Add the given url to the leaf table for the duration of the
#            current traversal (useful for delineating the bounds of a
#            Site traversal process).
#
sub exclude
{
    local($url) = @_;
    local($pos, $old);

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return;
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $leafTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = '';
    $AvoidSave[$pos] = 'E';
}

# ==========================================================================
# clear_excludes(): Remove all "excluded" URLs from the leaf table, presumably
#                   because we just finished an infostructure traversal.
#
sub clear_excludes
{
    local($pos, $old);

    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        next unless ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'E'));
        undef $AvoidType[$idx];
        undef $AvoidURL[$idx];
        undef $AvoidExp[$idx];
        undef $AvoidSave[$idx];
    }
}

# ==========================================================================
# addsite(): Add the given site to the site table with the expiration date
#            given ('' for default, '*' for never) and a read/write tag
#            indicating whether or not to save the entry at process end.
#
sub addsite
{
    local($site, $expdate, $rwtag) = @_;
    local($date, $idx);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    $idx = $Sites{$site};
    if (!$idx)
    {
        $idx = ++$SitesNum;
        $Sites{$site} = $idx;
    }
    $SitesAddr[$idx] = $site;
    $SitesExp[$idx]  = $date;
    $SitesSave[$idx] = ($rwtag eq 'W');
}

# ==========================================================================
# checkurl(): Check the url to see whether there are any restrictions
#             on its access and return
#                 0 -> no restrictions
#                 1 -> Leaf  (okay to test, but don't traverse)
#                 2 -> Avoid (no access allowed)
#
sub checkurl
{
    local($url) = @_;
    local($site, $idx, $found, $prefix);

    if (($url =~ /^http:/) && ($site = &wwwurl'get_site($url)))
    {
        &checksite($site);      # Has the side effect of updating avoid table
                                # if the site has not already been checked.
    }
    $found = 0;
    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        next unless ($AvoidURL[$idx]);
        $prefix = $AvoidURL[$idx];
        if ( $url =~ m#^$prefix# )
        {
            if ($AvoidType[$idx] eq $avoidTag) { return 2; }
            $found = 1;
        }
    }
    return $found;
}
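
# --------------------------------------------------------------------------
# For illustration only (robot name and paths are made-up examples, not part
# of the original file): checksite() below retrieves the site's $RobotsURL
# file ("/robots.txt" by default) and obeys records of the following form,
# where a record naming this robot's own User-Agent overrides the default
# "*" record:
#
#     # robots.txt for www.example.edu
#     User-Agent: *
#     Disallow: /tmp/
#
#     User-Agent: momspider
#     Disallow: /cgi-bin/
#     Disallow: /private/
#
# Each applicable Disallow path is turned into an Avoid entry via avoid().
# --------------------------------------------------------------------------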

# ==========================================================================
# checksite(): Check the site table to see if this site has already been
#              checked for restrictions.  If it hasn't, perform the check on
#              that site using the RobotsNotWanted protocol and update the
#              avoid and sites tables.  See the format description at .
#
sub checksite
{
    local($site) = @_;
    local($url, $headers, %headers, $content, $response, $my_name,
          $agent, $drs, $in_def, $in_mine, $def_dr, $my_dr);

    return if defined($Sites{$site});   # Return if RobotsURL has been checked

    $my_name = &www'get_def_header('http','User-Agent');
    if (!$my_name)
    {
        die "No User-Agent has been specified, stopped";
    }
    $url      = "http://$site$RobotsURL";
    $headers  = '';     # The response headers will be returned here
    %headers  = ();     # The parsed response headers will be returned here
    $content  = '';     # The response content will be returned here

    print "Checking for $url ... ";
    $response = &www'request('GET', $url, *headers, *content, $Timeout);
    print $response,"\n";

    &addsite($site, $headers{'expires'}, 'W');  # This site has been checked

    return unless ($response == $wwwerror'RC_ok);

    $my_name =~ s#/.*##;    # Remove any version or library information

    $in_def  = 0;           # Keep track of default '*' record boundaries
    $in_mine = 0;           # Keep track of my own record boundaries
    $def_dr  = '';          # Store default disallow names
    $my_dr   = '';          # Store my own disallow names

    foreach (split(/\n/, $content))
    {
        next if (/^\s*#/);          # Ignore lines containing only a comment
        s/\s*#.*//;                 # Remove any other comments

        if (/^\s*$/)                # Records are separated by blank lines
        {
            last if ($in_mine);
            $in_def = 0;
        }
        elsif (/^User-Agent:\s*(.*)$/i)     # List of robot names
        {
            next if ($in_mine);
            $agent = $1;
            if ($agent =~ /\b$my_name\b/i)
            {
                $in_mine = 1;
                next;
            }
            if ($agent =~ /^\*/) { $in_def = 1; }
        }
        elsif (/^Disallow:(.*)$/i)          # List of URLs to avoid
        {
            next unless ($in_def || $in_mine);
            $drs = $1;
            if ($in_mine)
            {
                $my_dr .= ' ' . $drs;
                next;
            }
            if ($in_def) { $def_dr .= ' ' . $drs; }
        }
    }
    if ($in_mine) { $def_dr = $my_dr; }     # My own record takes precedence

    if ($def_dr !~ /^\s*$/)
    {
        foreach $drs (split(' ', $def_dr))
        {
            &avoid(&wwwurl'absolute($url,$drs), $headers{'expires'}, 'W');
        }
        &save($IsaveAvoid, $IsaveSites);    # Save to file
    }
}

# ==========================================================================

1;
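
# ==========================================================================
# Usage sketch, for illustration only (the variables holding the filenames
# are assumptions, not part of this package).  A robot program that has
# already set its default HTTP User-Agent header through the www.pl package
# (see &www'get_def_header in checksite above) might use momavoid roughly
# as follows:
#
#     require "momavoid.pl";
#
#     &momavoid'load($system_avoid, $system_sites, 'R');  # read-only entries
#     &momavoid'load($user_avoid,   $user_sites,   'W');  # user's own entries
#
#     $restrict = &momavoid'checkurl($url);
#     if    ($restrict == 2) { }  # Avoid: do not access the URL at all
#     elsif ($restrict == 1) { }  # Leaf:  test it, but don't traverse links
#     else                   { }  # no restrictions
#
#     &momavoid'save($user_avoid, $user_sites);   # write back the 'W' entries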