# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momavoid.pl,v 1.7 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momavoid: A package for keeping track of where a World-Wide Web spider
#           has visited and what parts of the Web are off-limits or
#           restricted to being terminal nodes (leafs).
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at .
# ---------------------------------------------------------------------------

require "www.pl";
require "wwwurl.pl";
require "wwwerror.pl";
require "wwwdates.pl";
require "momconfig.pl";

package momavoid;

# ==========================================================================
# Grab any needed defaults from momconfig.pl

$Version = $momconfig'Version;

# The following is the standard name for the URL which defines for
# any site where Robots are not allowed.  See Martijn Koster's proposal
# at  for more info.

$RobotsURL = ($momconfig'RobotsURL || "/robots.txt");

# Number of seconds to wait for a response from a site.

$Timeout = ($momconfig'Timeout || 30);
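
# --------------------------------------------------------------------------
# For illustration only (these values are assumptions, not the distributed
# defaults): the settings above and below are taken from momconfig.pl, so a
# local configuration might override them like this:
#
#     $momconfig'RobotsURL     = "/robots.txt"; # URL path checked per site
#     $momconfig'Timeout       = 60;            # seconds to wait for replies
#     $momconfig'CheckInterval = 30;            # days between RobotsURL checks
#                                               # (values below 2 fall back to
#                                               #  the fifteen-day default)
# --------------------------------------------------------------------------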

# Specify the default interval between checks of a site's RobotsURL file.

if (!defined($momconfig'CheckInterval) || ($momconfig'CheckInterval < 2))
{
    $SitesCheck = 15 * 86400;       # Fifteen days
}
else
{
    $SitesCheck = $momconfig'CheckInterval * 86400;
}

# ==========================================================================
# Initialize the Avoids and Sites tables

$AvoidNum  = 0;
@AvoidType = ();
@AvoidURL  = ();
@AvoidExp  = ();
@AvoidSave = ();

$SitesNum  = 0;
@SitesAddr = ();
@SitesExp  = ();
@SitesSave = ();
%Sites     = ();

# Setup global information about the Avoid and Sites files

$avoidTag = 'Avoid';
$leafTag  = 'Leaf';
$siteTag  = 'Site';

$BackedUpAvoid = 0;             # Used to prevent the overwriting of backups
$BackedUpSites = 0;

$IsaveAvoid = '';               # Filename for interim saves
$IsaveSites = '';

$TopAvoid = <<"EOF";
# $Version Avoid File: Lists URL prefixes to avoid or leaf.
# New URL prefixes can be added when the program is not running.
# The file format is:  EntryType URLprefix [ExpireDate]
# where  EntryType  = "$avoidTag" or "$leafTag"
#        URLprefix  = the full URL prefix for which this entry applies
#        ExpireDate = [*] for never expire
#                     or [date] (see wwwdates.pl for valid date formats)
# This file is automatically generated, so don't bother changing the format.
EOF

$TopSites = <<"EOF";
# $Version Sites File: Lists IPaddress:port locations we've checked
# for a $RobotsURL file and followed its directions.
# New sites can be added when the program is not running.
# The file format is:  EntryType IPaddress:Port [ExpireDate]
# where  EntryType  = "$siteTag"
#        IPaddress  = the full hostname or IP address for the site
#        Port       = the numeric TCP port for the site (write 80 for default)
#        ExpireDate = [*] for never expire (i.e. never check this site)
#                     or [date] (see wwwdates.pl for valid date formats)
# This file is automatically generated, so don't bother changing the format.
EOF

# ==========================================================================
# ==========================================================================
# setCheckInterval($days): Set the default number of days between site
#       checks and thus the default expiration time for new avoid table
#       entries.
#
sub setCheckInterval
{
    $SitesCheck = $_[0] * 86400;
}

# ==========================================================================
# load(): Load the named avoidfile and sitesfile into their respective arrays
#         and tag them as writable according to rwtag.  The rwtag is used so
#         that information loaded from systemwide files is not written to
#         the user's files during momavoid'save.
#         Assumes ($rwtag eq 'R' || $rwtag eq 'W').
#
sub load
{
    local($avoidfile, $sitesfile, $rwtag) = @_;
    local($type, $url, $date, $site);

    if ($rwtag eq 'W')
    {
        $IsaveAvoid = $avoidfile;
        $IsaveSites = $sitesfile;
    }

    if ($avoidfile)
    {
        if (!(-e $avoidfile))
        {
            print "No existing AvoidFile -- starting from scratch\n";
            $BackedUpAvoid = 1;
        }
        elsif (!open(AVOIDF, $avoidfile))
        {
            print STDERR "Failed to open avoid file $avoidfile for load: $!\n";
        }
        else
        {
            while (<AVOIDF>)
            {
                next if ( /^$/ || /^\#/ );  # Ignore blank and comment lines

                if ( /^($avoidTag|$leafTag)\s+(\S+)\s+\[(.+)\]/ )
                {
                    $type = $1;
                    $url  = $2;
                    $date = $3;
                }
                else
                {
                    print STDERR "Invalid line $. of avoid file $avoidfile\n";
                    next;
                }
                if ($date ne '*')       # Unless expiration date is [*]
                {                       # see if this entry is too old
                    next if (time > &wwwdates'get_gmtime($date));
                }
                $url = &wwwurl'absolute('',$url);

                if ($type eq $leafTag) { &leaf($url, $date, $rwtag); }
                else                   { &avoid($url, $date, $rwtag); }
            }
            close AVOIDF;
        }
    }

    if ($sitesfile)
    {
        if (!(-e $sitesfile))
        {
            print "No existing SitesFile -- starting from scratch\n";
            $BackedUpSites = 1;
        }
        elsif (!open(SITESF, $sitesfile))
        {
            print STDERR "Failed to open sites file $sitesfile for load: $!\n";
        }
        else
        {
            while (<SITESF>)
            {
                next if ( /^$/ || /^\#/ );  # Ignore blank and comment lines

                if ( /^$siteTag\s+(\S+)\s+\[(.+)\]/ )
                {
                    $site = &wwwurl'get_site("http://$1/");
                    $date = $2;
                }
                else
                {
                    print STDERR "Invalid line $. of sites file $sitesfile\n";
                    next;
                }
                if ($date ne '*')       # Unless expiration date is [*]
                {                       # see if this entry is too old
                    next if (time > &wwwdates'get_gmtime($date));
                }
                &addsite($site, $date, $rwtag);
            }
            close SITESF;
        }
    }
}

# ==========================================================================
# save(): Save the Avoid and Sites tables into the passed-in avoid and
#         sites files.  Only writes those entries marked as Save
#         (those originated by the user).  This has been set up for multiple
#         calls at any time during processing.
#
sub save
{
    local($avoidfile, $sitesfile) = @_;
    local($idx);

    if ($avoidfile)
    {
        if (!$BackedUpAvoid)
        {
            if (!rename($avoidfile, "$avoidfile".'.bak'))
            {
                print STDERR "Failed to backup avoid file $avoidfile: $!\n";
            }
            $BackedUpAvoid = 1;
        }
        if (!open(AVOIDF, "> $avoidfile"))
        {
            print STDERR "Failed to open avoid file $avoidfile for save: $!\n";
            print STDERR "Here is what should have been written:\n";
            print STDERR $TopAvoid;
            for ($idx = 1; $idx <= $AvoidNum; $idx++)
            {
                if ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'W'))
                {
                    printf(STDERR "%-5s %s [%s]\n", $AvoidType[$idx],
                           $AvoidURL[$idx], $AvoidExp[$idx]);
                }
            }
        }
        else
        {
            print AVOIDF $TopAvoid;
            for ($idx = 1; $idx <= $AvoidNum; $idx++)
            {
                if ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'W'))
                {
                    printf(AVOIDF "%-5s %s [%s]\n", $AvoidType[$idx],
                           $AvoidURL[$idx], $AvoidExp[$idx]);
                }
            }
            close(AVOIDF);
        }
    }

    if ($sitesfile)
    {
        if (!$BackedUpSites)
        {
            if (!rename($sitesfile, "$sitesfile".'.bak'))
            {
                print STDERR "Failed to backup sites file $sitesfile: $!\n";
            }
            $BackedUpSites = 1;
        }
        if (!open(SITESF, "> $sitesfile"))
        {
            print STDERR "Failed to open sites file $sitesfile for save: $!\n";
            print STDERR "Here is what should have been written:\n";
            print STDERR $TopSites;
            for ($idx = 1; $idx <= $SitesNum; $idx++)
            {
                next unless ($SitesSave[$idx]);
                printf(STDERR "$siteTag %s [%s]\n",
                       $SitesAddr[$idx], $SitesExp[$idx]);
            }
        }
        else
        {
            print SITESF $TopSites;
            for ($idx = 1; $idx <= $SitesNum; $idx++)
            {
                next unless ($SitesSave[$idx]);
                printf(SITESF "$siteTag %s [%s]\n",
                       $SitesAddr[$idx], $SitesExp[$idx]);
            }
            close(SITESF);
        }
    }
}
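
# --------------------------------------------------------------------------
# For illustration only (made-up entries, not part of the original file):
# an avoid file read by load() and written by save() holds lines of the form
#
#     Avoid http://www.example.edu/cgi-bin/  [*]
#     Leaf  http://www.example.edu/archive/  [Thu, 01 Sep 1994 00:00:00 GMT]
#
# and a sites file holds lines of the form
#
#     Site  www.example.edu:80               [Thu, 01 Sep 1994 00:00:00 GMT]
#
# where [*] means the entry never expires and a bracketed date may be in any
# format understood by wwwdates.pl (an HTTP-style GMT date is shown here).
# --------------------------------------------------------------------------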

# ==========================================================================
# snapshot(): Print a snapshot of the current contents of the
#             Avoid and Sites Tables.  Used only for debugging.
#
sub snapshot
{
    local($rwtag, $idx);

    print "Snapshot of Avoid Table ($AvoidNum entries):\n";
    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        if ($AvoidURL[$idx])
        {
            printf("%4d %s %-5s %s [%s]\n", $idx, $AvoidSave[$idx],
                   $AvoidType[$idx], $AvoidURL[$idx], $AvoidExp[$idx]);
        }
    }
    print "Snapshot of Sites Table ($SitesNum entries):\n";
    for ($idx = 1; $idx <= $SitesNum; $idx++)
    {
        $rwtag = (($SitesSave[$idx] && 'W') || 'R');
        printf("%4d %s %s [%s]\n", $idx, $rwtag,
               $SitesAddr[$idx], $SitesExp[$idx]);
    }
}

# ==========================================================================
# avoid(): Add the given url to the avoid table with the expiration date
#          given ('' for default, '*' for never) and a read/write tag
#          indicating whether or not to save the entry at process end.
#
sub avoid
{
    local($url, $expdate, $rwtag) = @_;
    local($date, $pos, $old);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url eq $old)                   # URL duplicates an old one?
        {
            if (($AvoidType[$idx] ne $avoidTag) ||
                ($AvoidSave[$idx] ne 'W')       ||
                ($AvoidExp[$idx] ne '*'))
            {
                $pos = $idx;
                last;
            }
            else { return; }
        }
        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return if (($AvoidType[$idx] eq $avoidTag) &&
                       (($rwtag ne 'W') || ($AvoidSave[$idx] eq 'W')));
            next;
        }
        if ($old =~ m#^$url#)               # URL more general than an old one?
        {
            if (($AvoidType[$idx] ne $avoidTag) ||
                ($AvoidSave[$idx] ne 'W')       ||
                ($AvoidExp[$idx] ne '*')        ||
                (($date eq '*') && ($rwtag eq 'W')))
            {
                $pos = $idx;
                last;
            }
            else { next; }
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $avoidTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = $date;
    $AvoidSave[$pos] = $rwtag;
}
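
# --------------------------------------------------------------------------
# For illustration only (hypothetical URLs, not part of the original file):
# after the calling program makes the calls
#
#     &momavoid'avoid('http://www.example.edu/private/', '*', 'W');
#     &momavoid'leaf('http://www.example.edu/archive/',  '',  'R');
#
# checkurl() below would return 2 for any URL beginning with the first
# prefix (no access allowed) and 1 for URLs beginning with the second
# prefix that match no avoid entry (test the URL but don't traverse it).
# --------------------------------------------------------------------------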

# ==========================================================================
# leaf(): Add the given url to the leaf table with the expiration date
#         given ('' for default, '*' for never) and a read/write tag
#         indicating whether or not to save the entry at process end.
#
sub leaf
{
    local($url, $expdate, $rwtag) = @_;
    local($date, $pos, $old);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url eq $old)                   # URL duplicates an old one?
        {
            return if ($rwtag ne 'W');
            return if (($AvoidSave[$idx] eq 'W') && ($AvoidExp[$idx] eq '*'));
            next   if ($AvoidType[$idx] eq $avoidTag);
            $pos = $idx;
            last;
        }
        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return if ($rwtag ne 'W');
            return if (($AvoidSave[$idx] eq 'W') && ($AvoidExp[$idx] eq '*'));
            return if (($AvoidType[$idx] eq $avoidTag) &&
                       (($AvoidSave[$idx] eq 'W') || ($AvoidExp[$idx] eq '*')));
            next;
        }
        if ($old =~ m#^$url#)               # URL more general than an old one?
        {
            next if ($AvoidType[$idx] eq $avoidTag);
            next if (($AvoidSave[$idx] eq 'W') &&
                     (($rwtag ne 'W') || ($AvoidExp[$idx] eq '*')));
            $pos = $idx;
            last;
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $leafTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = $date;
    $AvoidSave[$pos] = $rwtag;
}

# ==========================================================================
# exclude(): Add the given url to the leaf table for the duration of the
#            current traversal (useful for delineating the bounds of a
#            Site traversal process).
#
sub exclude
{
    local($url) = @_;
    local($pos, $old);

    # All existing avoid/leaf entries must be checked first for
    # duplication/overlapping.

    undef $pos;

    foreach $idx (1 .. $AvoidNum)
    {
        if (!$AvoidURL[$idx])               # Fill any gaps
        {
            $pos = $idx;
            next;
        }
        $old = $AvoidURL[$idx];

        if ($url =~ m#^$old#)               # URL less general than an old one?
        {
            return;
        }
    }
    if (!defined($pos)) { $pos = ++$AvoidNum; }

    $AvoidType[$pos] = $leafTag;
    $AvoidURL[$pos]  = $url;
    $AvoidExp[$pos]  = '';
    $AvoidSave[$pos] = 'E';
}

# ==========================================================================
# clear_excludes(): Remove all "excluded" URLs from the leaf table, presumably
#                   because we just finished an infostructure traversal.
#
sub clear_excludes
{
    local($pos, $old);

    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        next unless ($AvoidURL[$idx] && ($AvoidSave[$idx] eq 'E'));
        undef $AvoidType[$idx];
        undef $AvoidURL[$idx];
        undef $AvoidExp[$idx];
        undef $AvoidSave[$idx];
    }
}

# ==========================================================================
# addsite(): Add the given site to the site table with the expiration date
#            given ('' for default, '*' for never) and a read/write tag
#            indicating whether or not to save the entry at process end.
#
sub addsite
{
    local($site, $expdate, $rwtag) = @_;
    local($date, $idx);

    if (!$expdate)              # Expire at default days from now
    {
        $date = &wwwdates'wtime((time + $SitesCheck),'GMT');
    }
    else                        # or just when expdate indicates we should
    {
        $date = $expdate;
    }

    $idx = $Sites{$site};
    if (!$idx)
    {
        $idx = ++$SitesNum;
        $Sites{$site} = $idx;
    }
    $SitesAddr[$idx] = $site;
    $SitesExp[$idx]  = $date;
    $SitesSave[$idx] = ($rwtag eq 'W');
}

# ==========================================================================
# checkurl(): Check the url to see whether there are any restrictions
#             on its access and return
#                 0 -> no restrictions
#                 1 -> Leaf  (okay to test, but don't traverse)
#                 2 -> Avoid (no access allowed)
#
sub checkurl
{
    local($url) = @_;
    local($site, $idx, $found, $prefix);

    if (($url =~ /^http:/) && ($site = &wwwurl'get_site($url)))
    {
        &checksite($site);      # Has the side effect of updating avoid table
                                # if the site has not already been checked.
    }
    $found = 0;
    for ($idx = 1; $idx <= $AvoidNum; $idx++)
    {
        next unless ($AvoidURL[$idx]);
        $prefix = $AvoidURL[$idx];
        if ( $url =~ m#^$prefix# )
        {
            if ($AvoidType[$idx] eq $avoidTag) { return 2; }
            $found = 1;
        }
    }
    return $found;
}
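
# --------------------------------------------------------------------------
# For illustration only (robot name and paths are made-up examples, not part
# of the original file): checksite() below retrieves the site's $RobotsURL
# file ("/robots.txt" by default) and obeys records of the following form,
# where a record naming this robot's own User-Agent overrides the default
# "*" record:
#
#     # robots.txt for www.example.edu
#     User-Agent: *
#     Disallow: /tmp/
#
#     User-Agent: momspider
#     Disallow: /cgi-bin/
#     Disallow: /private/
#
# Each applicable Disallow path is turned into an Avoid entry via avoid().
# --------------------------------------------------------------------------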

# ==========================================================================
# checksite(): Check the site table to see if this site has already been
#              checked for restrictions.  If it hasn't, perform the check on
#              that site using the RobotsNotWanted protocol and update the
#              avoid and sites tables.  See the format description at .
#
sub checksite
{
    local($site) = @_;
    local($url, $headers, %headers, $content, $response, $my_name,
          $agent, $drs, $in_def, $in_mine, $def_dr, $my_dr);

    return if defined($Sites{$site});   # Return if RobotsURL has been checked

    $my_name = &www'get_def_header('http','User-Agent');
    if (!$my_name)
    {
        die "No User-Agent has been specified, stopped";
    }
    $url      = "http://$site$RobotsURL";
    $headers  = '';     # The response headers will be returned here
    %headers  = ();     # The parsed response headers will be returned here
    $content  = '';     # The response content will be returned here

    print "Checking for $url ... ";
    $response = &www'request('GET', $url, *headers, *content, $Timeout);
    print $response,"\n";

    &addsite($site, $headers{'expires'}, 'W');  # This site has been checked

    return unless ($response == $wwwerror'RC_ok);

    $my_name =~ s#/.*##;    # Remove any version or library information

    $in_def  = 0;           # Keep track of default '*' record boundaries
    $in_mine = 0;           # Keep track of my own record boundaries
    $def_dr  = '';          # Store default disallow names
    $my_dr   = '';          # Store my own disallow names

    foreach (split(/\n/, $content))
    {
        next if (/^\s*#/);          # Ignore lines containing only a comment
        s/\s*#.*//;                 # Remove any other comments

        if (/^\s*$/)                # Records are separated by blank lines
        {
            last if ($in_mine);
            $in_def = 0;
        }
        elsif (/^User-Agent:\s*(.*)$/i)     # List of robot names
        {
            next if ($in_mine);
            $agent = $1;
            if ($agent =~ /\b$my_name\b/i)
            {
                $in_mine = 1;
                next;
            }
            if ($agent =~ /^\*/) { $in_def = 1; }
        }
        elsif (/^Disallow:(.*)$/i)          # List of URLs to avoid
        {
            next unless ($in_def || $in_mine);
            $drs = $1;
            if ($in_mine)
            {
                $my_dr .= ' ' . $drs;
                next;
            }
            if ($in_def) { $def_dr .= ' ' . $drs; }
        }
    }
    if ($in_mine) { $def_dr = $my_dr; }     # My own record takes precedence

    if ($def_dr !~ /^\s*$/)
    {
        foreach $drs (split(' ', $def_dr))
        {
            &avoid(&wwwurl'absolute($url,$drs), $headers{'expires'}, 'W');
        }
        &save($IsaveAvoid, $IsaveSites);    # Save to file
    }
}

# ==========================================================================

1;
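
# ==========================================================================
# Usage sketch, for illustration only (the variables holding the filenames
# are assumptions, not part of this package).  A robot program that has
# already set its default HTTP User-Agent header through the www.pl package
# (see &www'get_def_header in checksite above) might use momavoid roughly
# as follows:
#
#     require "momavoid.pl";
#
#     &momavoid'load($system_avoid, $system_sites, 'R');  # read-only entries
#     &momavoid'load($user_avoid,   $user_sites,   'W');  # user's own entries
#
#     $restrict = &momavoid'checkurl($url);
#     if    ($restrict == 2) { }  # Avoid: do not access the URL at all
#     elsif ($restrict == 1) { }  # Leaf:  test it, but don't traverse links
#     else                   { }  # no restrictions
#
#     &momavoid'save($user_avoid, $user_sites);   # write back the 'W' entries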