#!/usr/bin/env python
# 
# Copyright (c) 2001 Vivake Gupta (vivakeATomniscia.org).  All rights reserved.
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
#
# This software is maintained by Vivake (vivakeATomniscia.org) and is available at:
#     /~vivake/python/RobotRules.py

""" RobotRules - parses robots.txt files

NB: This is a straight conversion of the perl module WWW::RobotRules

This module parses a robots.txt file as specified in
"A Standard for Robot Exclusion", described in
http://www.robotstxt.org/wc/exclusion.html
Webmasters can use the robots.txt file to disallow conforming
robots access to parts of their web site.

The parsed file is kept in the RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same RobotRules object can parse multiple robots.txt files.

The module exports only one class: RobotRules.  Sample usage follows:

  import RobotRules

  rr = RobotRules.RobotRules('MOMspider/1.0')

  import urllib
  
  url = 'http://some.place/robots.txt'
  robots_txt = urllib.urlopen(url).read()
  rr.parse(url, robots_txt)

  url = 'http://some.other.place/robots.txt'
  robots_txt = urllib.urlopen(url).read()
  rr.parse(url, robots_txt)

  # Now we are able to check if a URL is valid for those servers that
  # we have obtained and parsed "robots.txt" files for.
  if rr.allowed(url):
      c = urllib.urlopen(url).read()
      ...
"""

__version__ = "$Revision: 1.2 $"
# $Id: RobotRules.py,v 1.2 2003/12/31 01:25:36 vivake Exp $

import re
import sys
import time
import string
import urlparse

class RobotRules:

    """RobotRules class

    This class implements all of the functionality of the RobotRules module.

    The following methods are provided:
       RobotRules.RobotRules(ua)
       RobotRules.parse(url, txt, fresh_until = None)
       RobotRules.is_me(ua)
       RobotRules.allowed(url)

    Per-host state is kept in self.loc, a dictionary keyed by network
    location whose values are dictionaries with the optional keys
    'rules' (list of disallowed URL prefixes), 'fresh' (expiry time),
    'last' (time of last visit) and 'count' (number of visits).

    """

    # Class-level defaults, kept for backward compatibility.  __init__
    # gives every instance its own 'loc' dictionary so that instances
    # never share cached rules through this mutable default.
    ua = ""
    loc = {}

    # Patterns used by parse(), compiled once instead of on every line.
    _COMMENT_LINE_RE = re.compile(r'^\s*#')
    _TRAILING_COMMENT_RE = re.compile(r'\s*#.*')
    _BLANK_LINE_RE = re.compile(r'^\s*$')
    _USER_AGENT_RE = re.compile(r'^User-Agent:\s*(.*)', re.IGNORECASE)
    _DISALLOW_RE = re.compile(r'^Disallow:\s*(.*)', re.IGNORECASE)

    def __init__(self, ua):
        """Constructor.  Receives name of the robot."""
        self.ua = ""
        self.loc = {}
        self.agent(ua)

    def _site(self, netloc):
        """Return the state dictionary for netloc, creating it if needed."""
        return self.loc.setdefault(netloc, {})

    def parse(self, url, txt, fresh_until = None):
        """ Do the actual robots.txt file parsing.

        Any rules previously stored for the host of 'url' are replaced.
        If the file contains a record matching this robot, only that
        record's rules are kept; otherwise the rules of the
        'User-agent: *' record(s) are used.  Problems in the file are
        reported on stderr and parsing continues.

        Keyword arguments:
        url -- URL of the robots.txt file
        txt -- Actual contents of the file.
        fresh_until -- Optional.  When the data expires (seconds since
                       the epoch).  Defaults to one year from now.

        """
        netloc = urlparse.urlparse(url)[1]
        self.clear_rules(netloc)
        if not fresh_until:
            fresh_until = time.time() + 365*24*3600
        self.fresh_until(netloc, fresh_until)

        ua = None
        is_me = 0
        is_anon = 0
        me_disallowed = []
        anon_disallowed = []

        # blank lines are significant, so turn CRLF into LF to avoid generating
        # false ones
        txt = txt.replace('\r\n', '\n')

        # split at \n or \r (Mac text files just have CR for EOL)
        for line in re.split(r'[\r\n]', txt):
            # Lines containing only a comment are discarded completely, and
            # therefore do not indicate a record boundary.
            if self._COMMENT_LINE_RE.search(line):
                continue

            # remove comments at end-of-line
            line = self._TRAILING_COMMENT_RE.sub('', line)

            if self._BLANK_LINE_RE.search(line):
                # A blank line ends the current record.
                if is_me:
                    # That was our record.  No need to read the rest.
                    break
                is_anon = 0
                continue

            mo = self._USER_AGENT_RE.search(line)
            if mo:
                ua = mo.group(1).rstrip()
                if is_me:
                    # This record already had a User-agent that
                    # we matched, so just continue.
                    pass
                elif ua == '*':
                    is_anon = 1
                elif self.is_me(ua):
                    is_me = 1
                continue

            mo = self._DISALLOW_RE.search(line)
            if mo:
                if not ua:
                    sys.stderr.write("RobotRules: Disallow without preceding User-agent\n")
                    is_anon = 1      # assume that User-agent: * was intended
                disallow = mo.group(1).rstrip()
                if disallow:
                    # Store rules as absolute, normalised URLs so that
                    # allowed() can do a plain prefix comparison.  An empty
                    # value is stored as-is; allowed() treats it as
                    # "nothing is disallowed".
                    disallow = urlparse.urlunparse(
                        urlparse.urlparse(urlparse.urljoin(url, disallow)))
                if is_me:
                    me_disallowed.append(disallow)
                elif is_anon:
                    anon_disallowed.append(disallow)
                continue

            sys.stderr.write("RobotRules: Unexpected line: %s\n" % line)

        if is_me:
            self.push_rules(netloc, me_disallowed)
        else:
            self.push_rules(netloc, anon_disallowed)

    def is_me(self, ua):
        """Return 1 if the given name matches the name of this robot, else 0.

        The match is a case-insensitive substring test: this robot's
        name (without version) must occur somewhere in 'ua'.

        """
        if self.agent().lower() in ua.lower():
            return 1
        return 0

    def allowed(self, url):
        """Return 1 if this robot is allowed to retrieve this URL.

        Returns 0 if a stored rule disallows the URL, and -1 if there
        are no fresh rules for the URL's host (no robots.txt has been
        parsed for it, or the parsed data has expired).

        """
        netloc = urlparse.urlparse(url)[1]
        fresh_until = self.fresh_until(netloc)
        if not fresh_until or fresh_until < time.time():
            return -1

        target = urlparse.urlunparse(urlparse.urlparse(url))
        for rule in self.rules(netloc):
            if not rule:
                # An empty "Disallow:" line means everything is allowed.
                return 1
            if target.startswith(rule):
                return 0
        return 1

    def agent(self, name = None):
        """Get/set the agent name.  Returns the previous name.

        A trailing version number such as "/1.0" is stripped from the
        name before it is stored.

        NOTE: Changing the agent name will clear the robots.txt
        rules and expire time out of the cache.

        """
        old = self.ua
        if name:
            self.loc = {}                            # all old info is now stale
            self.ua = re.sub(r'/?\s*\d+\.\d+\s*$', '', name)
        return old

    def visit(self, netloc, time = None):
        """Record a visit to netloc.

        Keyword arguments:
        netloc -- network location (host[:port]) that was visited
        time -- Optional.  Time of the visit; defaults to now.

        """
        if not time:
            # The 'time' parameter shadows the time module inside this
            # method, so import the module under another name.
            import time as time_module
            time = time_module.time()
        site = self._site(netloc)
        site['last'] = time
        site['count'] = site.get('count', 0) + 1

    def no_visits(self, netloc):
        """Return the number of visits recorded for netloc."""
        return self._site(netloc).setdefault('count', 0)

    def last_visit(self, netloc):
        """Return the time of the last recorded visit to netloc (0 if none)."""
        return self._site(netloc).setdefault('last', 0)

    def fresh_until(self, netloc, fresh_until = None):
        """Get/set the expiry time of the rules stored for netloc.

        Returns the value that was stored before the call (0 if none).

        """
        site = self._site(netloc)
        old = site.setdefault('fresh', 0)
        if fresh_until:
            site['fresh'] = fresh_until
        return old

    def push_rules(self, netloc, rules):
        """Append the given sequence of rules to those stored for netloc."""
        site = self._site(netloc)
        # Build a new list rather than extending in place, so a list
        # previously returned by rules() is left untouched.
        site['rules'] = site.get('rules', []) + list(rules)

    def clear_rules(self, netloc):
        """Discard all rules stored for netloc."""
        self._site(netloc)['rules'] = []

    def rules(self, netloc):
        """Return the list of rules stored for netloc."""
        return self._site(netloc).setdefault('rules', [])

    def dump(self):
        """Print the agent name and the rules of every known host."""
        print('User-agent : %s' % (self.ua,))
        for key in list(self.loc.keys()):
            print('%s : %s' % (key, self.rules(key)))