#!/usr/bin/env python # # Copyright (c) 2001 Vivake Gupta (vivakeATomniscia.org). All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA # # This software is maintained by Vivake (vivakeATomniscia.org) and is available at: # /~vivake/python/RobotRules.py """ RobotRules - parses robots.txt files NB: This is a straight conversion of the perl module WWW::RobotRules This module parses a robots.txt file as specified in "A Standard for Robot Exclusion", described in http://www.robotstxt.org/wc/exclusion.html Webmasters can use the robots.txt file to disallow conforming robots access to parts of their web site. The parsed file is kept in the RobotRules object, and this object provides methods to check if access to a given URL is prohibited. The same RobotRules object can parse multiple robots.txt files. The module exports only one class: RobotRules. Sample usage follows: import RobotRules rr = RobotRules.RobotRules('MOMspider/1.0') import urllib url = 'http://some.place/robots.txt' robots_txt = urllib.urlopen(url).read() rr.parse(url, robots_txt) url = 'http://some.other.place/robots.txt' robots_txt = urllib.urlopen(url).read() rr.parse(url, robots_txt) # Now we are able to check if a URL is valid for those servers that # we have obtained and parsed "robots.txt" files for. if rr->allowed(url): c = urllib.urlopen(url).read() ... """ __version__ = "$Revision: 1.2 $" # $Id: RobotRules.py,v 1.2 2003/12/31 01:25:36 vivake Exp $ import re import sys import time import string import urlparse class RobotRules: """RobotRules class This class implements all of the functionality of the RobotRules module. The following methods are provided: RobotRules.RobotRules(ua) RobotRules.parse(url, txt, fresh_until = None) RobotRules.is_me(ua) RobotRules.allowed(url) """ ua = "" loc = {} def __init__(self, ua): """Constructor. Receives name of the robot.""" self.agent(ua) def parse(self, url, txt, fresh_until = None): """ Do the actual robots.txt file parsing. Keyword arguments: url -- URL of the robots.txt file txt -- Actual contents of the file. fresh_until -- Optional. When the data expires. """ netloc = urlparse.urlparse(url)[1] self.clear_rules(netloc) if not fresh_until: fresh_until = time.time() + 365*24*3600 self.fresh_until(netloc, fresh_until) ua = None is_me = 0 is_anon = 0 my_disallowed = [] anon_disallowed = [] # blank lines are significant, so turn CRLF into LF to avoid generating # false ones txt = re.sub(r'\r\n', r'\n', txt) # split at \n or \r (Mac text files just have CR for EOL) for line in re.split(r'[\r\n]', txt): # Lines containing only a comment are discarded completely, and # therefore do not indicate a record boundary. if re.search(r'^\s*\#', line): continue line = re.sub(r'\s*\#.*', '', line) # remove comments at end-of-line if re.search(r'^\s*$', line): # blank line if is_me == 1: # That was our record. No need to read the rest. break is_anon = 0 elif re.search(r'^User-Agent:\s*(.*)', line, re.IGNORECASE): mo = re.search(r'^User-Agent:\s*(.*)', line, re.IGNORECASE) ua = mo.group(1) ua = re.sub(r'\s+$','', ua) if is_me: # This record already had a User-agent that # we matched, so just continue. pass elif ua == '*': is_anon = 1 elif self.is_me(ua): is_me = 1 elif re.search(r'^Disallow:\s*(.*)', line, re.IGNORECASE): mo = re.search(r'^Disallow:\s*(.*)', line, re.IGNORECASE) if not ua: sys.stderr.write("RobotRules: Disallow without preceding User-agent\n") is_anon = 1 # assume that User-agent: * was intended disallow = mo.group(1) disallow = re.sub(r'\s+$','', disallow) if len(disallow): disallow = urlparse.urlunparse(urlparse.urlparse(urlparse.urljoin(url, disallow))) if is_me: me_disallowed.append(disallow) elif is_anon: anon_disallowed.append(disallow) else: sys.stderr.write("RobotRules: Unexpected line:", line, "\n") if is_me: self.push_rules(netloc, me_disallowed) else: self.push_rules(netloc, anon_disallowed) def is_me(self, ua): """Return 1 if the given name matches the name of this robot""" if string.find(ua, self.agent()) >= 0: return 1 def allowed(self, url): """Return 1 if this robot is allowed to retrieve this URL""" netloc = urlparse.urlparse(url)[1] fresh_until = self.fresh_until(netloc) if not fresh_until or fresh_until < time.time(): return -1 str = urlparse.urlunparse(urlparse.urlparse(url)) for rule in self.rules(netloc): if string.find(str, rule) == 0: return 0 return 1 def agent(self, name = None): """Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt rules and expire time out of the cache. """ old = self.ua if name: self.loc = {} # all old info is now stale name = re.sub(r'/?\s*\d+.\d+\s*$', '', name) self.ua = name return old def visit(self, netloc, time = None): if not time: time = time.time() if not self.loc.has_key(netloc): self.loc[netloc] = {} self.loc[netloc]['last'] = time if not self.loc[netloc].has_key('count'): self.loc[netloc]['count'] = 0 count = self.loc[netloc]['count'] if not count: count = 1 else: count = count + 1 def no_visits(self, netloc): if not self.loc.has_key(netloc): self.loc[netloc] = {} if not self.loc[netloc].has_key('count'): self.loc[netloc]['count'] = 0 return self.loc[netloc]['count'] def last_visit(self, netloc): if not self.loc.has_key(netloc): self.loc[netloc] = {} if not self.loc[netloc].has_key('last'): self.loc[netloc]['last'] = 0 return self.loc[netloc]['last'] def fresh_until(self, netloc, fresh_until = None): if not self.loc.has_key(netloc): self.loc[netloc] = {} if not self.loc[netloc].has_key('fresh'): self.loc[netloc]['fresh'] = 0 old = self.loc[netloc]['fresh'] if fresh_until: self.loc[netloc]['fresh'] = fresh_until return old def push_rules(self, netloc, rules): if not self.loc.has_key(netloc): self.loc[netloc] = {} if not self.loc[netloc].has_key('rules'): self.loc[netloc]['rules'] = [] self.loc[netloc]['rules'] = self.loc[netloc]['rules'] + rules def clear_rules(self, netloc): if not self.loc.has_key(netloc): self.loc[netloc] = {} self.loc[netloc]['rules'] = [] def rules(self, netloc): if not self.loc.has_key(netloc): self.loc[netloc] = {} if not self.loc[netloc].has_key('rules'): self.loc[netloc]['rules'] = [] return self.loc[netloc]['rules'] def dump(self): print 'User-agent', ':', self.ua for key in self.loc.keys(): rules = self.rules(key) print key, ':', rules