#!/usr/local/bin/rebol -cs
REBOL [
    Title: "Server object handler script"
    File: %server.r
    Author: [ "HY" ]
    Purpose: {
        This script will parse robot.txt files and store a hash! of forbidden paths.
        This is very useful for webbots or spiders of any kind
        (at least if obeying robot exclusion standards is desirable).
        I looked at the ht://dig package ( script that does the same thing as this script
        to see if I had overlooked something, and thereby fixed a few bugs.
    }
    Date: 16-Aug-2003
    Examples: {
        politebot: server "PoliteBot"
        politebot/add reduce ["" read "" read ]
        print politebot/forbidden?
        print politebot/forbidden?
    }
    History: [
        19-Oct-2004 {Removed a bug that caused this path: /rebol/index.html
            to be forbidden if robots.txt had Disallow: /i
            I should do something about port numbers in URLs as well.}
        23-Sep-2004 {Added the library header}
        22-Apr-2004 {Removed a bug that modified an incoming variable
            (from other scripts) directly.}
    ]
    Library: [
        level: 'intermediate
        domain: [parse http text-processing web]
        license: none
        Platform: 'all
        Tested-under: none
        Type: [module]
        Support: none
    ]
]

; Prototype object for a robots.txt-aware "server" handler.
; Use the SERVER shortcut below to create an instance for a named bot.
server-handler-object: context [

    ; 'context should automatically pick up set-words
    ; and bind them to the new local context, so:

    ; Name matched against the User-agent lines of robots.txt files.
    bot-name: ""

    ; Flat table of  host  [disallowed-path-prefix ...]  pairs.
    patterns: make hash! copy []

    add: func [
        "Parses robots.txt texts and records the Disallow prefixes that apply to this bot"
        block [block!] "Flat block of host / robots.txt-text pairs (NONE text counts as empty)"
        /local seen-my-name? pay-attention? my-patterns comment-start sep name rest
    ] [
        foreach [host robots-file] block [
            seen-my-name?: false
            pay-attention?: false
            my-patterns: copy []
            if none? robots-file [robots-file: ""]

            foreach line parse/all robots-file "^/" [
                ; Drop everything from the first "#" on (robots.txt comment).
                if comment-start: find line "#" [clear comment-start]

                ; Split "field: value" on the FIRST colon only, so that a
                ; value which itself contains ":" is not truncated.
                either sep: find line ":" [
                    name: trim copy/part line sep
                    rest: trim copy next sep
                ] [
                    name: trim copy line
                    rest: copy ""
                ]

                if name = "user-agent" [
                    pay-attention?: either all [rest = "*" not seen-my-name?] [
                        ; generic section; used until a section naming us shows up
                        true
                    ] [
                        either all [rest = bot-name not seen-my-name?] [
                            seen-my-name?: true
                            clear my-patterns ; ignore previous messages for "*"
                            true
                        ] [
                            ; either a second section with our name (only the
                            ; first one counts) or none of our business
                            false
                        ]
                    ]
                ]

                if all [pay-attention? name = "disallow" not empty? rest] [
                    append my-patterns rest
                ]
            ]

            append patterns reduce [host my-patterns]
        ]
    ] ; end add

    got-robots-file?: func [
        "Tells whether or not we have the robots.txt file for a given host"
        host [string!] "The hostname to look for"
    ] [
        ; Result of FIND on the table: a position when found, NONE otherwise.
        find patterns host
    ]

    forbidden?: func [
        "Tells whether or not a given URL is forbidden by the robots.txt file"
        in-url [object! string! url!]
            {The URL to check. This must be a complete URL. If this is an object, it is assumed to be a url-handler object.}
        /local url path pattern
    ] [
        ; Don't modify the incoming variable! Only series are copied:
        ; objects are used as given (COPY rejects object! in REBOL 2).
        url: either object? in-url [in-url] [copy in-url]
        if url? url [url: to-string url]
        if string? url [
            if "" = url [return false] ; this might be just a bit too quick ...
            ; assume this is a complete url:
            url: url-handler url
        ]
        ; allowing objects, but assuming they are url-handler objects
        ; (the fields PLAIN, PROTOCOL and HOST are read below)

        ; Everything after protocol + host is the path to test.
        path: find/match url/plain join url/protocol url/host
        ; No match (e.g. the plain URL carries something between protocol
        ; and path that HOST does not include): nothing to compare against.
        if none? path [return false]

        pattern: select patterns url/host
        if none? pattern [return false] ; Maybe load robots.txt instead, to check for real?

        ; A pattern forbids the path only when the path STARTS with it.
        foreach p pattern [
            if find/part path p length? p [return true]
        ]
        false
    ]
]

; shortcut:
server: func [
    "Shorthand for make server-handler-object [ bot-name: name ]"
    name [string!] "The bot's name. This is the name that we will look for in the robots.txt files."
] [
    make server-handler-object [
        bot-name: name
        ; Start every bot with its own empty table instead of depending on
        ; how MAKE clones the prototype's hash!.
        patterns: make hash! []
    ]
]
halt ;; terminate the script here if it is DO'ne from a web page