[REBOL] Re: parsing rebsites
From: oliva:david:seznam:cz at: 4-Aug-2002 15:42
Hello Anton,
here is the first (not yet tested online) version of my simple Rebol-sites bot:
<code>
rebol [
title: "Rebol Reb-sites-BOT"
file: %reb-reb-bot.r
author: 'Oldes
email: [oliva--david--seznam--cz]
purpose: {Simple BOT for travelling thru Reb-sites - uses read-thru so downloads these
files as well:-)}
version: 0.0.1
date: 4-Aug-2002/15:27:23+2:00
]
path-thru: func [
    {Build the disk-cache path for a URL. A file! argument is returned as is;
    none is returned when the URL cannot be decoded or has no host.}
    url /local parts port-part
][
    ; Local files are never cached, hand them straight back.
    if file? url [return url]
    either all [parts: decode-url url  parts/host] [
        ; An explicit port is kept in the cache directory name, with the
        ; colon written as %3A.
        port-part: either none? parts/port-id [""] [join "%3A" parts/port-id]
        rejoin [
            view-root/public slash parts/host port-part
            slash any [parts/path ""] any [parts/target ""]
        ]
    ][
        none
    ]
]
; Ask for the crawl's starting point; an empty answer selects the default index.
page-url: either empty? page-url: to-url ask "Start URL (default is: http://www.rebol.com/index.r): " [
    http://www.rebol.com/index.r
][
    page-url
]

; ctx-viewtop may still be an unevaluated block; turn it into an object so
; its functions can be called by path.
if block? ctx-viewtop [ctx-viewtop: context ctx-viewtop]

; Work queue of pages still to visit, and the list of pages already taken
; from the queue.
files-searched: make block! 1000
files-to-search: make block! 1000
append files-to-search page-url
read-file: func [
    {Read a net file through the disk cache. Returns binary, else none on error.}
    url [url!]
    /update "Force update from source site"
    /check "Update only if date/size do not match."
    /local info data
] either connected? [
    ; The body is chosen once, when the function is created.
    [
        ; online version
        if check [
            ; Any failure while asking for or using the remote size/date
            ; is reported as none.
            if error? try [
                info: info? url
                return read-thru/check url reduce [info/size info/date]
            ][
                return none
            ]
        ]
        if update [return read-thru/update url]
        read-thru url
    ]
][
    [
        ; offline version: only the cached copy is read, refinements are ignored
        either error? try [data: read/binary path-thru url] [none] [data]
    ]
]
; Crawl loop: files-to-search is the work queue and files-searched the list of
; pages already taken from it. Each pass handles the head of the queue and
; removes it at the end.
while [not empty? files-to-search][
    ; Record the page as visited before fetching it, so links back to it
    ; are not queued again.
    insert tail files-searched page-url: first files-to-search
    if not none? page: read-file/check page-url [
        ; One malformed index must not stop the whole crawl, so loading and
        ; parsing are guarded. On failure obj stays none and the page is skipped.
        obj: none
        if error? try [
            page: load/all to-string page
            obj: ctx-viewtop/parse-index page
        ][
            print reform ["BAD INDEX:" page-url]
        ]
        if obj [ ; index file parsed ok?
            ; Base URL used to turn relative (file!) items into full URLs.
            purl: decode-url page-url
            base-href: rejoin [
                http:// purl/host
                either none? purl/port-id [""][join ":" purl/port-id]
                #"/" any [purl/path ""]
            ]
            print reform ["NEW INDEX:" page-url]
            print reform ["TITLE:" obj/title "ICONS:" length? obj/icons]
            foreach icon obj/icons [
                if file? icon/item [icon/item: join base-href icon/item]
                either icon/type = 'folder [
                    print reform [tab "FOLDER:" icon/item]
                    ; Queue a folder only once: skip it when it was already
                    ; visited OR is already waiting in the queue.
                    if not any [
                        find files-searched icon/item
                        find files-to-search icon/item
                    ][
                        insert tail files-to-search icon/item
                    ]
                ][
                    ; place your own code for other file types:
                    print reform [tab uppercase to-string icon/type #":" icon/item]
                    ; You will probably want to download them as well :-)
                    if url? icon/item [
                        read-file/check icon/item
                    ]
                ]
            ]
        ]
    ]
    remove files-to-search
]
</code>