Mailing List Archive: Re: parsing rebsites

[REBOL] Re: parsing rebsites

From: oliva:david:seznam:cz at: 4-Aug-2002 15:42


Hello Anton,

here is the first (not yet tested online) version of my simple Rebol-sites bot:

<code>

; Script header: descriptive metadata only (title, file name, author, contact,
; purpose, version and timestamp). Nothing in this block is executed by the bot.
rebol [
    title: "Rebol Reb-sites-BOT"
    file: %reb-reb-bot.r
    author: 'Oldes
    email: [oliva--david--seznam--cz]

    purpose: {Simple BOT for travelling thru Reb-sites - uses read-thru so downloads these 
    files as well:-)}
    version: 0.0.1
    date: 4-Aug-2002/15:27:23+2:00
]

path-thru: func [url /local purl][
    ; A file! value is already a local path: hand it back untouched.
    if file? url [return url]

    ; Anything that does not decode to a URL with a host has no cache location.
    purl: decode-url url
    if any [not purl  not purl/host] [return none]

    ; Cache path layout: <view-root/public>/<host>[%3A<port>]/<path><target>
    ; The port, when present, is appended to the host with an escaped colon
    ; ("%3A"); missing path/target parts collapse to empty strings.
    rejoin [
        view-root/public
        slash
        purl/host
        either not none? purl/port-id [join "%3A" purl/port-id][""]
        slash
        any [purl/path ""]
        any [purl/target ""]
    ]
]

; Ask for the starting index; an empty answer selects the default rebol.com index.
if empty? page-url: to-url ask "Start URL (default is: http://www.rebol.com/index.r): " [
    page-url: http://www.rebol.com/index.r
]

; ctx-viewtop may still be an unevaluated block at this point; turn it into an
; object so that its parse-index function can be called by path.
if block? ctx-viewtop [ctx-viewtop: make object! ctx-viewtop]

; Work queue of index URLs still to visit, and the list of those already taken
; from the queue.
files-searched: make block! 1000
files-to-search: make block! 1000

; Seed the queue with the starting URL.
append files-to-search page-url

; read-file is built once, at definition time: `either connected?` selects
; which of the two bodies below becomes the function body, so the choice
; between online and offline behaviour is fixed when this line is evaluated.
read-file: func [
    {Read a net file from thru the disk cache. Returns binary, else none on error.}
    url [url!]
    /update "Force update from source site"
    /check  "Update only if date/size do not match."
    /local info
] either connected? [
    [
        ; --- online body ---
        ; /check: look up the remote size and date and let read-thru decide
        ; whether the cached copy is still good. Any error (including a failed
        ; info? lookup) yields none.
        if check [
            if error? try [
                info: info? url
                return read-thru/check url reduce [info/size info/date]
            ][
                return none
            ]
        ]
        ; /update: always fetch from the source site.
        if update [return read-thru/update url]
        ; Plain call: defer entirely to read-thru.
        read-thru url
    ]
][
    [
        ; --- offline body ---
        ; No connection: read straight from the cache location computed by
        ; path-thru. The /update and /check refinements are not consulted here.
        if error? try [
            return read/binary path-thru url
        ][
            return none
        ]
    ]
]

; Main crawl loop: repeatedly take the index URL at the head of
; files-to-search, fetch and parse it, report its icons, queue any folder
; icons as further indexes to visit, and fetch every other URL icon.
; The loop ends when the queue is empty.
while [not empty? files-to-search][

    ; Record the URL as visited before fetching it, so an index that links to
    ; itself is not queued again.
    insert tail files-searched page-url: first files-to-search
    if not none? page: read-file/check page-url [
        ; A malformed index file must not abort the whole crawl: if the
        ; downloaded text cannot be loaded, report it and skip this index.
        if error? try [page: load/all to-string page][
            print reform ["SKIPPED (cannot load):" page-url]
            page: none
        ]
        ; parse-index is only called when the page loaded successfully.
        obj: all [page ctx-viewtop/parse-index page]

        ; Base URL used to resolve relative (file!) icon targets:
        ; http://<host>[:<port>]/<path>
        purl: decode-url page-url
        base-href: rejoin [
            http:// purl/host
            either none? purl/port-id [""][join ":" purl/port-id]
            #"/" any [purl/path ""]
        ]

        if obj [ ; index file parsed ok?
            print reform ["NEW INDEX:" page-url]
            print reform ["TITLE:" obj/title "ICONS:" length? obj/icons]
            foreach icon obj/icons [
                ; Relative targets are made absolute against this index's base.
                if file? icon/item [icon/item: join base-href icon/item]
                either icon/type = 'folder [
                    print reform [tab "FOLDER:" icon/item]
                    ; Queue the folder only if it has been neither visited nor
                    ; already queued; checking the visited list alone would let
                    ; a folder linked from several indexes be crawled repeatedly.
                    if not any [
                        find files-searched icon/item
                        find files-to-search icon/item
                    ][
                        insert tail files-to-search icon/item
                    ]
                ][
                    ; Place your own code for other file types here:
                    print reform [tab uppercase to-string icon/type #":" icon/item]
                    ; You will probably want to download them as well :-)
                    if url? icon/item [
                        read-file/check icon/item
                    ]
                ]
            ]
        ]
    ]
    ; Drop the URL just processed from the head of the queue.
    remove files-to-search
]
</code>