Script Library: 1238 scripts
 

site-check.r

REBOL [ Title: "Web Site Checker" Date: 11-June-2004 ;16-May-2001 Version: 1.1.1 File: %site-check.r Author: "Carl Sassenrath" Purpose: {Scan a web site looking for missing pages, remote links, email links, etc. Helps you clean up sites.} Email: %carl--rebol--com library: [ level: 'intermediate platform: 'all type: 'tool domain: [web file-handling markup parse] tested-under: none support: none license: none see-also: none ] ] ; 1.1.1 - Fixes problem when top-level relative paths are used ( /index.html /about.html etc) ;--Config: base-url: http://www.rebol.com threshold: 8000 ; used to filter out huge pages exclude-urls: [ ; URL patterns for pages to exclude http://www.rebol.com/library http://www.rebol.com/docs/core23 http://www.rebol.com/dictionary http://www.rebol.com/users.html http://www.rebol.com/docs/dictionary ] ;--Lists: base-str: form base-url scanned-urls: [] missing-urls: [] remote-urls: [] local-urls: [] secure-urls: [] email-urls: [] ftp-urls: [] ref-urls: [] ; pairs of: url and referrer ;--Functions: html?: func [url /local t] [ all [ t: find/last/tail url "." t: to-string t any [t = "htm" t = "html"] ] ] add-url: func [urlset url from /local t] [ clear find url "#" if all [ not find url "?" html? url ][ append urlset url repend ref-urls [url form from] ; second is string ] ] scan-page: func [url /local tag page new path] [ print ["Scanning:" url length? local-urls length? missing-urls] append scanned-urls url foreach u exclude-urls [if find/match url u [print "(excluded)" exit]] path: either html? url [first split-path url][url] if error? try [page: load/markup url][append missing-urls url exit] if (length? page) > threshold [exit] ; big page, skip it. foreach tag page [ if all [ tag? tag tag: parse tag "=" tag: select tag "HREF" ][ new: to-url tag parse/all tag [ "#" | base-str (add-url local-urls new url) | "/" (add-url local-urls base-url/:new url) | ;1.1.1 "http:" (append remote-urls new) | "https:" (append secure-urls new) | "ftp:" (append ftp-urls new) | "mailto:" (append email-urls new) | none (add-url local-urls path/:new url) ] ] ] remote-urls: unique remote-urls local-urls: unique local-urls secure-urls: unique secure-urls email-urls: unique email-urls ftp-urls: unique ftp-urls ] ;--Main code: scan-page base-url while [pick urls: exclude local-urls scanned-urls 1][ scan-page pick urls 1 ] out: reform ["Site Summary for" base-url "on" now newline] sort scanned-urls repend out "^/Scanned Pages:^/" foreach url scanned-urls [repend out [url newline]] sort remote-urls repend out "^/Remote Links:^/" foreach url remote-urls [repend out [url newline]] sort email-urls repend out "^/Email Links:^/" foreach url email-urls [repend out [url newline]] repend out "^/References:^/" foreach [url url2] ref-urls [repend out [url2 " -> " url newline]] repend out "^/Missing Pages:^/" foreach url missing-urls [ n: ref-urls repend out ["Missing URL:" url newline] while [n: find n url] [ repend out [tab "Ref from:" n/2 newline] n: next n ] ] write %site-summary.txt out browse %site-summary.txt
halt ;; to terminate script if DO'ne from webpage
Notes
  • email address(es) have been munged to protect them from spam harvesters. If you are a Library member, you can log on and view this script without the munging.
  • (carl:rebol:com)