View in color | License | Download script | History | Other scripts by: peterwood |
7-Oct 3:08 UTC
[0.063] 16.246k
[0.063] 16.246k
extract-web-links.rREBOL[
Title: "extract-web-links"
Version: 1.2.1
Date: 18-04-2005
Author: "Peter WA Wood"
Copyright: "Peter WA Wood"
File: %extract-web-links.r
Purpose: {A function which scans a string (normally a web page)
and creates a block of URL/Text combinations for each
HTML <a> tag in the string.}
Usage: { example: extract-web-links read http://www.rebol.org }
Library: [
level: 'beginner
type: 'function
domain: [web html markup]
platform: 'all
tested-under: [ core 2.5.6.2.4 "Mac OS X 10.2.8"
core 2.5.6.3.1 "Windows XP Professional"
view 1.2.10.3.1 "Windows XP Professiona"]
support: none
license: cc-by
{see http://www.rebol.org/cgi-bin/cgiwrap/rebol/license-help.r}
]
]
extract-web-links: func [
{scans a string (normally a web page)
and returns a block of URL (url!) / Text (string!) combinations for
each HTML <a> tag in the string}
web-page [string!]
"The string from which to extract web links"
/only-urls
"Only URLs are returned in the block"
/only-descriptions
"Only the supplied descriptions of the links are returned"
/local
result-block
"Block for harvesting URLs and descriptions"
collected-url
"Used to harvest individual URLs"
collected-desc
"Used to harvest individual descriptions"
end-pos
"end position of selection of web-page"
][
it: [ ; the main processing of the function
do initialisation
until [ ; end of web-page is reached
;Get the next URL in web-page
web-page: find/tail web-page "<a" ; Position after <a
; Find is case-insensitive
; by default.
; So this captures <A> taqs too
if web-page [ ; <a> tag found
if (not only-descriptions) [ ; URLs requested
do harvest-url ; Harvest the URL
either collected-url ; Add any URL to result block
[append result-block to-URL collected-url]
[break] ; skip to next url
]
if not only-urls [ ; Descriptions wanted ?
do harvest-desc
append result-block collected-desc
]
]
web-page = none ; test for end of web-page
] ; end until
return result-block
] ; end it
;;=======================================================================
initialisation: [
result-block: make block! [] ; initialise result block
; If both "only" refinements are set, turn them off.
; Using both refinements has the same effect as using neither.
; This allows the remainder of the code to treat the refinements as
; being mutually exclusive.
if all [only-urls only-descriptions] [
only-urls: none
only-descriptions: none
]
] ; end intialisation
;;=======================================================================
harvest-url: [ ; section to harvest URL
collected-url: copy ""
web-page: find/tail web-page "href=" ; move to char after href=
either web-page [ ; check href present
; Find start of URl
; href may be full or relative URL
; Skip opening quote if full URL
if (first web-page) = #"^""
[web-page: next web-page]
end-pos: find web-page ">" ; find end of <a> tag
either end-pos [ ; end of <a> tag found
collected-url: copy/part web-page end-pos
if (last collected-url) = #"^"" ; remove trailing quote
[collected-url: head remove back tail collected-url]
][ ; no closing > for <a> tag !!
collected-url: none
]
][ ; no href !!!
collected-url: none
]
] ; end harvest-url
;;======================================================================
harvest-desc: [ ; section to harvest description
collected-desc: copy ""
web-page: find web-page ">" ; move to end of <a> tag
if web-page [
web-page: next web-page ; move past >
if find web-page "</a>" [ ; look for closing tag
end-pos: find web-page "</a>" ; set end-pos at <
;check for img tag
either find/part web-page "<img" end-pos [
do harvest-image-desc
][ ; text description
collected-desc: copy/part web-page end-pos
do strip-embedded-tags
]
]
]
] ; end harvest-desc
;;========================================================================
harvest-image-desc: [
collected-desc: copy "Image: "
if find/part web-page "alt=^"" end-pos [ ; Alt attribute ?
web-page: find/tail web-page "alt=^""
if find web-page "^"" ; Alt ends with "
[append collected-desc copy/part web-page find web-page "^""]
]
] ; end harvest-image-desc
;;========================================================================
strip-embedded-tags: [
;; strips embedded html tags from collected-desc
while [ all [ (find collected-desc "<") (find collected-desc ">")]][
collected-desc: find collected-desc "<"
remove remove/part collected-desc find collected-desc ">"
]
collected-desc: head collected-desc ; set index at start
] ; strip-embedded-tags
;;========================================================================
do it ; execute the function code
] ; end extract-web-links
;; History
;;
;; 1.1.0 23-Dec-2004 Initial release
;; 1.2.0 16-Jan-2004 Usage, copyright added to header
;; 1.2.1 19-Apr-2004 Tested under View 1.2.10.3.1
;;
;; |