Script Library: 1213 scripts
 

extract-urls.r

REBOL [
    Title: "Extract URLs"
    File: %extract-urls.r
    Version: 1.0.0
    Home: http://www.ross-gill.com/
    Date: 29-Nov-2009
    Purpose: "To identify and extract URIs from plain text"
    Author: "Christopher Ross-Gill"
    Library: [
        level: 'intermediate
        platform: 'all
        type: [function module]
        domain: [markup parse text text-processing web]
        tested-under: [view 2.7.6.2.4 view 2.100.95.2.5]
        support: none
        license: 'cc-by-sa
        see-also: http://daringfireball.net/2009/11/liberal_regex_for_matching_urls
    ]
]

; EXTRACT-URLS splits a string into the plain-text runs and the URLs it
; contains, preserving their order.
;
;   txt      [string!]  text to scan
;   returns  [block!]   string! values for text, url! values for links
;
; The character sets, the parse rules and the OUT accumulator are private to
; the enclosing USE context; only the function value is exported.  OUT is
; shared between the function and the rule actions and is reset on each call.
extract-urls: use [out rule word uri space punct chars][
    word: charset [#"_" #"0" - #"9" #"A" - #"Z" #"a" - #"z"] ; per regex

    ; Characters that always terminate a URI.
    space: charset "^/^- ()<>"

    ; regex 'punct without ()<> ("_" is covered by WORD).  Includes the
    ; double quote (written ^") and the backslash so that neither is treated
    ; as a URI body character; "^^" is a literal caret.
    punct: charset "!^"#$%&'`*+,-./:;=?@[\]^^{|}~"

    ; Anything that is neither a terminator nor punctuation.
    chars: complement union space punct

    uri: [
        ; scheme followed by ":/" or "://", or a bare "www." prefix
        [some [word | "-"] ":/" opt "/" | "www."]
        ; body: punctuation is accepted only when more body characters follow,
        ; so trailing punctuation is left out of the match
        some [opt [some punct] some chars opt "/"]
        ; optional trailing parenthesised word, e.g. ..._(disambiguation)
        opt [any punct "(" some word ")"]
    ]

    rule: use [emit-link emit-text link mk ex][
        ; MK marks where the pending run of plain text starts;
        ; EX marks where it ends (the position of the current URI attempt).
        emit-link: [(append out to-url link)]
        emit-text: [(unless mk = ex [append out copy/part mk ex])] ; skip empty runs

        [
            mk: any [
                ; a URI starts here: flush the text before it, then the link
                ex: copy link uri emit-text emit-link mk:
                ; a token that does not begin a URI and is followed by
                ; whitespace is consumed whole, so a URI embedded later in
                ; that same token is not examined
                | some [chars | punct] some space ; non-uri words, line not required
                ; otherwise advance one character and try again
                | skip
            ]
            ; flush whatever text remains after the last link
            ex: emit-text
        ]
    ]

    func [
        "Separates URLs from plain text"
        txt [string!] "Text to be scanned for URLs"
    ][
        out: copy []
        if parse/all txt rule [out]
    ]
]
halt ;; terminates the script here when it is DO-ne directly from the web page
Notes