Script Library: 1238 scripts
 

web-to-plain.r

Rebol[ Title: "web to plain" Author: "Tom Conlin" Version: 1.0.0 File: %web-to-plain.r Date: 14-Mar-2003 Purpose: { to translate htmlized text into plain text in one pass markdown? } Warning: { this does NOT handle ALL char-entites above 255 or most charsets ... thanks to Chris Ross-Gill for help with the chars below 160 and above 255 http://www.ross-gill.com/techniques/entities/ unreconized entities are passed as is } Example: { page: load/markup http://www.sandia.gov/sci_compute/iso_symbol.html buffer: copy "" foreach thing page[if not tag? thing[append buffer thing]] web-to-plain buffer probe buffer } Category: [web] library: [ level: 'intermediate platform: none type: 'function domain: [web markup internet HTML] tested-under: none support: none license: none see-also: none ] Usage: [ web-to-plain {old hat & new hat; red hat & blue hat &&foo;} ] ] web-to-plain: func [ content [string!] /local c-e keyset-a keyset-1 entity key value i j][ c-e: [ "euro" "" "sbquo" "fnof" "bdquo" "hellip" "dagger" "Dagger" "circ" "permil" "Scaron" "lsaquo" "OElig" "" "" "" "" "lsquo" "rsquo" "ldquo" "rdquo" "bull" "ndash" "mdash" "tilde" "trade" "scaron" "rsaquo" "oelig" "" "" "Yuml" "nbsp" "iexcl" "cent" "pound" "curren" "yen" "brvbar" "sect" "uml" "copy" "ordf" "laquo" "not" "shy" "reg" "macr" "deg" "plusmn" "sup2" "sup3" "acute" "micro" "para" "middot" "cedil" "sup1" "ordm" "raquo" "frac14" "frac12" "frac34" "iquest" "Agrave" "Aacute" "Acirc" "Atilde" "Auml" "Aring" "AElig" "Ccedil""Egrave" "Eacute" "Ecirc" "Euml" "Igrave" "Iacute" "Icirc" "Iuml" "ETH" "Ntilde" "Ograve" "Oacute" "Ocirc" "Otilde" "Ouml" "times" "Oslash" "Ugrave" "Uacute" "Ucirc" "Uuml" "Yacute" "THORN" "szlig" "agrave" "aacute" "acirc" "atilde" "auml" "aring" "aelig" "ccedil" "egrave" "eacute" "ecirc" "euml" "igrave" "iacute" "icirc" "iuml" "eth""ntilde" "ograve" "oacute" "ocirc" "otilde" "ouml" "divide" "oslash" "ugrave" "uacute" "ucirc" "uuml" "yacute" "thorn" "yuml"] exception-to-char: [ "gt" 62 "lt" 60 "amp" 38 "quot" 34 "ensp" 32 "emsp" 32 "sp" 32 "ldots" 133 "8364" 128 "8218" 130 "8222" 132 "8224" 134 "8225" 135 "710" 136 "8240" 137 "352" 138 "8249" 139 "338" 140 "8216" 145 "8217" 146 "8220" 147 "8221" 148 "8211" 150 "8212" 151 "732" 152 "8482" 153 "353" 154 "8250" 155 "339" 156 "376" 159] keyset: charset {abcdefghijklmnopqrstuvwxyz0123456789} parse content [ any [ (key: copy "" value: none) to "&" mark: [ [ "&" copy key [some keyset] ";" ( either value: select exception-to-char key [ remove/part next :mark (length? key) + 1 change :mark to-string to-char value ] [ if all[key <> "" j: find/case c-e key][ value: (index? j) + 127 remove/part next :mark (length? key) + 1 change :mark to-string to-char value ] ] mark: next :mark ):mark ]|["&#" copy key integer! ";" ( i: to-integer key if all[i > 255 j: select exception-to-char key][i: j] if all[i >= 0 i <= 255][ remove/part next :mark (length? key) + 2 change :mark to-string to-char i ] mark: next :mark ):mark ]| "&" ] ] ] content ]
halt ;; to terminate script if DO'ne from webpage