View in color | License | Download script | History | Other scripts by: tomc |
15-Sep 7:37 UTC
[0.045] 13.217k
[0.045] 13.217k
web-to-plain.rRebol[
Title: "web to plain"
Author: "Tom Conlin"
Version: 1.0.0
File: %web-to-plain.r
Date: 14-Mar-2003
Purpose: { to translate htmlized text into plain text in one pass
markdown?
}
Warning: { this does NOT handle ALL char-entites above 255
or most charsets ... thanks to Chris Ross-Gill
for help with the chars below 160 and above 255
http://www.ross-gill.com/techniques/entities/
unreconized entities are passed as is
}
Example: {
page: load/markup http://www.sandia.gov/sci_compute/iso_symbol.html
buffer: copy "" foreach thing page[if not tag? thing[append buffer thing]]
web-to-plain buffer
probe buffer
}
Category: [web]
library: [
level: 'intermediate
platform: none
type: 'function
domain: [web markup internet HTML]
tested-under: none
support: none
license: none
see-also: none
]
Usage: [
web-to-plain {old hat & new hat; red hat & blue hat &&foo;}
]
]
web-to-plain: func [ content [string!]
/local c-e keyset-a keyset-1 entity key value i j][
c-e: [
"euro" "" "sbquo" "fnof" "bdquo" "hellip" "dagger" "Dagger" "circ" "permil"
"Scaron" "lsaquo" "OElig" "" "" "" "" "lsquo" "rsquo" "ldquo" "rdquo" "bull"
"ndash" "mdash" "tilde" "trade" "scaron" "rsaquo" "oelig" "" "" "Yuml"
"nbsp" "iexcl" "cent" "pound" "curren" "yen" "brvbar" "sect" "uml" "copy"
"ordf" "laquo" "not" "shy" "reg" "macr" "deg" "plusmn" "sup2" "sup3" "acute"
"micro" "para" "middot" "cedil" "sup1" "ordm" "raquo" "frac14" "frac12"
"frac34" "iquest" "Agrave" "Aacute" "Acirc" "Atilde" "Auml" "Aring" "AElig"
"Ccedil""Egrave" "Eacute" "Ecirc" "Euml" "Igrave" "Iacute" "Icirc" "Iuml"
"ETH" "Ntilde" "Ograve" "Oacute" "Ocirc" "Otilde" "Ouml" "times" "Oslash"
"Ugrave" "Uacute" "Ucirc" "Uuml" "Yacute" "THORN" "szlig" "agrave" "aacute"
"acirc" "atilde" "auml" "aring" "aelig" "ccedil" "egrave" "eacute" "ecirc"
"euml" "igrave" "iacute" "icirc" "iuml" "eth""ntilde" "ograve" "oacute"
"ocirc" "otilde" "ouml" "divide" "oslash" "ugrave" "uacute" "ucirc" "uuml"
"yacute" "thorn" "yuml"]
exception-to-char: [
"gt" 62 "lt" 60 "amp" 38 "quot" 34
"ensp" 32 "emsp" 32 "sp" 32 "ldots" 133
"8364" 128 "8218" 130 "8222" 132 "8224" 134 "8225" 135 "710" 136 "8240" 137
"352" 138 "8249" 139 "338" 140 "8216" 145 "8217" 146 "8220" 147 "8221" 148
"8211" 150 "8212" 151 "732" 152 "8482" 153 "353" 154 "8250" 155 "339" 156
"376" 159]
keyset: charset {abcdefghijklmnopqrstuvwxyz0123456789}
parse content [
any [ (key: copy "" value: none)
to "&" mark: [
[ "&" copy key [some keyset] ";"
( either value: select exception-to-char key
[ remove/part next :mark (length? key) + 1
change :mark to-string to-char value
]
[ if all[key <> "" j: find/case c-e key][
value: (index? j) + 127
remove/part next :mark (length? key) + 1
change :mark to-string to-char value
]
]
mark: next :mark
):mark
]|["" copy key integer! ";"
( i: to-integer key
if all[i > 255 j: select exception-to-char key][i: j]
if all[i >= 0 i <= 255][
remove/part next :mark (length? key) + 2
change :mark to-string to-char i
]
mark: next :mark
):mark
]| "&"
]
]
]
content
] |