View in color | View documentation | License |
Download script | History | Other scripts by: peterwood · sunanda |
30-Apr 10:51 UTC
[0.065] 26.653k
[0.065] 26.653k
make-word-list.rREBOL[
Title: "make-word-list"
Version: 1.0.0
Date: 2-Apr-2007
Author: Peter W A Wood
Copyright: {Copyright© PWA Wood 2007}
File: %make-word-list.r
Purpose: {Makes a list of words from a string}
License: 'mit
Library: [
level: 'beginner
platform: [all]
type: [package function tool]
domain: [files markup database]
tested-under: [win mac]
support: none
license: [mit]
]
]
make-word-list: function
[
{Makes a list of words from a string}
config [object! none!]
{Configuration options to be used instead of the default ones}
content [string!]
{The content from which the words are to be extracted}
/for-search
{The requested word list is being built to perform a search}
][
word
{An individual word from the content}
word-list
{The list of words for indexing}
make-words-parse
{A object in which to store the parse rules}
][
make-words-parse: make object!
[
;; default bitsets
alpha: charset [#"a" - #"z" #"A" - #"Z"]
digit: charset [#"0" - #"9"]
alpha-or-digit: union alpha digit
letter: union alpha-or-digit charset ["~"]
no-character: charset []
hex-digit: union digit charset [#"a" - #"f" #"A" - #"F"]
hyphen: charset ["-_"]
word-start: alpha
word-letter: letter
word-end: charset ["!" "?"]
number: union digit charset [".,"]
number-prefix: charset ["+-£$¢"]
number-postfix: charset ["+-"]
not-prefix: "~"
;; other default settings
word-length: 1x40
ignore-tags: false
index-pairs: true
stop-list: ["a" "is" "the"]
word-list-length: 2000
months: [ "jan" "feb" "mar" "apr" "may" "jun"
"jul" "aug" "sep" "oct" "nov" "dec"
]
day-suffix: ["th" "st" "rd"]
;; apply supplied configuration changes
if object! = type? config
[
foreach item next first config
[
error? try
[set in self to word! item get in config to word! item]
]
]
;; convert stop-list to a hash
stop-list: to hash! stop-list
;; word definitions
a-word: [word-start any word-letter opt word-end]
a-hyphenated-word:
[a-word hyphen some word-letter any [hyphen | word-letter]]
either not-prefix
[
a-not-word: [not-prefix a-word]
][
a-not-word: no-character ;;effectively deactiviates the rule
]
a-tag: ["<" thru ">"]
a-closing-tag: ["</" thru ">"]
a-number: [opt number-prefix digit any number opt number-postfix]
a-hex-number: [[digit] any #"x" any hex-digit]
a-tuple: [some digit #"." some digit #"." some digit]
a-rebol-pair: [opt "-" some digit #"x" opt "-" some digit]
a-dotted-word: [word-start any word-letter #"."
word-start any [word-letter | #"."] ]
a-slashed-word: [word-start any word-letter #"/"
word-start any [word-letter | #"/"]]
a-back-slashed-word: [word-start any word-letter backslash
word-start any [word-letter | backslash]]
a-rebol-binary-string: ["#{" thru "}"]
a-rebol-base64-string: ["64#{" thru "}"]
a-rebol-debase: ["debase" any [" " | newline] "{" thru "}"]
an-html-escaped: ["&" thru ";"]
eg: ["e.g" opt "." | "E.G" opt "."]
a-scheme: [word-start any word-start "://" opt "www."]
;; specification of a web page address
a-web-page-address:
[
a-dotted-word #"/" word-start
any [word-letter | #"/"]
any [ #"." any word-letter]
]
;; specification of an email address
an-email-address: [word-start any [word-letter | #"."] #"@"
word-start any word-letter #"." any [word-letter | #"."]]
;; charsets to speed skipping over unwanted portions of the content
start-chars: copy [#"<" #"#"]
if not-prefix
[append start-chars to-char not-prefix]
if index-pairs
[append start-chars #"-"]
start-chars: charset start-chars
start-chars: union start-chars union word-start digit
not-start-char: complement start-chars
skip-to-next-possible-word: [some not-start-char]
;; the specification of a date
;; first build the day suffix list
either 0 = length? day-suffix
[
a-day-suffix: [""]
][
;; build the block of day-suffixes
a-day-suffix: copy []
foreach abbr day-suffix
[
append a-day-suffix abbr
append a-day-suffix to-word "|"
]
remove back tail a-day-suffix ;; remove extraneous | at the end
]
;; build a definition of month abbreviation
a-month-abbr: copy []
foreach month months
[
append a-month-abbr month
append a-month-abbr to-word "|"
]
remove back tail a-month-abbr ;; remove extraneous | at the end
a-date:
[
;; rebol date with alpha month
[ [1 2 digit] #"-" a-month-abbr [any letter] #"-" [2 4 digit] ]
|
;; rebol date with numeric month
[ [1 2 digit] #"-" [ 1 2 digit] #"-" [ 2 4 digit] ]
|
;; date with alpha month
[ [1 2 digit] #" " a-month-abbr [any letter] #" " [2 4 digit] ]
|
;; dd/mm/yyyy mm/dd/yyyy (numeric)
[ [1 2 digit] slash [1 2 digit] slash [1 4 digit] ]
|
;; yyyy/mm/dd (numeric)
[ [1 4 digit] slash [1 2 digit] slash [1 2 digit] ]
|
;; date of types 1st June 2000; 3rd June 2000 or 15th June
;; this rule will also add phrases such as 21st Century to the word list
[ [1 2 digit] a-day-suffix #" " [3 letter any letter]
any [#" " 2 4 digit] ]
]
;; a set of definitions and rule to try to create a rebol format date from
;; a date that meets the a-date definition
a-year-first-date: [ copy yy [4 digit] skip copy mm [1 2 digit] skip
copy dd [1 2 digit]
]
a-year-last-date: [
copy dd [1 2 digit] skip
[
[copy mm [3 letter] [any letter]] |
[copy mm [ 1 2 digit] ]
] skip
copy yy [2 4 digit]
]
a-day-suffix-date: [
copy dd [1 2 digit] a-day-suffix #" " copy mm [3 letter] [any letter]
#" " copy yy [2 4 digit]
]
make-rebol-date-rule: [
[a-year-first-date | a-year-last-date | a-day-suffix-date]
(
;; check to see if there is a match with alpha month abbr
if mmm: find months lowercase mm
[
;; convert the alpha to a month number
mm: index? mmm
]
if not error? try [mmm: to integer! mm] ;; genuine month?
[
if mmm > 12
[
;; swap dd and mm
swap: mm
mm: dd
dd: swap
]
;; try to create a Rebol date
date-str: join dd ["-" mm "-" yy]
if attempt [date-str: to date! date-str]
[
_xadd word-list lowercase to string! date-str
]
]
)
]
;; word-in-date-rule
word-in-date-rule:
[
any
[
[1 2 digit] a-day-suffix ;; this rule ignores 1st, 15th etc.
|
copy word-in-date a-word
(
_xadd word-list lowercase word-in-date
)
|
skip
]
]
;; date rule
date-rule:
[
copy date a-date
(
;; add the date to the list in its current form
_xadd word-list lowercase date
;; try to create a rebol date as well
parse/all date make-rebol-date-rule
;; add any words in the date
parse/all date word-in-date-rule
)
]
;; word-rule - add a genuine word to the word-list
word-rule:
[
copy word a-word
(
_xadd word-list lowercase word
)
]
;; not-word-rule - remove the "not prefix" from the start of a word
;; unless the for-search refinement is set
not-word-rule:
[
copy not-word [a-not-word]
(
if not for-search [remove not-word]
_xadd word-list lowercase not-word
)
]
;; tag rule
tag-rule:
[
copy word a-tag
(
if not ignore-tags
[
;; strip off <
remove word
;; strip off >
remove back tail word
;; now re-parse the remaining content
parse/all word rule
]
)
]
;; dotted-word rule - used for domain names and qualified variable names
dotted-word-rule:
[
copy dotted-word a-dotted-word
(
;; add each "level" of the dotted-word to the list
;; eg www.rebol.com will result in the following being added :
;; www.rebol.com
;; rebol.com
;; com
;; if the last character is a "." remove it
while
[
#"." = last dotted-word
]
[
remove back tail dotted-word
]
_add-word-hierarchy dotted-word "."
)
]
;; slashed-word rule - used for domain names and qualified variable names
slashed-word-rule:
[
copy slashed-word a-slashed-word
(
;; add each "level" of the slashed-word to the list
;; eg apps/rebolcore/rebol will result in the following being added :
;; apps/rebolcore/rebol
;; rebolcore/rebol
;; rebol
;; if the last character is a "/" remove it
while
[
#"/" = last slashed-word
]
[
remove back tail slashed-word
]
_add-word-hierarchy/with-individual-words slashed-word "/"
)
]
;; back-slashed-word rule - used for domain names and qualified variable names
back-slashed-word-rule:
[
copy back-slashed-word a-back-slashed-word
(
;; add each "level" of the back-slashed-word to the list
;; eg apps\rebolcore\rebol will result in the following being added :
;; apps\rebolcore\rebol
;; rebolcore\rebol
;; rebol
;; if the last character is a backslash remove it
while
[
backslash = last back-slashed-word
]
[
remove back tail back-slashed-word
]
_add-word-hierarchy/with-individual-words back-slashed-word backslash
)
]
;; hyphenated-word rule
;; add each level of the hyphenated word and all the sub-words (if they are
;; valid words)
hyphenated-word-rule:
[
copy hyphenated-word a-hyphenated-word
(
;; if the last characters are hyphens remove them
while
[
parse to string! last hyphenated-word [hyphen]
]
[
remove back tail hyphenated-word
]
;; use the first hyphen found as the separator
parse hyphenated-word
[
copy this-hyphen [a-word hyphen]
;; add the different levels of hierarchy and the individual words to
;; the list
(
this-hyphen: last this-hyphen
_add-word-hierarchy/with-individual-words hyphenated-word
this-hyphen
)
]
)
]
web-page-address-rule:
[
copy web-words a-web-page-address
(
;; add each "level" of the web-page to the list
;; eg www.rebol.com/downloads.html will result in
;; the following being added :
;; www.rebol.com/downloads.html
;; rebol.com
;; com
;; downloads
;; split off the domain name and process it
parse copy/part web-words find web-words "/" dotted-word-rule
remove/part web-words find/tail web-words "/"
;; extract valid words from the remainder ingnoring part after dot
parse web-words
[
any
[
copy sub-word a-word
(
_xadd word-list lowercase copy sub-word
)
|
"/"
|
["." a-word]
]
]
)
]
;; Rebol pair rule
;; include rebol pairs in the wordlist if the index-pairs is true in the
;; configuration object
either index-pairs
[
rebol-pair-rule:
[
;; add rebol-pairs to word-list
copy rp a-rebol-pair
(
parse/all rp
[
copy reb-pair a-rebol-pair
(_xadd word-list reb-pair)
]
)
]
][
;; ignore rebol-pairs
rebol-pair-rule: no-character ;; nothing can be selected
]
;; prefix-word-rule - strip of specified prefix and the word to word-list
prefix-word-rule:
[
copy prefix-word a-prefix-word
(
_xadd word-list lowercase next prefix-word
)
]
;; e.g. rule - adds e.g. and e.g. to the words-list
eg-rule:
[
copy eg-word eg
(
_xadd word-list lowercase eg-word
)
]
;; rebol debase rule - adds the word debase and ignores the binary string
rebol-debase-rule:
[
a-rebol-debase
(
_xadd word-list "debase"
)
]
;; the main rule
rule:
[
any
[
skip-to-next-possible-word
|
an-email-address ;; ignore email adresses
|
hyphenated-word-rule
|
a-scheme ;; ignore http:// etc
|
web-page-address-rule
|
eg-rule
|
dotted-word-rule
|
slashed-word-rule
|
back-slashed-word-rule
|
rebol-debase-rule
|
word-rule
|
date-rule
|
rebol-pair-rule
|
a-rebol-base64-string ;; ignore Rebol base64
|
a-rebol-binary-string ;; ignore Rebol binary
|
a-tuple ;; ignore tuples
|
[digit a-word] ;; ignore words starting
| ;; with a numeric digit
a-hex-number ;; ignore hex numbers
|
a-number ;; ignore complete numbers
|
a-closing-tag ;; ignore closing tags
|
not-word-rule
|
tag-rule
|
an-html-escaped ;; ignore "escaped" html
|
skip
]
]
_xadd: function
[
{exclusive add - add an element if the block doesn't already contain it}
hsh [hash!]
element
][
len ;; the length of the element
][
;; don't add:
if any
[
;; duplicates
find hsh element
;; words in the stop-list
find stop-list element
;; words greater than the maximum length required
(len: length? element) > second word-length
;; words shorter than the minimum length required
len < first word-length
][
return
]
;;add the element
insert hsh element
]
_add-word-hierarchy: function
[
{adds the words from a hierarchy of words, each level being identified by
a specified separator. E.g. www.rebol.com will result in the following
being added to the word-list:
;; www.rebol.com
;; rebol.com
;; com
}
word-hier [string!]
separator [char! string!]
/with-individual-words
{stores any valid individual word in the hierarchy}
][
ind-word
"An individual word in the hierarchy"
][
_xadd word-list copy lowercase word-hier
until
[
if with-individual-words
[
parse word-hier
[
copy ind-word a-word
(_xadd word-list lowercase ind-word)
]
]
;; strip off the first part of the hierarchy
remove/part word-hier find/tail word-hier separator
;; add the hierarchy if it's first char is a letter
if true = parse to string! first word-hier [word-start]
[
_xadd word-list copy lowercase word-hier
]
none = find word-hier separator
]
]
] ;; end make-words-parse
;; create the ouput block
word-list: make hash! make-words-parse/word-list-length
;; parse the content
parse/all content make-words-parse/rule
;; finally sort the list
return sort to-block word-list
] Notes
|