View in color | View documentation | License |
Download script | History | Other scripts by: sunanda |
30-Apr 14:49 UTC
[0.22] 50.429k
[0.22] 50.429k
skimp.rrebol [
title: "SKIMP: Simple keyword index management program"
author: "Sunanda"
date: 23-apr-2007
purpose: "Simple, fast way of indexing the text content of many documents"
version: 0.0.2
file: %skimp.r
license: 'mit
history: [
[0.0.0 11-aug-2005 "Written"]
[0.0.1 3-apr-2007 "Modified to use rse-ids.r and make-word-list.r"]
[0.0.2 23-apr-2007 "Add flush-cache and flush-cache-all"]
]
library: [
level: 'intermediate
platform: [all]
type: [function tool package]
domain: [files markup database]
tested-under: [win unix mac]
support: none
license: [mit]
see-also: [%rse-ids.r %make-word-list.r %skimp-tools.r]
]
]
if not value? 'rse-ids [do %rse-ids.r] ;; Load rse-ids if not already there
skimp: make object!
[
;; =====================================
;; = Global settings =
;; = --------------- =
;; = Change the magic values here to =
;; = alter skimp's over all settings =
;; = *** =
;; = (to change behavior for a =
;; = specific index, use =
;; = skimp/set-config ) =
;; = ===================================
index-name-prefix: "" ;; no prefix by default
index-name-suffix: ".sif" ;; skimp index file
;; ===================================
;; = Public functions =
;; = ---------------- =
;; = Call any or all of these to =
;; = build or manage your index =
;; ===================================
;; ========================================
index-exists?: func [
index-name [file!]
][
;; An index exists:
;; * if it is in the cache
;; (whether or not it has been ever
;; written)
;; * Or the index header file exists
;; on permanent storage.
if find index-cache index-name [return true]
return exists? to-file rejoin [index-name-prefix index-name index-name-suffix]
]
;; ========================================
get-index-information: func [
index-name [file!]
/document-list
/local
index-info
] [
if not index-exists? index-name [return none] ;; no such index
index-info: make object! [
index-file: index-name
top-index: copy []
owner-data: none
config: none
word-parameters: none
make-word-list: none
]
_read-index-file index-name
index-info/config: first reduce load/all mold cif/config
index-info/owner-data: first reduce load/all mold cif/owner-data
;; Document list
;; -------------
;; Remembering to remove the "0" place-holders
if document-list [
index-info: make index-info [document-list: copy []]
append index-info/document-list unique sort cif/document-list
if find index-info/document-list 0 [alter index-info/document-list 0]
]
;; First letter list
;; -----------------
;; Create a block with all the
;; first letters of words indexed
if 0 <> length? cif/word-index-block [
foreach char cif/word-index-block/1 [
append index-info/top-index to-string char
]
sort index-info/top-index
]
;; Word parameter details
;; ----------------------
index-info/word-parameters: first reduce load/all mold cif/word-parameters
index-info/make-word-list: get in cif 'make-word-list
return index-info
]
;; ========================================
get-indexed-words: func [
index-name [file!]
index-char [char! string!]
/document-list
/local
indexed-words
current-word
index-entry
] [
if not index-exists? index-name [return none] ;; no such index
if not string? index-char [index-char: to-string index-char]
if 1 <> length? index-char [return none] ;; only one letter allowed
indexed-words: copy []
_read-index-file index-name
if 0 = length? cif/word-index-block [return none] ;; no words indexed at all
current-word: copy index-char
lowercase current-word
index-entry: find cif/word-index-block/1 current-word
if not index-entry [return none] ;; no words with that one letter
index-entry: 1 + index? index-entry
_haul-in-entry cif/word-index-block index-entry current-word 1
append indexed-words _extract-words current-word cif/word-index-block/:index-entry 1 document-list
either document-list [
sort/skip indexed-words 2
] [
sort indexed-words
]
return indexed-words
]
;; ========================================
get-indexed-words-for-document: func [
index-name [file!]
document-name [string! integer!]
index-char [char! string!]
/local
doc-id
current-word
index-entry
] [
if not index-exists? index-name [return none] ;; no such index
if not string? index-char [index-char: to-string index-char]
if 1 <> length? index-char [return none] ;; only one letter allowed
indexed-words: copy []
_read-index-file index-name
if 0 = length? cif/word-index-block [return none] ;; no words indexed at all
current-word: copy index-char
lowercase current-word
index-entry: find cif/word-index-block/1 current-word
if not index-entry [return none] ;; no words with that one letter
doc-id: _make-document-id/check document-name
if not doc-id [return none] ;; document is not in index
index-entry: 1 + index? index-entry
_haul-in-entry cif/word-index-block index-entry current-word 1
append indexed-words _extract-words-for-document current-word cif/word-index-block/:index-entry 1 doc-id
sort indexed-words
return indexed-words
]
;; ========================================
get-indexed-document-names: func [
index-name [file!]
/local
doc-list
][
if not index-exists? index-name [return none] ;; no such index
_read-index-file index-name
doc-list: copy cif/document-list
;; Integer doc-names: simply
;; decompact the list
;; -------------------------
if cif/config/integer-document-names [
return _unpack-numset doc-list
]
;; string doc-names: weed
;; out the zeroes first
;; ----------------------
doc-list: unique sort doc-list
if find doc-list 0 [alter doc-list 0]
return doc-list
]
;; ========================================
write-cache: func [
index-name [file!]
/flush
/local
pointer
] [
;; Check we've got it first
;; ------------------------
pointer: find index-cache index-name
if none? pointer [return false] ;; not in memory, so nothing to do
cif: first next pointer ;; make current item
_write-index-file index-name
if flush [
remove pointer
remove pointer
cif: none
]
return true ;; written back okay
]
;; ========================================
write-cache-all: func [
/flush
] [
;; Writes *all* the indexes in the cache
;; -------------------------------------
if flush [
while [0 <> length? index-cache] [
write-cache/flush index-cache/1
]
cif: none
recycle
return true
]
foreach [index-name index-file] index-cache [
write-cache index-name
]
return true
]
;; ========================================
flush-cache: func [
index-name [file!]
/flush
/local
pointer
] [
;; Check we've got it first
;; ------------------------
pointer: find index-cache index-name
if none? pointer [return false] ;; not in memory, so nothing to do
cif: first next pointer ;; make current item
remove pointer
remove pointer
cif: none
return true ;; It's been flushed
]
;; ========================================
flush-cache-all: func [
/flush
][
;; Flushes *all* the indexes in the cache
;; --------------------------------------
cif: none
recycle
return true
]
;; ========================================
set-config: func [
index-name [file!]
con-set [object!]
/defer
/local
] [
;; Can only use if no words have been indexed
;; ------------------------------------------
_read-index-file index-name
if any [
0 <> length? cif/document-list
0 <> length? cif/word-index-block
] [
make error! "SKIMP/set-config: cannot use on existing index"
halt
]
;; =====================
;; TODO: ADD VALIDATION
;; =====================
foreach item next first con-set [
error? try
[set in cif/config to-word item get in con-set to-word item]
]
;; Set to working values, if crazy settings
;; ----------------------------------------
;; We'll do this until we add some proper validation
if not integer? cif/config/index-levels [
cif/config/index-levels: 3
]
cif/config/index-levels: maximum 1 cif/config/index-levels
if not logic? cif/config/integer-document-names [
cif/config/integer-document-names: false
]
if not logic? cif/config/one-file [
cif/config/one-file: false
]
if not defer [
_write-index-file index-name
]
return make object! third cif/config ;; ensure it's a copy not an updatable original
]
;; ========================================
set-owner-data: func [
index-name [file!]
owner-data-obj [object!]
/defer
/local
] [
_read-index-file index-name
cif/owner-data: first reduce load/all mold owner-data-obj
if not defer [
_write-index-file index-name
]
return true
]
;; ======================================
set-word-definition: func [
index-name [file!]
/parameters parm-obj [object!]
/make-word-list mwl [function! none! word!]
/defer
/local
update-needed
][
;; set-word-defintions
;; ===================
;; Define or override the default
;; expectations of what a "word"
;; is.
;; Generally, this should be done
;; before adding any words to an index --
;; otherwise you may add things you cannot
;; later easily query.
update-needed: false
_read-index-file index-name
if parameters [
cif/word-parameters: make cif/word-parameters third parm-obj
update-needed: true
]
if make-word-list [
cif/make-word-list: mwl
either word? mwl [
cif/make-word-list: get mwl
][
if none? cif/make-word-list [ ;; minimal make-word-list function
cif/make-word-list: get in skimp '_minimal-make-word-list
]
]
update-needed: true
]
if all [update-needed not defer] [
_write-index-file index-name ;; update the index
]
return true
]
;; ======================================
find-word: func [
index-name [file!]
target [string!]
/entry
/local
wb
show-word
tags
short-tree
res
invert-needed?
] [
invert-needed?: false
if target/1 = #"~" [
invert-needed?: true ;; we're doing a NOT search
target: skip target 1 ;; slip past the tilde
]
if 0 = length? target [return copy []] ;; find nothing
_read-index-file index-name
wb: cif/word-index-block
set [short-word tags] _make-tags target
for nn 1 length? tags 1 [
set [short-tree wb] _find-tree-entry wb tags nn
if short-tree [break]
if none? wb [break]
]
if none? wb [;; word not found
return _map-to-doc-names _invert-list invert-needed? copy []
]
res: select wb short-word
if not block? res [res: reduce [res]]
if entry [return _map-to-doc-names _invert-list invert-needed? res]
if none? res [return _map-to-doc-names _invert-list invert-needed? copy []] ;; word not found
return _map-to-doc-names _invert-list invert-needed? res
]
;; ==========================================
find-words: func [
index-name [file!]
targets [block!]
/local
hits
seq
] [
;; -------------------------------
;; Finds all the words, assumed to
;; be connected by an AND
;; --------------------------------
hits: copy []
seq: 0
foreach target targets [
seq: seq + 1
hits: _and-results hits
find-word index-name target
seq
]
return unique hits
]
;; ==================================
remove-document: func [
index-name [file!]
document-name [string!]
/defer ;; don't write back
/local
documents
] [
;; Just a courtesy wrapper for remove documents
;; --------------------------------------------
documents: copy []
append documents document-name
either defer [
remove-documents/defer index-name documents
][
remove-documents index-name documents
]
return true
]
;; ==================================
remove-documents: func [
index-name [file!]
document-name-list [block!]
/defer ;; don't write back
/local
temp
doc-id-list
starting-word-index
] [
_read-index-file index-name
doc-id-list: copy []
foreach doc-name document-name-list [
if _make-document-id/check doc-name [
append doc-id-list _make-document-id doc-name
]
]
doc-id-list: sort unique doc-id-list
if 0 = length? doc-id-list [return true] ;; we don't have those document-names on file
;; so there is nothing to remove
if 0 = length? cif/word-index-block [ ;; no words indexed at all ...
;; ... not sure this can happen
foreach doc-name doc-id-list [
_remove-document-id doc-name
]
return true
]
;; Capture starting word index
;; ---------------------------
starting-word-index: copy cif/word-index-block/1
;; Make sure all top levels are hauled in
;; --------------------------------------
for nn 1 length? cif/word-index-block/1 1 [
_haul-in-entry cif/word-index-block nn + 1 to-string cif/word-index-block/1/:nn 1
]
_remove-level cif/word-index-block 1 doc-id-list
foreach doc-name document-name-list [
_remove-document-id doc-name
]
;; design oversight here: we don't know
;; which of the subfiles we removed words
;; from....So set things so they are
;; all updated
foreach letter unique join starting-word-index cif/word-index-block/1 [
letter: to-string letter
if not find cif/admin/dirty-tags letter [alter cif/admin/dirty-tags letter]
]
if not defer [_write-index-file index-name]
return true
]
;; ==================================
remove-index: func [
index-name [file!]
][
;; Remove index
;; ============
;; Deletes an index entirely.
;; We're using a lazy/slow
;; method:
;; 1. set all index files to empty
;; 2. write the cache -- that will
;; delete all files except the header
;; 3. delete the header file
if not index-exists? index-name [return true] ;; no such index
_read-index-file index-name
if not 0 = length? cif/word-index-block [
foreach tag cif/word-index-block/1 [
append cif/admin/dirty-tags to-string tag
]
cif/admin/dirty-tags: unique cif/admin/dirty-tags
cif/word-index-block/1: copy ""
write-cache/flush index-name
]
delete to-file rejoin [index-name-prefix index-name index-name-suffix]
return true
]
;; ==================================
extract-words-from-string: func [
index-name [file!]
words [string! block!]
/for-search
][
;; extract-words-from-string
;; =========================
;; An interface to the make-word-list
;; function saved in the index header
;; that changes strings into
;; a block of words.
;; Useful for testing, especially
;; if you write your own function
if block? words [return words] ;; nothing to do
_read-index-file index-name
either for-search [
return do [cif/make-word-list/for-search cif/word-parameters words]
][
return do [cif/make-word-list cif/word-parameters words]
]
]
;; ==================================
add-words: func [
index-name [file!]
document-name [string! integer!]
words [block! string!]
/defer ;; no index update at end
/local
add-list
e-id
new-word? ;; new word indexed for this document
] [
_read-index-file index-name
;; Get the unique list of words
;; that we are going to add
;; ----------------------------
either block? words [
add-list: copy words ;; block: their words are taken literally
][
add-list: copy []
append add-list do [cif/make-word-list cif/word-parameters words]
]
add-list: unique add-list
if find add-list copy "" [alter add-list copy ""] ;; can't index the null string
if 0 = length? add-list [return true] ;; nothing to do
;; Let's go and add them
;; ---------------------
doc-id: _make-document-id document-name
foreach w add-list [
_add-a-word w doc-id
]
if not defer [_write-index-file index-name]
return true
]
;; ==================================
add-bulk-words: func [
index-name [file!]
data-block [block!]
/defer
/local
all-words
inverted-list
new-word? ;; new word indexed for this document
temp
doc-ids
db
] [
_read-index-file index-name
;; ensure all documents are in word format
;; ----------------------------------------
;; ie ["doc-1" "this are my words" ...] becomes
;; ["doc-1" ["these" "are" "my" "words"] ...]
for nn 1 length? data-block 2 [
if not block? pick data-block nn + 2 [
poke data-block nn + 1 do [cif/make-word-list cif/word-parameters pick data-block nn + 1]
]
]
;; Strip out any empty word lists
;; ------------------------------
db: make block! length? data-block
foreach [file block] data-block [
if 0 <> length? block [
append db file
append/only db block
]
]
;; Ensure each block entry is unique
;; --------------------------------
temp: 0
all-words: copy []
for nn 2 length? db 2 [
poke db nn sort unique db/:nn
temp: temp + length? db/:nn
append all-words db/:nn
]
;; Convert document names to their doc-ids
;; ---------------------------------------
doc-ids: make block! length? db ;; could be half the length
foreach [document-name words] db [
append doc-ids _make-document-id document-name
]
all-words: unique sort all-words
;; Now invert the list
;; -------------------
inverted-list: make block! (2 * length? all-words)
foreach w all-words [
temp: copy []
append inverted-list form w
for nn 2 length? db 2 [
if w = db/:nn/1 [
insert temp pick doc-ids (nn / 2) ;; document id
poke db nn next db/:nn
]
]
append/only inverted-list temp
]
;; Sort inverted list so common words are first
;; --------------------------------------------
;; error? try [;; fails on some earlier versions
;; sort/skip/all/compare inverted-list 2
;; func [a b] [
;; if (length? a/2) > length? b/2 [return -1]
;; if (length? a/2) < length? b/2 [return +1]
;; if a/1 < b/1 [return 1]
;; if a/1 > b/1 [return -1]
;; return 0 ;; how did we get here?
;; ]
;; ]
foreach [word document-list] inverted-list [
if 0 <> length? word [;; can't index the null word
_add-a-word word document-list
]
]
if not defer [_write-index-file index-name]
return true
]
;; ===================================
;; = Private stuff =
;; = ---------------- =
;; = DON'T call or mess with any of =
;; = these, unless you are doing =
;; = your own development =
;; ===================================
;; ----------
;; data areas
;; ----------
cif: none ;; current index file
current-file-name: none
index-cache: copy []
;; =======================================================
_read-index-file: func [
index-name [file!]
][
;; -------------------------------
;; Results in the index file being
;; in cif.
;; If the index file does not
;; exist, a new one is created.
;; -------------------------------
current-file-name: copy index-name
;; Read from cache, if possible
;; ----------------------------
cif: select index-cache index-name
if not none? cif [return true]
;; Read from file, if possible
;; ---------------------------
if not error? try [
cif: first reduce load/all decompress
read/binary to-file rejoin [index-name-prefix index-name index-name-suffix]
][
insert index-cache cif
insert index-cache index-name
return true
]
;; Create a new one
;; ----------------
;; Note we don't write the empty file --
;; that's done after the first update
;; operation, or when a user use a
;; write-cache[-all]/flush operation.
cif: make object! [
owner-data: make object! []
admin: make object! [
last-updated: now/precise
dirty-tags: copy [] ;; for multiple files: parts that have changed
]
config: make object! [
index-levels: 3 ;; number of higher-level index layers
;; (there will always be one bottom-level index layer)
integer-document-names: false ;; ie they are strings
one-file: false ;; ie is multiple files
]
word-parameters: make object! [
alpha: charset [#"a" - #"z" #"A" - #"Z"]
digit: charset [#"0" - #"9"]
initial-letter: alpha
letter: union alpha union digit charset ["~"]
number: union digit charset [".,"]
number-prefix: charset ["+-£$¢"]
number-postfix: charset ["+-"]
word-length: 1x40
not-prefix: "~"
stop-list: []
ignore-tags: false
index-pairs: true
final-letter: charset ["!" "?"]
hyphen: charset ["-_"]
]
make-word-list: get-make-word-list
document-list: copy [] ;; document-ids
word-index-block: copy [] ;; indexed words / highest level index
]
insert index-cache cif
insert index-cache index-name
return true
] ;; func
;; =======================================================
_write-index-file: func [
index-name [file!]
/local
pointer
temp-cif
nn
file-name
][
cif: select index-cache index-name
if none? cif [return false] ;; file is not current, so nothing to write
cif/admin/last-updated: now/precise
;; add file if not in cache
;; ------------------------
pointer: find index-cache index-name
if none? pointer [
insert index-cache cif
insert index-cache index-name
return true
]
;; Write it -- as a single file
;; ----------------------------
if cif/config/one-file [
cif/admin/dirty-tags: copy []
write/binary to-file rejoin [index-name-prefix index-name index-name-suffix]
compress mold cif
return true
]
;; Write it -- as a set of files
;; -----------------------------
;; Empty file?
;; -----------
if 0 = length? cif/word-index-block [
write/binary to-file rejoin [index-name-prefix index-name index-name-suffix]
compress mold cif
return true
]
;; Build a CIF that has everything except the
;; word-index-block (we don't do a straight object
;; clone on account of the potential size of
;; the thing)
temp-cif: make object! []
foreach entry next first cif [
if entry <> 'word-index-block [
temp-cif: make temp-cif reduce [to-set-word entry none]
set in temp-cif entry get in cif entry
]
temp-cif: make temp-cif [word-index-block: none]
]
temp-cif/word-index-block: copy []
append temp-cif/word-index-block cif/word-index-block/1 ;; node index
for nn 1 length? cif/word-index-block/1 1 [
append temp-cif/word-index-block none] ;; entries
if _has-leaf-node cif/word-index-block [
append/only temp-cif/word-index-block last cif-word-index-block/1] ;; node, if there is one (shouldn't be at this level)
;; Write first-character files
;; ---------------------------
foreach tag cif/admin/dirty-tags [
file-name: to-file rejoin [index-name-prefix index-name "-" to-integer tag/1 index-name-suffix]
either nn: find cif/word-index-block/1 tag [
nn: 1 + index? nn
write/binary file-name compress mold cif/word-index-block/:nn
] [
error? try [delete file-name]
]
]
temp-cif/admin/dirty-tags: copy []
cif/admin/dirty-tags: copy []
write/binary to-file rejoin [index-name-prefix index-name index-name-suffix]
compress mold temp-cif
return true
] ;; func
;; =================================================
_make-document-id: func [
document-name [string! integer!]
/check
/local
doc-id
find-res
] [
;; ---------------------------
;; Returns the position of the
;; document name in the cif
;; ----------------------------
if cif/config/integer-document-names [
;; This is simple: we use the user's value directly
;; ------------------------------------------------
;; Though it must be a positive integer, or
;; things later will fail very badly
if any [
not integer? document-name
document-name < 1
][
make error! rejoin ["SKIMP: id must be a positive integer..." document-name]
halt
]
find-res: _find-packed cif/document-list document-name
if all [check find-res] [return true] ;; it exists already
if find-res [return document-name] ;; already exists
if check [return false] ;; does not exist & is just a check
;; create a new entry
;; ------------------
doc-id: document-name
_insert-packed cif/document-list document-name
return doc-id
]
;; Need (perhaps) to make a new one
;; --------------------------------
doc-id: find cif/document-list form document-name
if not none? doc-id [
return index? doc-id ;; simple: it exists already
]
;; Just a check on existence?
;; --------------------------
if check [return none] ;; not an existing document
;; document doesn't exist ....
;; --------------------------
;; reuse an empty slot
;; ------------------
if doc-id: find cif/document-list 0 [
doc-id/1: form document-name
return index? doc-id
]
;; Let's make a new one
;; --------------------
append cif/document-list form document-name ;; document-name
return length? cif/document-list
]
;; =================================================
_remove-document-id: func [document-name [string! integer!]
/local
pointer
] [
;; ---------------------------
;; Deletes an existing
;; document name in the cif
;; ----------------------------
;; Deal with external-doc-id
;; ----------------------
if cif/config/integer-document-names [
if find cif/document-list document-name [
alter cif/document-list document-name
]
return true
]
;; Deal with internal-doc-id
;; ----------------------
pointer: find cif/document-list form document-name
if none? pointer [;; doesn't exist already (shouldn't happen)
return true
]
pointer/1: 0 ;; doesn't exist any more
;; trim trailing zeroes
;; --------------------
;; trailing zeroes just waste space
;; on the document list while giving
;; us nothing of any use.
while [
all [0 <> length? cif/document-list
0 = last cif/document-list
]
] [
remove back tail cif/document-list
]
return true
]
;; ==============================================
_make-word-block: func [word [string!]
/local wb
initial
initial-offset
wl-entry
tags
short-word
short-tree
] [
wb: cif/word-index-block
set [short-word tags] _make-tags word
for nn 1 length? tags 1 [
set [short-tree wb] _make-tree-entry wb tags nn
if short-tree [break]
if none? tags/:nn [break]
]
return reduce [short-word tags wb]
]
;; ==========================================
_make-tags: func [word [string!]
/local
tags
tag
shortened-word
] [
;; --------------------------------
;; The config/index-levels define a tree
;; structure. Means we don't have to
;; store those letters as they are
;; can be recovered.
;; eg if the config/index-levels is 3:
;; It means take the first letter,
;; three times. So "AMEND" is
;; stored in a tree three deep,
;; and we only need to store the
;; "ND"
;; A --> M --> E ["ND" [1x34]]
tags: copy []
shortened-word: copy word
for nn 1 cif/config/index-levels 1 [
tag: none
error? try [tag: first shortened-word]
shortened-word: copy next shortened-word
if not none? tag [
tag: to-string tag
]
append tags tag
]
return reduce [shortened-word tags]
]
;; ==========================================
_has-leaf-node: func [wb [block!]
] [
if 0 = length? wb [return false]
return (2 + length? wb/1) = length? wb
]
;; ==========================================
_find-tree-entry: func [
wb [block!]
tags [block!]
nn [integer!]
/local
ti
target
] [
if 0 = length? wb [return reduce [true none]]
ti: find wb/1 tags/:nn
if not none? ti [
_haul-in-entry wb (1 + index? ti) tags/1 nn
return reduce [false first at wb (1 + index? ti)] ;; return existing entry
]
if none? tags/:nn [
if not _has-leaf-node wb [
return reduce [true none]
]
return reduce [true first at wb length? wb] ;; return existing leaf entry
]
return reduce [true none]
]
;; ==========================================
_haul-in-entry: func [
wb [block!]
offset [integer!]
tag [string!]
level [integer!]
/local
index-segment
] [
;; ------------------------------
;; If the top level of an index
;; is not in memory, haul it in
;; ----------------------------
if any [level <> 1 ;; only haul top levels
'none <> wb/:offset ;; and only of none
] [
return true ;; nothing to do
]
index-segment: first reduce
load/all decompress
read/binary to-file rejoin [index-name-prefix current-file-name "-" to-integer tag/1 index-name-suffix]
poke wb offset index-segment
return true
]
;; ==========================================
_make-tree-entry: func [
wb [block!]
tags [block!]
nn [integer!]
/local
ti
] [
if 0 = length? wb [insert wb copy ""] ;; null level index
ti: find wb/1 tags/:nn
if not none? ti [
_haul-in-entry wb (1 + index? ti) tags/1 nn
return reduce [false first at wb (1 + index? ti)] ;; return existing entry
]
if none? tags/:nn [
if not _has-leaf-node wb
[
append/only wb copy []
]
return reduce [true first at wb length? wb] ;; return existing leaf entry
]
;; New level tag
;; -------------
append wb/1 tags/:nn
insert/only at wb 1 + length? wb/1 copy []
return reduce [false first at wb 1 + length? wb/1]
]
;; ==============================================
_add-a-word: func [
word [string!]
doc-id [integer! block!]
/local
w-block ;; word block
w-entry ;; word entry
short-word
temp
unpacked-doc-ids
res
tags
] [
set [short-word tags w-block] _make-word-block lowercase word
;; We got the word block, ie w-block
;; is positioned at something like:
;; ["mile" [1 7 5] "mite" [668 433] "mitten" 55 ]
;; set the index entry as dirty, so it will be saved later
;; -------------------------------------------------------
if not find cif/admin/dirty-tags tags/1 [alter cif/admin/dirty-tags tags/1]
;; Now we find the entry for the word we want
;; ------------------------------------------
word-entry: find/skip w-block short-word 2
if none? word-entry [
;; The entry we want does not exist
;; --------------------------------
;; We are either adding a single integer
;; for a new word, or a block of
;; integers:
;; "snickers" 45
;; "snickers" [30 12 49 26]
append w-block short-word
temp: copy []
foreach e either block? doc-id [doc-id] [reduce [doc-id]] [_insert-packed temp e]
append/only w-block temp
return true ;; we've added a new word
]
;; We are now pointing at an existing word
;; entry, eg:
;;
;; ["snickers" [1 3x6 8x19]]
temp: copy []
foreach e either block? doc-id [doc-id] [reduce [doc-id]] [_insert-packed word-entry/2 e]
return true ;; we've added a new word
]
;; ==========================================
_map-to-doc-names: func [
doc-ids [block! integer! pair!]
/local
ent-block
] [
if not block? doc-ids
[doc-ids: reduce [doc-ids]]
doc-ids: _unpack-numset doc-ids
if cif/config/integer-document-names [return doc-ids]
ent-block: make block! length? doc-ids
foreach e doc-ids [
insert ent-block pick cif/document-list e
]
return ent-block
]
;; =======================================
_unpack-numset: func [
doc-id-block [block!]
] [
return rse-ids/decompact doc-id-block
]
;; ==================================
_pack-numset: func [doc-id-block [block!]
] [
return rse-ids/compact doc-id-block
]
;; ===============================================
_find-packed: func [
blk [block!]
target [integer!]
] [
return rse-ids/find-compact blk target
]
;; ===================================================
_insert-packed: func [
blk [block!]
new-entry [integer!]
] [
return rse-ids/insert-compact blk new-entry
]
;; ==================================
_remove-packed: func [
entry [block!]
doc-id [integer!]
/local
new-wb
] [
rse-ids/remove-compact entry doc-id
return entry
]
;; ===========================================
get-make-word-list: func [
/local
][
;; Supply a minimal dummy function if we
;; have no access to make-word-list.r function
if not exists? %make-word-list.r [
return get in skimp '_minimal-make-word-list
]
do %make-word-list.r
return :make-word-list
]
;; ===========================================
_minimal-make-word-list: func [
parms ;; could be any type -- we don't check
string [string!]
/for-search
][
;; Minimal function needed to parse a
;; string into a set of words
return unique sort parse/all trim/lines copy string " "
]
;; ========================================
_extract-words: func [
current-word [string!]
wb [block!]
level [integer!]
document-list [logic! none!]
/local
word-list
doc-list
][
;; Extract words
;; =============
;; Returns all the words in the index
;; that begin with the first letter
;; of the supplied current-word
;; Optionally (with document-list
;; parameter) also returns
;; the document names that
;; contain each word.
word-list: copy []
if document-list [doc-list: copy []]
;; Handle lowest-level index
;; -------------------------
if level = cif/config/index-levels [
foreach [word index] wb [
append word-list join current-word word
if document-list [append/only word-list sort _map-to-doc-names _unpack-numset index]
]
return word-list
]
;; Handle higher-level indexes
;; ---------------------------
for nn 1 length? wb/1 1 [
append word-list
_extract-words
join current-word wb/1/:nn
pick wb nn + 1
level + 1
document-list
]
if _has-leaf-node wb [
append word-list current-word
if document-list [append/only word-list sort _map-to-doc-names _unpack-numset last last wb]
]
return word-list
]
;; ========================================
_extract-words-for-document: func [
current-word [string!]
wb [block!]
level [integer!]
doc-id [integer!]
/local
word-list
][
;; Extract words-for-document
;; ==========================
;; Returns all the words in the index
;; that begin with the first letter
;; of the supplied current-word that
;; are indexed for the given doc-id
word-list: copy []
;; Handle lowest-level index
;; -------------------------
if level = cif/config/index-levels [
foreach [word index] wb [
if rse-ids/find-compact index doc-id [append word-list join current-word word]
]
return word-list
]
;; Handle higher-level indexes
;; ---------------------------
for nn 1 length? wb/1 1 [
append word-list
_extract-words-for-document
join current-word wb/1/:nn
pick wb nn + 1
level + 1
doc-id
]
if _has-leaf-node wb [
if rse-ids/find-compact first next last wb doc-id [append word-list current-word]
]
return word-list
]
;; ===========================================================
_invert-list: func [
invert-needed? [logic!]
res [block!]
/local
all-doc-ids
] [
if not invert-needed? [return res]
;; handle doc names are integers
;; -----------------------------
if cif/config/integer-document-names [
return difference _unpack-numset res _unpack-numset cif/document-list
]
;; Handle doc names are strings
;; ----------------------------
all-doc-ids: make block! length? cif/document-list
repeat nn length? cif/document-list [
if 0 <> cif/document-list/:nn [;; ignore deletion placeholders
insert all-doc-ids nn
]
]
return difference _unpack-numset res all-doc-ids
]
;; ===================================
_and-results: func [
prev-hits [block!]
new-hits [block!]
seq [integer!]
] [
;; ANDs later results to the results set
if seq = 1 [
return copy new-hits
return
]
return intersect prev-hits new-hits
]
;; ==================================
_remove-level: func [
wb [block!]
level [integer!]
doc-ids [block!]
/local
pointer
removed
temp
] [
if 0 = length? wb [return true] ;; can this happen!?
;; Remove any leaf node entry
;; --------------------------
if _has-leaf-node wb [
temp: second last wb
foreach doc-id doc-ids [_remove-packed temp doc-id]
poke last wb 2 temp
if ["" []] = last wb [ ;; empty leaf
remove back tail wb
if 1 = length? wb [
clear wb
return true
]
]
]
;; Recurse downwards to the bottom for most removal work
;; ------------------------------------------------------
if level <> cif/config/index-levels [
for nn 1 length? wb/1 1 [
_remove-level first at wb (nn + 1) level + 1 doc-ids
]
_remove-empties wb
return true
]
;; Main removal job
;; ----------------
;; Step 1
;;
;; we're positioned on something like this:
;; ["abc"
;; ["ct" [3] "dd" [3 5] ] "act" and "add"
;; ["at" [3x6] "et" [3x2 89x2] ] "bat" and "bet"
;; ["am" [3] "og" [3] ] "cam" and "cog"
;; ]
;;
;; So, as the next step, we step through those inner blocks
;; and remove the doc-id. If we are removing doc-id 3, then this
;; structure will become
;;
;; ["abc"
;; ["dd" [5] ] "act" gone
;; ["at" [4x5] "et" [4 89x2] ] "bat" and "bet"
;; [] "cam" and "cog" both gone
;; ]
;;
;; See Step 2 for sorting out the empty block left by"cam" and "cog"
for nn 1 length? wb/1 1
[nn: nn + 1
pointer: head wb/:nn
loop (length? wb/:nn) / 2 [
foreach doc-id doc-ids [_remove-packed pointer/2 doc-id]
either all [block? pointer/2 0 = length? pointer/2] [
remove pointer
remove pointer
] [
pointer: next next pointer
]
]
]
;; Step 2
;; ------
;; Something like:
;; ["abc"
;; [dd" [5] ] "add"
;; ["at" 4x5 "et" [4 89x2] ] "bat" and "bet"
;; [] empty c entry
;; ]
;; Must become:
;; ["ab" c gone
;; [dd" [5] ] "add"
;; ["at" [4x5] "et" [4 89x2] ] "bat" and "bet"
;; ]
_remove-empties wb
;; Step 3
;; ------
if 1 = length? wb [clear wb] ;; removes stray [""]
if 0 = length? wb [return true]
return true
]
;; ==================================
_remove-empties: func [
wb [block!]
/local removed
] [
forever [
removed: false
for nn 1 length? wb/1 1 [
nn: nn + 1
if any [0 = length? wb/:nn
all [1 = length? wb/:nn "" = wb/:nn/1]
] [
removed: true
remove at wb nn
alter wb/1 to-string pick wb/1 nn - 1
break ;; so the loop can restart
]
] ;; for
if not removed [break] ;; no more forever loop
] ;; forever
;; Check if leaf node is empty
;; ===========================
if all [_has-leaf-node wb
0 = length? last wb] [
remove at wb length? wb
]
return true
]
] ;; skimp object Notes
|