Script Library: 1238 scripts
 

split.r

REBOL [ File: %split.r Date: 6-May-2012 Title: "Split.r" Author: "Gregg Irwin" Purpose: { Provide functions to split a series into pieces, according to different criteria and needs. } library: [ level: 'intermediate platform: 'all type: [module function dialect] domain: [parse dialects text text-processing] tested-under: [View 2.7.8 on Win7] license: 'MIT ] ] use [map-each test split-tests] [ ; R3-compatible map-each interface. ; ; This is local to the context for SPLIT, because it is not designed ; to be fully R3 MAP-EACH compatible. For that, you should look at ; Brian Hawley's %r2-forward.r. ; ; What happens if the result of the DO is unset!? For now, we'll ; ignore unset values. The example case being SPLIT, which uses ; MAP-EACH with an unset value for negative numeric vals used to ; skip in the series. map-each: func [ 'word data [block!] body [block!] /local tmp ] [ collect compose/deep [ repeat (word) data [ set/any 'tmp do bind/copy body (to lit-word! word) if value? 'tmp [keep/only :tmp] ] ] ] ; R2 version. ; ; There are differences from the version in R3 which, itself, will likely ; need to be revisited due to changes in R3. split: func [ [catch] {Split a series into pieces; fixed or variable size, fixed number, or at delimiters} series [series!] "The series to split" dlm [block! integer! char! bitset! any-string!] "Split size, delimiter(s), or rule(s)." /into {If dlm is an integer, split into n pieces, rather than pieces of length n.} /local size count mk1 mk2 res piece-size fill-val add-fill-val type ][ ; This is here becaus using "to series", which should work, fails if the ; target type is paren!. If we ignore that case, all the "to type" bits ; can go away completely. type: type? series either all [block? dlm parse dlm [some integer!]] [ map-each len dlm [ either positive? len [ to type copy/part series series: skip series len ] [ series: skip series negate len () ] ] ] [ size: dlm res: collect [ parse/all series case [ all [integer? size into] [ if size < 1 [throw make error! compose [script invalid-arg size]] count: size - 1 ; Max 1 is to catch when size is larger than the series, giving us 0. piece-size: max 1 round/down divide length? series size [ count [copy series piece-size skip (keep/only to type series)] copy series to end (keep/only to type series) ] ] integer? dlm [ if size < 1 [throw make error! compose [script invalid-arg size]] [any [copy series 1 size skip (keep/only to type series)]] ] 'else [ ; This is quite a bit different under R2, in order to stop ; at the end properly and collect the final value. [ any [ mk1: [ to dlm mk2: dlm (keep to type copy/part mk1 mk2) | to end mk2: (keep to type copy mk1) skip ] ] ] ] ] ] ;-- Special processing, to handle cases where the spec'd more items in ; /into than the series contains (so we want to append empty items), ; or where the dlm was a char/string/charset and it was the last char ; (so we want to append an empty field that the above rule misses). fill-val: make type none add-fill-val: does [append/only res fill-val] case [ all [integer? size into] [ ; If the result is too short, i.e., less items than 'size, add ; empty items to fill it to 'size. ; We loop here, because insert/dup doesn't copy the value inserted. if size > length? res [ loop (size - length? res) [add-fill-val] ] ] ; integer? dlm [ ; ] 'else [ ; = any [bitset? dlm any-string? dlm char? dlm] ; If the last thing in the series is a delimiter, there is an ; implied empty field after it, which we add here. case [ bitset? dlm [ ; ATTEMPT is here because LAST will return NONE for an ; empty series, and finding none in a bitest is not allowed. if attempt [find dlm last series] [add-fill-val] ] ; These cases are now handled, under R2, by the parse rule, and ; no longer require special handling. ;char? dlm [ ; if dlm = last series [add-fill-val] ;] ;string? dlm [ ; if all [ ; find series dlm ; empty? find/last/tail series dlm ; ] [add-fill-val] ;] ] ] ] res ] ] test: func [block expected-result /local res err] [ ;print ['TEST tab mold block mold expected-result] if error? set/any 'err try [ res: do block ;print [mold/only :block newline tab mold res] if res <> expected-result [print [tab 'FAILED! tab 'expected mold expected-result]] ][ print [mold/only :block newline "ERROR!" mold disarm err] ] ] split-tests: [ >> [split "1234567812345678" 4] == ["1234" "5678" "1234" "5678"] >> [split "1234567812345678" 3] == ["123" "456" "781" "234" "567" "8"] >> [split "1234567812345678" 5] == ["12345" "67812" "34567" "8"] >> [split/into [1 2 3 4 5 6] 2] == [[1 2 3] [4 5 6]] >> [split/into "1234567812345678" 2] == ["12345678" "12345678"] >> [split/into "1234567812345678" 3] == ["12345" "67812" "345678"] >> [split/into "1234567812345678" 5] == ["123" "456" "781" "234" "5678"] ; Dlm longer than series >> [split/into "123" 6] == ["1" "2" "3" "" "" ""] ;or ["1" "2" "3"] >> [split/into [1 2 3] 6] == [[1] [2] [3] [] [] []] ;or [1 2 3] >> [split/into first [(1 2 3)] 6] == [(1) (2) (3) () () () ] ;or [1 2 3] ;>> [split/into [1 2 3] 6] == [[1] [2] [3] none none none] ;or [1 2 3] >> [split [1 2 3 4 5 6] [2 1 3]] == [[1 2] [3] [4 5 6]] >> [split "1234567812345678" [4 4 2 2 1 1 1 1]] == ["1234" "5678" "12" "34" "5" "6" "7" "8"] >> [split first [(1 2 3 4 5 6 7 8 9)] 3] == [(1 2 3) (4 5 6) (7 8 9)] >> [split #{0102030405060708090A} [4 3 1 2]] == [#{01020304} #{050607} #{08} #{090A}] >> [split [1 2 3 4 5 6] [2 1]] == [[1 2] [3]] >> [split [1 2 3 4 5 6] [2 1 3 5]] == [[1 2] [3] [4 5 6] []] >> [split [1 2 3 4 5 6] [2 1 6]] == [[1 2] [3] [4 5 6]] ; Old design for negative skip vals ;>> [split [1 2 3 4 5 6] [3 2 2 -2 2 -4 3]] == [[1 2 3] [4 5] [6] [5 6] [3 4 5]] ; New design for negative skip vals >> [split [1 2 3 4 5 6] [2 -2 2]] == [[1 2] [5 6]] >> [split "abc,de,fghi,jk" #","] == ["abc" "de" "fghi" "jk"] >> [split "abc<br>de<br>fghi<br>jk" <br>] == ["abc" "de" "fghi" "jk"] >> [split "a.b.c" "."] == ["a" "b" "c"] >> [split "c c" " "] == ["c" "c"] >> [split "1,2,3" " "] == ["1,2,3"] >> [split "1,2,3" ","] == ["1" "2" "3"] >> [split "1,2,3," ","] == ["1" "2" "3" ""] >> [split "1,2,3," #","] == ["1" "2" "3" ""] >> [split "1..2..3" ".."] == ["1" "2" "3"] >> [split "1..2..3.." ".."] == ["1" "2" "3" ""] ; Doesn't work under R2, because PARSE doesn't support [to charset!] ; or [to block!]. ; Need to look at Ladislav's parse enhancements to see about that. ;>> [split "1,2,3," charset ",."] == ["1" "2" "3" ""] ;>> [split "1.2,3." charset ",."] == ["1" "2" "3" ""] ; This doesn't work under R2 either. ;>> [split "-a-a" ["a"]] == ["-" "-"] ;>> [split "-a-a'" ["a"]] == ["-" "-" "'"] ;>> [split "abc^M^Jde^Mfghi^Jjk" [crlf | #"^M" | newline]] == ["abc" "de" "fghi" "jk"] ;>> [split "abc de fghi jk" [some #" "]] == ["abc" "de" "fghi" "jk"] ] run-split-tests: has [chevron =test =expected-result n] [ chevron: (to lit-word! ">>") n: 0 prin "Tests parsed successfully: " print parse split-tests [ some [ chevron set =test block! '== set =expected-result block! (n: n + 1 test =test =expected-result) ] ] print [n "tests run"] ] ]
halt ;; to terminate script if DO'ne from webpage
Notes