View in color | License | Download script | History | Other scripts by: luce80 |
30-Apr 13:54 UTC
[0.098] 38.684k
[0.098] 38.684k
compression.rREBOL [
title: "Various compression algorithms"
file: %compression.r
author: "Marco Antoniazzi"
email: [luce80 AT libero DOT it]
date: 14-02-2013
version: 0.0.3
Purpose: "Collect and show various compression algorithms (Huffman, RLE, LZ77, LZSS, LZ78, LZW)."
History: [
0.0.1 [01-01-2013 "Started"]
0.0.2 [12-02-2013 "ok"]
0.0.3 [14-02-2013 "Fixed silly bug in lzw binary! decompress"]
]
library: [
level: 'intermediate
platform: 'all
type: [function tool]
domain: [compression]
tested-under: [View 2.7.8.3.1]
support: none
license: 'public-domain
]
notes: {
I should say that these functions are made slow on pourpose.
Any optimization for speed or bits-saving is left as an exercise to the reader ;)
Do not esitate to help me improve this script by adding more algorithms.
}
]
compression: context [
; some functions taken or partially derived from an article of Ole Friis in www.rebolforces.com
alphadigits: "0123456789ABCDEF"
enbase: func [value [integer!] /base base-value [integer!] /local result num][
if value = 0 [return "0"]
base-value: any [base-value 16]
result: copy ""
while [value <> 0][
num: mod value base-value
insert result any [alphadigits/(num + 1) "0"]
value: (value - num) / base-value
]
result
]
debase: func [value [any-string!] /base base-value [integer!] /local num char pos][
base-value: any [base-value 16]
;FIXME: if 0 <> mod (length? string) base-value [make error! "wrong string length"]
num: 0
foreach char value [
if none? pos: find alphadigits uppercase char [break]
num: num * base-value + (index? pos) - 1
]
num
]
from-base2: func [value][debase/base value 2]
to-base2: func [value][enbase/base value 2]
bit-pos: 1
bin-result: copy #{}
to-bin: func [num][system/words/debase/base to-hex num 16]
bits-set: func [num len /local shifted-num result][
; shift num left to bit-pos
shifted-num: shift/left to-integer num 32 - len - bit-pos + 1
; overwrite dest binary with shifted number
result: change bin-result bin-result or to-bin shifted-num
; update bit number and bin-result pos
bit-pos: bit-pos + len
while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
head result
]
bits-get: func [len /local mask num][
mask: shift to-integer #{80000000} len - 1
mask: shift/logical mask bit-pos - 1
; extract bits using mask and then shift them to get the right number
num: mask and to-integer copy/part bin-result 4
num: shift/logical num 32 - len - bit-pos + 1
bit-pos: bit-pos + len
while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
num
]
encode-bitstream: func [;author Ole Friis
"Encodes a string of 1's and 0's to a binary string"
s [string!] "The string of 1's and 0's"
/local res byte add-this
][
res: copy #{}
forever [
byte: 0
add-this: 128
while [(add-this <> 0) and (not tail? s)] [
if #"1" = first s [byte: byte + add-this]
add-this: shift add-this 1
s: next s
]
append res to-char byte
if tail? s [return res]
]
]
decode-bitstream: func [;author Ole Friis
"Decodes a binary string into a string of 1's and 0's"
s [binary!] "The binary string"
/local res next-bit
][
res: copy ""
next-bit: 128
while [not tail? s] [
append res either 0 = and~ next-bit to-integer first s ["0"]["1"]
next-bit: shift next-bit 1
if next-bit = 0 [next-bit: 128 s: next s]
]
res
]
huffman: context [
stats: none ; The statistics
tree: none ; the Huffman tree
codes: none ; the translation table with alphabet symbols and their Huffman code
huffman-code: copy "" ; temporary partial Huffman code
build_stats: func [
{build a block with all symbols and their count (eg. [2 #"e" 7 #"a" ...])}
data [any-string!] "data to analyze"
/local char pos
][
stats: copy []
data: as-string data
foreach char data [
either pos: find/case stats char [pos/(-1): pos/(-1) + 1][insert insert stats 1 char]
]
sort/skip stats 2
]
build_tree: func [
"build the Huffman tree" ; the tree is a list of lists something like: [[[a 3] [[[c 1] [b 1] 2] [e 2] 4] 7]]
stats [block!] "The statistics"
/local node1 node2 weight new-node temp-list
][
tree: copy []
; Create a leaf node for each symbol and add it to the tree
forskip stats 2 [repend/only tree [second stats first stats]]
while [1 < length? tree][
node1: take tree
node2: take tree
weight: (last node1) + (last node2)
new-node: reduce [node1 node2 weight]
; Insert the new node correctly (sorted) in the tree (we could use a bisection)
temp-list: head tree
while [all [not tail? temp-list (last first temp-list) <= weight]][
temp-list: next temp-list
]
insert/only temp-list new-node
]
; Return the top element of the Huffman tree
tree: first tree
]
build_table: func [
"recursively traverse the huffman tree building a translation table"
tree [block!]
/local node
][
if block? node: first tree [
append huffman-code "0"
build_table node
]
either block? node: second tree [
append huffman-code "1"
build_table node
][
repend codes [first tree copy huffman-code]
]
remove back tail huffman-code
make hash! codes
]
compress: func [
"Huffman-compress a string series"
data [any-string!] "Data to compress"
/table block [block!] "The Huffman translation table to use"
/to-block "Give result as a block of strings"
/local char result code block-result
][
result: copy ""
block-result: copy []
block: any [block do [codes: copy [] build_table build_tree build_stats data]]
foreach char data [
insert tail result code: select block to-char char ;cannot use path selection because it is case INSENSITIVE (!?)
if to-block [insert tail block-result code]
]
if to-block [return block-result]
result: encode-bitstream result
]
to-canonical: func [
"Transform a Huffman translation table to a canonical one"
/table block [block!] "The Huffman translation table to use"
/local codes-copy canonical-codes current-bit-length new-code
][
codes-copy: copy any [block codes]
sort/skip codes-copy 2
canonical-codes: copy []
;The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
current-bit-length: length? second codes-copy
repend canonical-codes [first codes-copy head insert/dup clear "" "0" current-bit-length]
new-code: second canonical-codes
codes-copy: skip codes-copy 2
forskip codes-copy 2 [
current-bit-length: length? second codes-copy
new-code: to-base2 (from-base2 new-code) + 1
if current-bit-length > length? new-code [insert new-code "0"]
if current-bit-length > length? new-code [insert tail new-code "0"]
repend canonical-codes [first codes-copy new-code]
]
make hash! canonical-codes
]
decompress: func [
"decompress Huffman encoded block or binary"
data [block! binary!] "data to decompress"
/tree tree-block [block!] "The Huffman tree to use"
/table block [block!] "The Huffman translation table to use"
/local result char temp-code node
][
result: copy #{}
if block? data [data: rejoin data]
if binary? data [data: decode-bitstream data]
either block [
while [not tail? data][
temp-code: clear ""
char: false
while [not char][
append temp-code first+ data
char: find/case block temp-code
if char [char: first back char]
]
append result char
]
][
tree-block: any [tree-block self/tree]
while [not tail? data][
node: tree-block
until [
node: either (first+ data) = #"0" [first node] [second node]
(length? node) = 2 ; a leaf
]
append result first node
]
]
data: head data
result
]
] ; huffman
rle: context [
compress: func [
"Run length (aka PackBits) compresses a string series and returns it."
data [any-string!] "Data to compress"
/local start count char result
][
result: copy #{}
while [not tail? data] [
count: 0
start: data
char: pick data 1
; collect a sequence of not equal characters (optionally including some pairs of equals)
while [all [any [char <> pick data 2 char <> pick data 3] count < 127]][
count: count + 1
data: next data
char: pick data 1
]
if count > 0 [repend result [to-char (count - 1) copy/part start count]]
if count = 127 [data: back data]
if tail? data [break]
count: 0
start: data
char: pick data 1
; collect a sequence of all equal characters
while [all [char = pick data 1 count < 128]][
count: count + 1
data: next data
]
if count > 1 [repend result [to-char (256 - count + 1) char]]
]
data: head data
result
]
decompress: func [
"decompress a run length (aka PackBits) compressed binary"
data [binary!] "Data to decompress"
/local char count result
][
result: copy #{}
while [not tail? data] [
char: first data
case [
char < 128 [
count: 1 + char
insert tail result copy/part next data count
data: skip data count + 1
]
char > 128 [
insert/dup tail result to-char second data (256 + 1 - char)
data: skip data 2
]
char = 128 [
data: skip data 1
]
]
]
data: head data
result
]
] ; rle
; utility functions for LZ77 and LZSS
shift_window: func [look-ahead-buffer positions][
set look-ahead-buffer skip get look-ahead-buffer positions
]
match-length: func [a b /local start][
start: a
while [all [a/1 = b/1 not tail? a]][a: next a b: next b]
offset? start a
]
find_longest_match: func [search data search-buffer-size look-ahead-buffer-size /local pos len off length result][
pos: data
length: 0
result: head insert insert clear [] 0 0
while [pos: find/case/reverse pos first data] [
if (off: offset? pos data) > search-buffer-size [break]
if (len: match-length pos data) > length [
if len > look-ahead-buffer-size [break]
length: len
change/part result reduce [off length] 2
]
]
result
]
;
lz77: context [
result: copy []
compress: func [
"LZ77 compress a string series"
data [any-string!] "data to compress"
/to-block "return abstract block"
/local
look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size position length
emit
][
clear result
look-ahead-buffer: data
look-ahead-buffer-size: 15
search-buffer: data
search-buffer-size: 255 ; if you increase this you have to change the code to generate binary! accordingly
emit: func [pos len char][insert insert insert tail result pos len char]
while [not empty? look-ahead-buffer] [
;go backwards in search buffer to find longest match of the look-ahead buffer
set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
emit position length any [pick look-ahead-buffer length + 1 "^@"]
shift_window 'look-ahead-buffer length + 1
]
probe result
; convert to compact binary!
unless to-block [
bit-pos: 1 ; reset bit position
bin-result: clear head bin-result
while [not tail? result] [
either 0 = first result [
insert insert tail bin-result to-char first result third result
][
insert insert insert tail bin-result to-char first result to-char second result third result
]
result: skip result 3
]
return copy head bin-result
]
copy result
]
decompress: func [
"decompress LZ77 encoded binary"
data [block! binary!] "String to decompress"
/local flag outstring index char out bytes
][
if binary? data [
;convert back to block
;(obviously we could convert directly to final binary but this is kind of "educational")
bit-pos: 1 ; reset bit position
bin-result: copy data
out: copy []
while [0 < length? bin-result][
flag: first bin-result
insert tail out flag
either flag != 0 [
insert tail out any [pick bin-result 2 0]
insert tail out to-char any [pick bin-result 3 0]
bin-result: skip bin-result 3
][
insert tail out 0
insert tail out to-char second bin-result
bin-result: skip bin-result 2
]
]
probe data: out
]
out: make binary! length? data
foreach [offset length symbol] data [
;go reverse in previous output by offset characters and copy character wise for length symbols;
out: insert out bytes: copy/part skip out negate offset length
; special case for duplicated bytes (encoding reaches look-ahead buffer)
if length > offset [
out: insert/dup out bytes (to-integer length / offset) - 1
out: insert out copy/part bytes mod length offset
]
out: insert out symbol
]
head out
]
] ; lz77
lzss: context [
result: copy []
compress: func [
"LZSS compress a string series"
data [any-string!] "data to compress"
/to-block "return abstract block"
/local
look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size minimum-match-length position length
emit emit-char
][
clear result
look-ahead-buffer: data
look-ahead-buffer-size: 15
search-buffer: data
search-buffer-size: 4095
minimum-match-length: 2
emit: func [flag pos len][insert insert insert tail result flag pos len]
emit-char: func [flag char][insert insert tail result flag char]
while [not empty? look-ahead-buffer] [
;go backwards in search buffer to find longest match of the look-ahead buffer
set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
either length > minimum-match-length [
emit 0 position length
shift_window 'look-ahead-buffer length
][
emit-char 1 first look-ahead-buffer
shift_window 'look-ahead-buffer 1
]
]
probe result
; convert to compact binary!
; (note that the (un)compressed flag is reversed from standard since this simplifies decoding)
unless to-block [
bit-pos: 1 ; reset bit position
bin-result: clear head bin-result
while [not tail? result] [
either 0 = first result [
bits-set 0 1
bits-set second result 12
bits-set third result 4
result: skip result 3
][
bits-set 1 1
bits-set second result 8
result: skip result 2
]
]
return copy head bin-result
]
copy result
]
decompress: func [
"decompress LZSS encoded binary"
data [block! string! binary!] "data to decompress"
/local offset length out flag bytes
][
if string? data [data: to-binary data]
if binary? data [
;convert back to block
;(obviously we could convert directly to final binary but this is kind of "educational")
bit-pos: 1 ; reset bit position
bin-result: copy data
out: copy []
while [0 < length? bin-result][
flag: bits-get 1
insert tail out flag
either flag = 0 [
insert tail out bits-get 12
insert tail out bits-get 4
][
insert tail out to-char bits-get 8
]
]
data: out
]
probe data
out: make binary! length? data
while [not empty? data] [
either 0 = first data [
offset: second data
length: third data
;go reverse in previous output by offset characters and copy character wise for length symbols;
out: insert out bytes: copy/part skip out negate offset length
; special case for duplicated bytes (encoding reaches look-ahead buffer)
if length > offset [
out: insert/dup out bytes (to-integer length / offset) - 1
out: insert out copy/part bytes mod length offset
]
data: skip data 3
][
out: insert out second data ; copy uncompressed character
data: skip data 2
]
]
head out
]
] ; lzss
lz78: context [
result: copy []
string-table: make hash! []
add_table_entry: func [string][head insert tail string-table string]
compress: func [
"LZ78 compress a string"
data [any-string!] "String to compress"
/to-block "return abstract block"
/local
bytes byte bytes+byte index
emit index_from_string
][
clear result
clear string-table
emit: func [code char][head insert insert tail result code char]
index_from_string: func [string /local pos][(pos: find/case string-table string) either pos [index? pos][0]]
bytes: ""
foreach byte data [
bytes+byte: join bytes byte
either find/case string-table bytes+byte [
bytes: bytes+byte
][
emit index_from_string bytes byte
add_table_entry bytes+byte
clear bytes
]
]
emit index_from_string bytes ""
probe result
; convert to compact binary!
unless to-block [
bit-pos: 1 ; reset bit position
bin-result: clear head bin-result
index: 1
forskip result 2 [
bits-set first result to integer! 0.99999 + log-2 index ; this math calcs necessary number of bits
bits-set second result 8
index: index + 1
]
return copy head bin-result
]
copy result
]
decompress: func [
"decompress LZ78 encoded binary"
data [block! binary!] "data to decompress"
/local
outstring index char out
emit string_from_index
][
if binary? data [
;convert back to block
;(obviously we could convert directly to final binary but this is kind of "educational")
bit-pos: 1 ; reset bit position
bin-result: copy data
out: copy []
insert tail out 0
insert tail out to-char bits-get 8
index: 2
while [0 < length? bin-result][
insert tail out bits-get to integer! 0.99999 + log-2 index
insert tail out to-char bits-get 8
index: index + 1
]
data: out
]
probe data
out: make binary! length? data
emit: func [string][insert tail out string]
string_from_index: func [index][either index > 0 [string-table/(index)][""]]
clear string-table
forskip data 2 [
;read pair of index and character from input
index: first data
char: second data
outstring: join string_from_index index char
emit outstring
add_table_entry outstring
]
out
]
] ; lz78
lzw: context [; encoding and decoding algorithms translated from TIFF documentation
clearcode: 256
endofinformation: 257
result: copy []
string-table: make hash! []
bit-len: 9
initialize_table: does [
either empty? string-table [
clear head string-table
for n 0 255 1 [insert tail string-table to-string to-char n]
insert tail string-table clearcode
insert tail string-table endofinformation
][
string-table: head clear at string-table 259
]
bit-len: 9
]
compress: func [
"LZW compresses a string series and returns it."
data [any-string!] "Data to compress"
/to-block "return abstract block"
/local bytes byte bytes+byte out
add_table_entry emit code_from_string
][
result: clear head result
bit-pos: 1 ; reset bit position
bin-result: clear head bin-result
initialize_table
add_table_entry: func [string][
insert tail string-table string
switch length? string-table [
512 [bit-len: 10]
1024 [bit-len: 11]
2048 [bit-len: 12]
4096 [emit clearcode initialize_table]
]
]
emit: func [code][
head insert tail result either code > 255 [code][to-char code] ;remove or comment this if you want only binary
unless to-block [bits-set code bit-len]
]
code_from_string: func [string][(index? find/case string-table string) - 1] ; - 1 because index? is 1 based
emit clearcode
bytes: ""
foreach byte data [
bytes+byte: join bytes byte
either find/case string-table bytes+byte [
bytes: bytes+byte
][
emit code_from_string to-string bytes
add_table_entry bytes+byte
bytes: byte
]
] ; end of for loop
emit code_from_string to-string bytes
emit endofinformation
probe result
unless to-block [return copy head bin-result]
copy result
]
decompress: func [
"Decompress LZW compressed binary"
data [block! binary!] "Data to decompress"
/local code oldcode out
add_table_entry emit is_in_table
][
if binary? data [
bit-pos: 1 ; reset bit position
bin-result: copy data
bit-len: 9
]
out: make binary! length? data
add_table_entry: func [string][
insert tail string-table string
switch length? string-table [
511 [bit-len: 10]
1023 [bit-len: 11]
2047 [bit-len: 12]
]
]
emit: func [code][insert tail out code ]
is_in_table: func [code][if code < 256 [code: to-string to-char code] any [find/case string-table code code < length? string-table]]
get-code: func [][
either block? data [
to-integer first+ data
][
bits-get bit-len
]
]
while [(code: get-code) != endofinformation] [
either code = clearcode [
initialize_table
code: get-code data
if code = endofinformation [break]
emit string-table/(code + 1)
][ ; end of clearcode case
either is_in_table code [
emit string-table/(code + 1)
add_table_entry join string-table/(oldcode + 1) first string-table/(code + 1)
][
outstring: join string-table/(oldcode + 1) first string-table/(oldcode + 1)
emit outstring
add_table_entry outstring
]
] ; end of not-clearcode case
oldcode: code
] ; end of while loop
out
]
]; lzw
]
; comment [
do [
string: "abracadabradadadadadadaaaaaaaa"
print "^/Huffman compress:"
probe string
probe stream: compression/huffman/compress/to-block string
probe as-string compression/huffman/decompress stream
print "^/RLE compress:"
probe string
probe stream: compression/rle/compress string
probe as-string compression/rle/decompress stream
print "^/LZ77 compress:"
probe string
probe stream: compression/lz77/compress string
probe as-string compression/lz77/decompress stream
print "^/LZSS compress:"
probe string
probe stream: compression/lzss/compress string
probe as-string compression/lzss/decompress stream
print "^/LZ78 compress:"
probe string
probe stream: compression/lz78/compress string
probe as-string compression/lz78/decompress stream
print "^/LZW compress:"
probe string
probe stream: compression/lzw/compress string
probe as-string compression/lzw/decompress stream
halt
] |