Script Library: compression.r

Script Library: 1247 scripts
compression.r

REBOL [
    title: "Various compression algorithms"
    file: %compression.r
    author: "Marco Antoniazzi"
    email: [luce80 AT libero DOT it]
    date: 14-02-2013
    version: 0.0.3
    Purpose: "Collect and show various compression algorithms (Huffman, RLE, LZ77, LZSS, LZ78, LZW)."
    History: [
        0.0.1 [01-01-2013 "Started"]
        0.0.2 [12-02-2013 "ok"]
        0.0.3 [14-02-2013 "Fixed silly bug in lzw binary! decompress"]
    ]
    library: [
        level: 'intermediate
        platform: 'all
        type: [function tool]
        domain: [compression]
        tested-under: [View 2.7.8.3.1]
        support: none
        license: 'public-domain
    ]
    notes: {
        I should say that these functions are made slow on pourpose.
        Any optimization for speed or bits-saving is left as an exercise to the reader ;)
        
        Do not esitate to help me improve this script by adding more algorithms.
    }
]

compression: context [
    ; some functions taken or partially derived from an article of Ole Friis in www.rebolforces.com

    alphadigits: "0123456789ABCDEF"
    enbase: func [value [integer!] /base base-value [integer!] /local result num][
        if value = 0 [return "0"]
        base-value: any [base-value 16]
        result: copy ""
        while [value <> 0][
            num: mod value base-value
            insert result any [alphadigits/(num + 1) "0"]
            value: (value - num) / base-value
        ]
        result
    ]
    debase: func [value [any-string!] /base base-value [integer!] /local num char pos][
        base-value: any [base-value 16]
        ;FIXME: if 0 <> mod (length? string) base-value [make error! "wrong string length"]
        num: 0
        foreach char value [
            if none? pos: find alphadigits uppercase char [break]
            num: num * base-value + (index? pos) - 1
        ]
        num
    ]
    from-base2: func [value][debase/base value 2]
    to-base2: func [value][enbase/base value 2]
    
    
    
    bit-pos: 1
    bin-result: copy #{}
    to-bin: func [num][system/words/debase/base to-hex num 16]
    bits-set: func [num len /local shifted-num result][
        ; shift num left to bit-pos
        shifted-num: shift/left to-integer num 32 - len - bit-pos + 1 
        ; overwrite dest binary with shifted number
        result: change bin-result bin-result or to-bin shifted-num
        ; update bit number and bin-result pos
        bit-pos: bit-pos + len
        while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
        head result
    ]
    bits-get: func [len /local mask num][
        mask: shift to-integer #{80000000} len - 1
        mask: shift/logical mask bit-pos - 1
        ; extract bits using mask and then shift them to get the right number
        num: mask and to-integer copy/part bin-result 4
        num: shift/logical num 32 - len - bit-pos + 1
        bit-pos: bit-pos + len
        while [bit-pos > 8] [bit-pos: bit-pos - 8 bin-result: next bin-result]
        num
    ]

    encode-bitstream: func [;author Ole Friis
        "Encodes a string of 1's and 0's to a binary string"
        s [string!] "The string of 1's and 0's"
        /local res byte add-this
        ][
        res: copy #{}
        forever [
            byte: 0
            add-this: 128
            while [(add-this <> 0) and (not tail? s)] [
                if #"1" = first s [byte: byte + add-this]
                add-this: shift add-this 1
                s: next s
            ]
            append res to-char byte
            if tail? s [return res]
        ]
    ]
    decode-bitstream: func [;author Ole Friis
        "Decodes a binary string into a string of 1's and 0's"
        s [binary!] "The binary string"
        /local res next-bit
        ][
        res: copy ""
        next-bit: 128
        while [not tail? s] [
            append res either 0 = and~ next-bit to-integer first s ["0"]["1"]
            next-bit: shift next-bit 1
            if next-bit = 0 [next-bit: 128  s: next s]
        ]
        res
    ]   



    huffman: context [

        stats: none ; The statistics
        tree: none ; the Huffman tree
        codes: none ; the translation table with alphabet symbols and their Huffman code
        huffman-code: copy "" ; temporary partial Huffman code

        build_stats: func [
            {build a block with all symbols and their count (eg. [2 #"e" 7 #"a" ...])}
            data [any-string!] "data to analyze"
            /local char pos
            ][
            stats: copy []
            data: as-string data
            foreach char data [
                either pos: find/case stats char [pos/(-1): pos/(-1) + 1][insert insert stats 1 char]
            ]
            
            sort/skip stats 2
        ]

        build_tree: func [
            "build the Huffman tree" ; the tree is a list of lists something like: [[[a 3] [[[c 1] [b 1] 2] [e 2] 4] 7]]
            stats [block!] "The statistics"
            /local node1 node2 weight new-node temp-list
            ][
            tree: copy []
            ; Create a leaf node for each symbol and add it to the tree
            forskip stats 2 [repend/only tree [second stats first stats]]

            while [1 < length? tree][
                node1: take tree
                node2: take tree
                weight: (last node1) + (last node2)
                new-node: reduce [node1 node2 weight]
                ; Insert the new node correctly (sorted) in the tree (we could use a bisection)
                temp-list: head tree
                while [all [not tail? temp-list  (last first temp-list) <= weight]][
                  temp-list: next temp-list
                ]
                insert/only temp-list new-node
            ]
            ; Return the top element of the Huffman tree
            tree: first tree
        ]

        build_table: func [
            "recursively traverse the huffman tree building a translation table"
            tree [block!]
            /local node
            ][
            if block? node: first tree [
                append huffman-code "0"
                build_table node
            ]
            either block? node: second tree [
                append huffman-code "1"
                build_table node
            ][
                repend codes [first tree copy huffman-code]
            ]
            remove back tail huffman-code
            make hash! codes
        ]

        compress: func [
            "Huffman-compress a string series"
            data [any-string!] "Data to compress"
            /table block [block!]  "The Huffman translation table to use"
            /to-block "Give result as a block of strings"
            /local char result code block-result
            ][
            result: copy ""
            block-result: copy []
            block: any [block do [codes: copy [] build_table build_tree build_stats data]]
            foreach char data [
                insert tail result code: select block to-char char ;cannot use path selection because it is case INSENSITIVE (!?)
                if to-block [insert tail block-result code]
            ]
            if to-block [return block-result]
            result: encode-bitstream result
        ]

        to-canonical: func [
            "Transform a Huffman translation table to a canonical one"
            /table block [block!]  "The Huffman translation table to use"
            /local codes-copy canonical-codes current-bit-length new-code
            ][
            codes-copy: copy any [block codes]
            sort/skip codes-copy 2
            canonical-codes: copy []
            ;The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
            current-bit-length: length? second codes-copy
            repend canonical-codes [first codes-copy head insert/dup clear "" "0" current-bit-length]

            new-code: second canonical-codes
            codes-copy: skip codes-copy 2
            forskip codes-copy 2 [
                current-bit-length: length? second codes-copy
                new-code: to-base2 (from-base2 new-code) + 1
                if current-bit-length > length? new-code [insert new-code "0"]
                if current-bit-length > length? new-code [insert tail new-code "0"]
                repend canonical-codes [first codes-copy new-code]
            ]
            make hash! canonical-codes
        ]

        decompress: func [
            "decompress Huffman encoded block or binary"
            data [block! binary!] "data to decompress"
            /tree tree-block [block!]  "The Huffman tree to use"
            /table block [block!]  "The Huffman translation table to use"
            /local result char temp-code node
            ][
            result: copy #{}
            if block? data [data: rejoin data]
            if binary? data [data: decode-bitstream data]
            either block [
                while [not tail? data][
                    temp-code: clear ""
                    char: false
                    while [not char][
                        append temp-code first+ data
                        char: find/case block temp-code
                        if char [char: first back char]
                    ]
                    append result char
                ]
            ][
                tree-block: any [tree-block self/tree]
                while [not tail? data][
                    node: tree-block
                    until [
                        node: either (first+ data) = #"0" [first node] [second node]
                        (length? node) = 2 ; a leaf
                    ]
                    append result first node
                ]
            ]
            data: head data
            result
        ]
        
    ] ; huffman
    
    rle: context [

        compress: func [
            "Run length (aka PackBits) compresses a string series and returns it."
            data [any-string!] "Data to compress"
            /local start count char result
            ][
            result: copy #{}
            while [not tail? data] [
                count: 0
                start: data
                char: pick data 1
                ; collect a sequence of not equal characters (optionally including some pairs of equals)
                while [all [any [char <> pick data 2 char <> pick data 3] count < 127]][
                    count: count + 1
                    data: next data
                    char: pick data 1
                ]
                if count > 0 [repend result [to-char (count - 1) copy/part start count]]
                if count = 127 [data: back data]
                if tail? data [break] 
                count: 0
                start: data
                char: pick data 1
                ; collect a sequence of all equal characters
                while [all [char = pick data 1 count < 128]][
                    count: count + 1
                    data: next data
                ]
                if count > 1 [repend result [to-char (256 - count + 1) char]]
            ]
            data: head data
            result
        ]

        decompress: func [
            "decompress a run length (aka PackBits) compressed binary"
            data [binary!] "Data to decompress"
            /local char count result
            ][
            result: copy #{}
            while [not tail? data] [
                char: first data
                case [
                    char < 128 [
                        count: 1 + char
                        insert tail result copy/part next data count
                        data: skip data count + 1
                    ]
                    char > 128 [
                        insert/dup tail result to-char second data (256 + 1 - char)
                        data: skip data 2
                    ]
                    char = 128 [
                        data: skip data 1
                    ]
                ]
            ]
            data: head data
            result
        ]
    ] ; rle

    ; utility functions for LZ77 and LZSS
        shift_window: func [look-ahead-buffer positions][
            set look-ahead-buffer skip get look-ahead-buffer positions              
        ]
        match-length: func [a b /local start][
            start: a
            while [all [a/1 = b/1 not tail? a]][a: next a b: next b]
            offset? start a
        ]
        find_longest_match: func [search data search-buffer-size look-ahead-buffer-size /local pos len off length result][
            pos: data
            length: 0
            result: head insert insert clear [] 0 0
            while [pos: find/case/reverse pos first data] [
                if (off: offset? pos data) > search-buffer-size [break]
                if (len: match-length pos data) > length [
                    if len > look-ahead-buffer-size [break]
                    length: len
                    change/part result reduce [off length] 2
                ]
            ]
            result
        ]
    ;

    lz77: context [

        result: copy []

        compress: func [
            "LZ77 compress a string series"
            data [any-string!] "data to compress"
            /to-block "return abstract block"
            /local
            look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size position length
            emit
            ][
            clear result
            look-ahead-buffer: data
            look-ahead-buffer-size: 15
            search-buffer: data
            search-buffer-size: 255 ; if you increase this you have to change the code to generate binary! accordingly

            emit: func [pos len char][insert insert insert tail result pos len char]

            while [not empty? look-ahead-buffer] [
                ;go backwards in search buffer to find longest match of the look-ahead buffer
                set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
                emit position length any [pick look-ahead-buffer length + 1 "^@"]
                shift_window 'look-ahead-buffer length + 1
            ]

            probe result
            ; convert to compact binary!
            unless to-block [
                bit-pos: 1 ; reset bit position
                bin-result: clear head bin-result
                while [not tail? result] [
                    either 0 = first result [
                        insert insert tail bin-result to-char first result third result
                    ][
                        insert insert insert tail bin-result to-char first result to-char second result third result
                    ]
                    result: skip result 3
                ]
                return copy head bin-result
            ]

            copy result
        ]

        decompress: func [
            "decompress LZ77 encoded binary"
            data [block! binary!] "String to decompress"
            /local flag outstring index char out bytes
            ][
            if binary? data [
                ;convert back to block
                ;(obviously we could convert directly to final binary but this is kind of "educational")
                bit-pos: 1 ; reset bit position
                bin-result: copy data
                out: copy []
                while [0 < length? bin-result][
                    flag: first bin-result
                    insert tail out flag
                    either flag != 0 [
                        insert tail out any [pick bin-result 2 0]
                        insert tail out to-char any [pick bin-result 3 0]
                        bin-result: skip bin-result 3
                    ][
                        insert tail out 0
                        insert tail out to-char second bin-result
                        bin-result: skip bin-result 2
                    ]
                ]
                probe data: out
            ]

            out: make binary! length? data
            foreach [offset length symbol] data [
                ;go reverse in previous output by offset characters and copy character wise for length symbols;
                out: insert out bytes: copy/part skip out negate offset length
                ; special case for duplicated bytes (encoding reaches look-ahead buffer)
                if length > offset [
                    out: insert/dup out bytes (to-integer length / offset) - 1
                    out: insert out copy/part bytes mod length offset
                ] 
                out: insert out symbol
            ]
            head out
        ]

    ] ; lz77

    lzss: context [

        result: copy []

        compress: func [
            "LZSS compress a string series"
            data [any-string!] "data to compress"
            /to-block "return abstract block"
            /local
            look-ahead-buffer look-ahead-buffer-size search-buffer search-buffer-size minimum-match-length position length
            emit emit-char
            ][
            clear result
            look-ahead-buffer: data
            look-ahead-buffer-size: 15
            search-buffer: data
            search-buffer-size: 4095
            minimum-match-length: 2
            emit: func [flag pos len][insert insert insert tail result flag pos len]
            emit-char: func [flag char][insert insert tail result flag char]

            while [not empty? look-ahead-buffer] [
                ;go backwards in search buffer to find longest match of the look-ahead buffer
                set [position length] find_longest_match search-buffer look-ahead-buffer search-buffer-size look-ahead-buffer-size
                either length > minimum-match-length [
                    emit 0 position length
                    shift_window 'look-ahead-buffer length
                ][
                    emit-char 1 first look-ahead-buffer
                    shift_window 'look-ahead-buffer 1
                ]
            ]
            
            probe result
            ; convert to compact binary!
            ; (note that the (un)compressed flag is reversed from standard since this simplifies decoding)
            unless to-block [
                bit-pos: 1 ; reset bit position
                bin-result: clear head bin-result
                while [not tail? result] [
                    either 0 = first result [
                        bits-set 0 1
                        bits-set second result 12
                        bits-set third result 4
                        result: skip result 3
                    ][
                        bits-set 1 1
                        bits-set second result 8
                        result: skip result 2
                    ]
                ]
                return copy head bin-result
            ]
            copy result
        ]

        decompress: func [
            "decompress LZSS encoded binary"
            data [block! string! binary!] "data to decompress"
            /local offset length out flag bytes
            ][
            if string? data [data: to-binary data]
            if binary? data [
                ;convert back to block
                ;(obviously we could convert directly to final binary but this is kind of "educational")
                bit-pos: 1 ; reset bit position
                bin-result: copy data
                out: copy []
                while [0 < length? bin-result][
                    flag: bits-get 1
                    insert tail out flag
                    either flag = 0 [
                        insert tail out bits-get 12
                        insert tail out bits-get 4
                    ][
                        insert tail out to-char bits-get 8
                    ]
                ]
                data: out
            ]
            probe data

            out: make binary! length? data
            while [not empty? data] [
                either 0 = first data [ 
                    offset: second data
                    length: third data
                    ;go reverse in previous output by offset characters and copy character wise for length symbols;
                    out: insert out bytes: copy/part skip out negate offset length
                    ; special case for duplicated bytes (encoding reaches look-ahead buffer)
                    if length > offset [
                        out: insert/dup out bytes (to-integer length / offset) - 1
                        out: insert out copy/part bytes mod length offset
                    ] 
                    data: skip data 3
                ][
                    out: insert out second data ; copy uncompressed character
                    data: skip data 2
                ]
            ]
            head out
        ]

    ] ; lzss
    
    lz78: context [

        result: copy []
        string-table: make hash! []

        add_table_entry: func [string][head insert tail string-table string]

        compress: func [
            "LZ78 compress a string"
            data [any-string!] "String to compress"
            /to-block "return abstract block"
            /local
            bytes byte bytes+byte index
            emit index_from_string
            ][
            clear result
            clear string-table
            emit: func [code char][head insert insert tail result code char]
            index_from_string: func [string /local pos][(pos: find/case string-table string) either pos [index? pos][0]]

            bytes: ""
            foreach byte data [
                bytes+byte: join bytes byte
                either find/case string-table bytes+byte [
                    bytes: bytes+byte
                ][
                    emit index_from_string bytes byte
                    add_table_entry bytes+byte
                    clear bytes
                ]
            ]
            emit index_from_string bytes ""

            probe result
            ; convert to compact binary!
            unless to-block [
                bit-pos: 1 ; reset bit position
                bin-result: clear head bin-result
                index: 1
                forskip result 2 [
                    bits-set first result to integer! 0.99999 + log-2 index ; this math calcs necessary number of bits
                    bits-set second result 8
                    index: index + 1
                ]
                return copy head bin-result
            ]

            copy result
        ]

        decompress: func [
            "decompress LZ78 encoded binary"
            data [block! binary!] "data to decompress"
            /local
            outstring index char out
            emit string_from_index
            ][
            if binary? data [
                ;convert back to block
                ;(obviously we could convert directly to final binary but this is kind of "educational")
                bit-pos: 1 ; reset bit position
                bin-result: copy data
                out: copy []
                insert tail out 0
                insert tail out to-char bits-get 8
                index: 2
                while [0 < length? bin-result][
                    insert tail out bits-get to integer! 0.99999 + log-2 index
                    insert tail out to-char bits-get 8
                    index: index + 1
                ]
                data: out
            ]
            probe data

            out: make binary! length? data
            emit: func [string][insert tail out string]
            string_from_index: func [index][either index > 0 [string-table/(index)][""]]

            clear string-table
            forskip data 2 [
                ;read pair of index and character from input
                index: first data
                char: second data
                outstring: join string_from_index index char
                emit outstring
                add_table_entry outstring
            ]
            out
        ]

    ] ; lz78

    lzw: context [; encoding and decoding algorithms translated from TIFF documentation

        clearcode: 256
        endofinformation: 257
        result: copy []
        string-table: make hash! []
        bit-len: 9
        
        initialize_table: does [
            either empty? string-table [
                clear head string-table
                for n 0 255 1 [insert tail string-table to-string to-char n]
                insert tail string-table clearcode
                insert tail string-table endofinformation
            ][
                string-table: head clear at string-table 259
            ]
            bit-len: 9
        ]

        compress: func [
            "LZW compresses a string series and returns it."
            data [any-string!] "Data to compress"
            /to-block "return abstract block"
            /local bytes byte bytes+byte out
            add_table_entry emit code_from_string
            ][
            result: clear head result

            bit-pos: 1 ; reset bit position
            bin-result: clear head bin-result

            initialize_table

            add_table_entry: func [string][
                insert tail string-table string
                switch length? string-table [
                    512  [bit-len: 10]
                    1024 [bit-len: 11]
                    2048 [bit-len: 12]
                    4096 [emit clearcode initialize_table]
                ]
            ]
            emit: func [code][
                head insert tail result either code > 255 [code][to-char code] ;remove or comment this if you want only binary
                unless to-block [bits-set code bit-len]
            ]
            code_from_string: func [string][(index? find/case string-table string) - 1] ; - 1 because index? is 1 based

            emit clearcode
            bytes: ""
            foreach byte data [
                bytes+byte: join bytes byte
                either find/case string-table bytes+byte [
                    bytes: bytes+byte
                ][
                    emit code_from_string to-string bytes
                    add_table_entry bytes+byte
                    bytes: byte
                ]
            ] ; end of for loop
            emit code_from_string to-string bytes
            emit endofinformation

            probe result
            unless to-block [return copy head bin-result]
            copy result
        ]

        decompress: func [
            "Decompress LZW compressed binary"
            data [block! binary!] "Data to decompress"
            /local code oldcode out
            add_table_entry emit is_in_table
            ][
            if binary? data [
                bit-pos: 1 ; reset bit position
                bin-result: copy data
                bit-len: 9
            ]

            out: make binary! length? data
            add_table_entry: func [string][
                insert tail string-table string
                switch length? string-table [
                    511  [bit-len: 10]
                    1023 [bit-len: 11]
                    2047 [bit-len: 12]
                ]
            ]
            emit: func [code][insert tail out code ]
            is_in_table: func [code][if code < 256 [code: to-string to-char code] any [find/case string-table code code < length? string-table]]
            get-code: func [][
                either block? data [
                    to-integer first+ data
                ][
                    bits-get bit-len
                ]
            ]

            while [(code: get-code) != endofinformation] [
                either code = clearcode [
                    initialize_table
                    code: get-code data
                    if code = endofinformation [break]
                    emit string-table/(code + 1)
                ][ ; end of clearcode case
                    either is_in_table code [
                        emit string-table/(code + 1)
                        add_table_entry join string-table/(oldcode + 1) first string-table/(code + 1)
                    ][
                        outstring: join string-table/(oldcode + 1) first string-table/(oldcode + 1)
                        emit outstring
                        add_table_entry outstring
                    ]
                ] ; end of not-clearcode case
                oldcode: code
            ] ; end of while loop
            out
        ]

    ]; lzw

]

; comment [
do [
    string: "abracadabradadadadadadaaaaaaaa"
    print "^/Huffman compress:"
    probe string
    probe stream: compression/huffman/compress/to-block string
    probe as-string compression/huffman/decompress stream
    print "^/RLE compress:"
    probe string
    probe stream: compression/rle/compress string
    probe as-string compression/rle/decompress stream
    print "^/LZ77 compress:"
    probe string
    probe stream: compression/lz77/compress string
    probe as-string compression/lz77/decompress stream
    print "^/LZSS compress:"
    probe string
    probe stream: compression/lzss/compress string
    probe as-string compression/lzss/decompress stream
    print "^/LZ78 compress:"
    probe string
    probe stream: compression/lz78/compress string
    probe as-string compression/lz78/decompress stream
    print "^/LZW compress:"
    probe string
    probe stream: compression/lzw/compress string
    probe as-string compression/lzw/decompress stream

    halt
]