LZ78
From WikiCoder
LZ78 is dictionary-based loseless compression algorithm. It is a side-effect of discussing compression limits by Lempel and Ziv. The original algorithm is somewhat idealistic - it doesn't have any bounds on dictionary size, this might not be applicable in real world.
Probably better known variant of LZ78 is LZW.
Pseudo-code
Encoding procedure can be describe using following following pseudo-code (Word + Char denotes concatenation)
initialize Word to an empty string "" initialize dictionary and add empty string Word for each Char in input string do if (Word + Char) is in the dictionary then Word = Word + Char else output(pointer(Word), Char) add (Word + Char) to dictionary initialize Word to an empty string "" end end
Simple example
Example of encoding string "abbadabbaabaad".
Dictionary keeps pairs (reference, char), where reference is reference to existing entry in dictionary. Reference to 0 indicates "end".
Dictionary index | input | encoding (output of encoder) |
---|---|---|
0 | ∅ | ∅ |
1 | a | (0, a) |
2 | b | (0, b) |
3 | ba | (2, a) |
4 | d | (0, d) |
5 | ab | (1, b) |
6 | baa | (3, a) |
7 | baad | (6, d) |
Of course word can be recovered by following references, ie:
(6, d) -> ((3, a), d) -> (((2, a), a), d) -> ((((0, b), a), a), d) = baad
Headers and helpers
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void output_buffer(const void* buffer_in, size_t size) {
const uint8_t* buffer = (const uint8_t*)buffer_in;
for (size_t i = 0; i < size; ++i) {
if (buffer[i] < 32 && buffer[i] != '\r' && buffer[i] != '\n' && buffer[i] != '\t')
printf("\\%d", buffer[i]);
else
printf("%c", buffer[i]);
}
}
typedef struct dict_entry_ {
uint32_t next;
uint8_t chr;
size_t length;
} dict_entry;
typedef struct dictionary_ {
dict_entry* entries;
size_t num_entries;
} dictionary;
void dictionary_init(dictionary* dict)
{
dict->entries = calloc(256, sizeof(dict_entry));
// reserve initial entry for empty element
dict->entries[0].next = 0;
dict->entries[0].chr = 0;
dict->entries[0].length = 0;
dict->num_entries = 1;
}
void dictionary_destroy(dictionary* dict)
{
free(dict->entries);
dict->entries = NULL;
}
int dictionary_has_match(dictionary* dict, size_t index, const uint8_t* buffer, size_t buffer_size)
{
size_t len = dict->entries[index].length;
const uint8_t* pEnd = buffer + len - 1;
while (len > 0) {
if (*pEnd != dict->entries[index].chr)
return 0;
index = dict->entries[index].next;
--pEnd;
--len;
}
return 1;
}
size_t dictionary_find_longest_match(dictionary* dict, const uint8_t* buffer, size_t buffer_size)
{
// process entries in dictionary in reverse order
size_t best_match_idx = 0;
size_t best_match_length = 0;
for (size_t num = dict->num_entries; num > 0; --num) {
size_t i = num - 1;
if (dict->entries[i].length >= buffer_size)
continue;
if (dict->entries[i].length <= best_match_length)
continue;
if (dictionary_has_match(dict, i, buffer, buffer_size)) {
best_match_idx = i;
best_match_length = dict->entries[i].length;
}
}
return best_match_idx;
}
void dictionary_add(dictionary* dict, uint32_t next, uint8_t chr)
{
size_t num = dict->num_entries;
dict->entries[num].next = next;
dict->entries[num].chr = chr;
dict->entries[num].length = dict->entries[next].length + 1;
++dict->num_entries;
}
Encoding
size_t dictionary_add_buffer(dictionary* dict, const uint8_t* buffer, size_t buffer_size)
{
if (dict->num_entries == 256) {
printf("encoder supports only 256 entries in dictionary");
return buffer_size;
}
// find longest string that is already in the dictionary
size_t ref = dictionary_find_longest_match(dict, buffer, buffer_size);
if (! ref) {
dictionary_add(dict, 0, *buffer);
return 1;
}
size_t length = dict->entries[ref].length;
buffer += length;
dictionary_add(dict, ref, *buffer);
return length + 1;
}
size_t lz78_encode(uint8_t* encoded_data_ptr, size_t num_out, const uint8_t* in_ptr, size_t num_in)
{
dictionary dict;
dictionary_init(&dict);
uint8_t* out_ptr = encoded_data_ptr;
while (num_in > 0) {
size_t match = dictionary_add_buffer(&dict, in_ptr, num_in);
// save last entry, cast next to single byte
*out_ptr++ = (uint8_t)dict.entries[dict.num_entries - 1].next;
*out_ptr++ = dict.entries[dict.num_entries - 1].chr;
in_ptr += match;
num_in -= match;
}
dictionary_destroy(&dict);
return out_ptr - encoded_data_ptr;
}
Decoding
void lz78_decode(uint8_t* decoded_data, const uint8_t* encoded_data, size_t encoded_size)
{
dictionary dict;
dictionary_init(&dict);
// read in whole dictionary
while (encoded_size > 1) {
uint32_t next = *encoded_data++;
uint8_t ptr = *encoded_data++;
dictionary_add(&dict, next, ptr);
encoded_size -= 2;
}
// could be done in one pass, but separated for visibility
uint8_t temp[256];
for (size_t i = 1; i < dict.num_entries; ++i) {
size_t id = i;
size_t run = 0;
// decode string in dictionary trie into temporary space
while (id > 0) {
temp[run++] = dict.entries[id].chr;
id = dict.entries[id].next;
}
// copy reversed from temporary space to output buffer
while (run)
*decoded_data++ = temp[--run];
}
}
Example
const char in_data[] =
"I am Sam,\n"
"Sam I am.\n"
"That Sam-I-am! That Sam-I-am!\n"
"I do not like that Sam-I-am!\n"
"Do you like green eggs and ham?\n"
"I do not like them, Sam-I-am.\n"
"I do not like green eggs and ham.\n";
int main(void)
{
size_t num_in = sizeof(in_data) - 1;
size_t num_out = 2 * num_in;
uint8_t* encoded_data = (uint8_t*)malloc(num_out);
uint8_t* decoded_data = (uint8_t*)malloc(num_in);
printf("Original size: %zu\n", num_in);
printf("Original data >>>\n");
output_buffer(in_data, num_in);
printf("<<<\n");
printf("\n");
size_t encoded_size = lz78_encode(encoded_data, num_out, (const uint8_t*)in_data, num_in);
printf("Encoded size: %zu\n", encoded_size);
printf("Saved: %d %%\n", 100 - (int)(100 * encoded_size / num_in));
lz78_decode(decoded_data, encoded_data, encoded_size);
printf("Decoded data >>>\n");
output_buffer(decoded_data, num_in);
printf("<<<\n");
free(encoded_data);
free(decoded_data);
return 0;
}
/* Output:
* Original size: 175
* Encoded size: 170
* Saved: 3 %
*
* because of how LZ-78 builds the dictionary (and how the dictionary itself is encoded),
* the gain is close to non-existing.
*/