LZ78

From WikiCoder

LZ78 is dictionary-based loseless compression algorithm. It is a side-effect of discussing compression limits by Lempel and Ziv. The original algorithm is somewhat idealistic - it doesn't have any bounds on dictionary size, this might not be applicable in real world.

Probably better known variant of LZ78 is LZW.

Pseudo-code

Encoding procedure can be describe using following following pseudo-code (Word + Char denotes concatenation)

    initialize Word to an empty string ""
    initialize dictionary and add empty string Word
    
    for each Char in input string do
       if (Word + Char) is in the dictionary then
          Word = Word + Char
       else 
          output(pointer(Word), Char)
          add (Word + Char) to dictionary
          initialize Word to an empty string ""
       end
    end

Simple example

Example of encoding string "abbadabbaabaad".

Dictionary keeps pairs (reference, char), where reference is reference to existing entry in dictionary. Reference to 0 indicates "end".

Dictionary index input encoding (output of encoder)
0
1 a (0, a)
2 b (0, b)
3 ba (2, a)
4 d (0, d)
5 ab (1, b)
6 baa (3, a)
7 baad (6, d)

Of course word can be recovered by following references, ie:

(6, d) -> ((3, a), d) -> (((2, a), a), d) -> ((((0, b), a), a), d) = baad

Headers and helpers

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void output_buffer(const void* buffer_in, size_t size) {
	const uint8_t* buffer = (const uint8_t*)buffer_in;
	for (size_t i = 0; i < size; ++i) {
		if (buffer[i] < 32 && buffer[i] != '\r' && buffer[i] != '\n' && buffer[i] != '\t')
			printf("\\%d", buffer[i]);
		else
			printf("%c", buffer[i]);
	}
}

typedef struct dict_entry_ {
	uint32_t next;
	uint8_t chr;
	size_t length;
} dict_entry;


typedef struct dictionary_ {
	dict_entry* entries;
	size_t num_entries;
} dictionary;

void dictionary_init(dictionary* dict)
{
	dict->entries = calloc(256, sizeof(dict_entry));
	// reserve initial entry for empty element
	dict->entries[0].next = 0;
	dict->entries[0].chr = 0;
	dict->entries[0].length = 0;
	dict->num_entries = 1;
}

void dictionary_destroy(dictionary* dict)
{
	free(dict->entries);
	dict->entries = NULL;
}

int dictionary_has_match(dictionary* dict, size_t index, const uint8_t* buffer, size_t buffer_size)
{
	size_t len = dict->entries[index].length;
	const uint8_t* pEnd = buffer + len - 1;

	while (len > 0) {
		if (*pEnd != dict->entries[index].chr)
			return 0;

		index = dict->entries[index].next;
		--pEnd;
		--len;
	}

	return 1;
}

size_t dictionary_find_longest_match(dictionary* dict, const uint8_t* buffer, size_t buffer_size)
{
	// process entries in dictionary in reverse order
	size_t best_match_idx = 0;
	size_t best_match_length = 0;

	for (size_t num = dict->num_entries; num > 0; --num) {
		size_t i = num - 1;

		if (dict->entries[i].length >= buffer_size)
			continue;


		if (dict->entries[i].length <= best_match_length)
			continue;

		if (dictionary_has_match(dict, i, buffer, buffer_size)) {
			best_match_idx = i;
			best_match_length = dict->entries[i].length;
		}
	}

	return best_match_idx;
}

void dictionary_add(dictionary* dict, uint32_t next, uint8_t chr)
{
	size_t num = dict->num_entries;
	dict->entries[num].next = next;
	dict->entries[num].chr = chr;
	dict->entries[num].length = dict->entries[next].length + 1;
	++dict->num_entries;
}

Encoding

size_t dictionary_add_buffer(dictionary* dict, const uint8_t* buffer, size_t buffer_size)
{
	if (dict->num_entries == 256) {
		printf("encoder supports only 256 entries in dictionary");
		return buffer_size;	
	}

	// find longest string that is already in the dictionary
	size_t ref = dictionary_find_longest_match(dict, buffer, buffer_size);
	if (! ref) {
		dictionary_add(dict, 0, *buffer);
		return 1;
	}
	
	size_t length = dict->entries[ref].length;
	buffer += length;
	dictionary_add(dict, ref, *buffer);
	return length + 1;
}


size_t lz78_encode(uint8_t* encoded_data_ptr, size_t num_out, const uint8_t* in_ptr, size_t num_in)
{
	dictionary dict;
	dictionary_init(&dict);

	uint8_t* out_ptr = encoded_data_ptr;
	while (num_in > 0) {
		size_t match = dictionary_add_buffer(&dict, in_ptr, num_in);

		// save last entry, cast next to single byte
		*out_ptr++ = (uint8_t)dict.entries[dict.num_entries - 1].next;
		*out_ptr++ = dict.entries[dict.num_entries - 1].chr;

		in_ptr += match;
		num_in -= match;
	}

	dictionary_destroy(&dict);

	return out_ptr - encoded_data_ptr;
}

Decoding

void lz78_decode(uint8_t* decoded_data, const uint8_t* encoded_data, size_t encoded_size)
{
	dictionary dict;
	dictionary_init(&dict);

	// read in whole dictionary
	while (encoded_size > 1) {
		uint32_t next = *encoded_data++;
		uint8_t ptr = *encoded_data++;
		dictionary_add(&dict, next, ptr);

		encoded_size -= 2;
	}

	// could be done in one pass, but separated for visibility
	uint8_t temp[256];
	for (size_t i = 1; i < dict.num_entries; ++i) {
		size_t id = i;
		size_t run = 0;
		// decode string in dictionary trie into temporary space
		while (id > 0) {
			temp[run++] = dict.entries[id].chr;
			id = dict.entries[id].next;
		}

		// copy reversed from temporary space to output buffer
		while (run)
			*decoded_data++ = temp[--run];
	}
}

Example

const char in_data[] = 
"I am Sam,\n"
"Sam I am.\n"
"That Sam-I-am! That Sam-I-am!\n"
"I do not like that Sam-I-am!\n"
"Do you like green eggs and ham?\n"
"I do not like them, Sam-I-am.\n"
"I do not like green eggs and ham.\n";

int main(void)
{
	size_t num_in = sizeof(in_data) - 1;
	size_t num_out = 2 * num_in;
	uint8_t* encoded_data = (uint8_t*)malloc(num_out);
	uint8_t* decoded_data = (uint8_t*)malloc(num_in);

	printf("Original size: %zu\n", num_in);
	printf("Original data >>>\n");
	output_buffer(in_data, num_in);
	printf("<<<\n");
	printf("\n");

	size_t encoded_size = lz78_encode(encoded_data, num_out, (const uint8_t*)in_data, num_in);
	printf("Encoded size: %zu\n", encoded_size);
	printf("Saved: %d %%\n", 100 - (int)(100 * encoded_size / num_in));

	lz78_decode(decoded_data, encoded_data, encoded_size);

	printf("Decoded data >>>\n");
	output_buffer(decoded_data, num_in);
	printf("<<<\n");

	free(encoded_data);
	free(decoded_data);

	return 0;
}

/* Output:
 * Original size: 175
 * Encoded size: 170
 * Saved: 3 %
 *
 * because of how LZ-78 builds the dictionary (and how the dictionary itself is encoded),
 * the gain is close to non-existing.
 */