LZ77

From WikiCoder
Revision as of 13:01, 16 July 2020 by Gim (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

LZ77 and LZ78 are simple lossless compression algorithms published by Lempel and Ziv. LZ77 is a sliding window algorithm, while LZ78 is an online dictionary algorithm, where dictionary is built during decompression.

LZ77 achieves compression, by replacing repeated patterns with references to earlier occurences in uncompressed data. The reference goes back from the current decompressor position and there's a limit how far backwards the reference can go (therefore sliding window). The reference consist of (length, distance) pair.

The original algorithm is generic one and allows any sizes of both window (distance) and length, but for clarity the implementation below uses 8-bit bytes. When compressing, the original algorithm always outputs triple (length, distance, char), where char is a character that is being output after the match. The implementation below will use zero-length as a marker, in that case tuple consiting of (0, char) will be output.

There are many algorithms that derive directly from LZ77, two related ones are LZSS and LZ4

Note: the implementation also assumes sizeof(char) = sizeof(uint8_t)

Headers and helpers

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const size_t WINDOW_SIZE = 256;
const size_t MATCH_LIMIT = 255;

size_t window_match(const uint8_t* window_ptr, size_t offset, const uint8_t* in_ptr, size_t num_in)
{
	size_t i = 0;
	for ( ; i < MATCH_LIMIT && i < num_in; ++i)
		if (window_ptr[(offset + i) % WINDOW_SIZE] != in_ptr[i])
			break;

	return i;
}

void output_buffer(const void* buffer_in, size_t size) {
	const uint8_t* buffer = (const uint8_t*)buffer_in;
	for (size_t i = 0; i < size; ++i) {
		if (buffer[i] < 32 && buffer[i] != '\r' && buffer[i] != '\n' && buffer[i] != '\t')
			printf("\\%d", buffer[i]);
		else
			printf("%c", buffer[i]);
	}
}

#define SAVE_IN_SLIDING_WINDOW(B) do { \
		sliding_window[window_position] = B; \
		window_position = (window_position + 1) % WINDOW_SIZE; \
	} while(0)

Encoding

The original algorithm, assumes the sliding window is initially positioned before input text and filled with zeroes.

size_t lz77_encode(uint8_t* encoded_data_ptr, int num_out, const uint8_t* in_ptr, int num_in)
{
	uint8_t sliding_window[WINDOW_SIZE];
	memset(sliding_window, 0, WINDOW_SIZE);
	size_t window_position = 0;

	uint8_t* out_ptr = encoded_data_ptr;
	for ( ; num_in > 0; --num_in) {
		// find longest match inside window, note: longest match might not be the best match
		size_t max_match_len = 0;
		size_t max_match_pos = 0;
		for (size_t current_start = 0; current_start < WINDOW_SIZE; ++current_start) {
			size_t match_len = window_match(sliding_window, window_position + current_start, in_ptr, num_in);
			if (match_len >= max_match_len) {
				max_match_pos = current_start;
				max_match_len = match_len;
			}
		}

		// encode literal
		if (0 == max_match_len) {
			*out_ptr++ = 0;

		} else {
			*out_ptr++ = max_match_len;
			*out_ptr++ = max_match_pos;

			for (size_t i = 0; i < max_match_len; ++i) {
				uint8_t input_byte = sliding_window[(window_position + max_match_pos) % WINDOW_SIZE];
				SAVE_IN_SLIDING_WINDOW(input_byte);
			}

			in_ptr += max_match_len;
			num_in -= max_match_len;
		}

		if (num_in > 0) {
			SAVE_IN_SLIDING_WINDOW(*in_ptr);
			*out_ptr++ = *in_ptr++;
		}
	}

	return out_ptr - encoded_data_ptr;
}

Decoding

Sliding window can be imagined as a space that is "left" to the output buffer

         sliding window |
   output buffer: abcdef
                        ^
                        +- current output position

when decoding more and more data, window moves along with output position

                       | sliding window |
   output buffer: abcdefghijklmnopqrstuv
                                        ^
                                        +- current output position
void lz77_decode(uint8_t* decoded_data, const uint8_t* encoded_data, size_t encoded_size)
{
	uint8_t sliding_window[WINDOW_SIZE];
	memset(sliding_window, 0, WINDOW_SIZE);
	size_t window_position = 0;

	uint8_t* out_ptr = decoded_data;
	while (encoded_size > 2) {
		size_t length = *encoded_data++;

		// copy literal to output
		if (!length) {			
			uint8_t input_byte = *encoded_data++;
			encoded_size -= 2;

			*out_ptr++ = input_byte;
			SAVE_IN_SLIDING_WINDOW(input_byte);
		} else {
			size_t offset = *encoded_data++;
			for (size_t i = 0; i < length; ++i) {
				uint8_t input_byte = sliding_window[(window_position + offset) % WINDOW_SIZE];
				*out_ptr++ = input_byte;
				SAVE_IN_SLIDING_WINDOW(input_byte);
			}

			encoded_size -= 2;
			
			if (encoded_size > 0) {
				uint8_t input_byte = *encoded_data++;
				--encoded_size;

				*out_ptr++ = input_byte;
				SAVE_IN_SLIDING_WINDOW(input_byte);
			}
		}
	}
}

Example Usage

const char in_data[] =
"\0\0I am Sam,\n"
"Sam I am.\n"
"That Sam-I-am! That Sam-I-am!\n"
"I do not like that Sam-I-am!\n"
"Do you like green eggs and ham?\n"
"I do not like them, Sam-I-am.\n"
"I do not like green eggs and ham.\n";

int main(void)
{
	size_t num_in = sizeof(in_data);
	size_t num_out = 2 * num_in;
	uint8_t* encoded_data = (uint8_t*)malloc(num_out);
	uint8_t* decoded_data = (uint8_t*)malloc(num_in);

	printf("Original size: %zu\n", num_in);
	printf("Original data >>>\n");
	output_buffer(in_data, num_in);
	printf("<<<\n");
	printf("\n");

	size_t encoded_size = lz77_encode(encoded_data, num_out, (const uint8_t*)in_data, num_in);
	printf("Encoded size: %zu\n", encoded_size);
	printf("Saved: %d %%\n", 100 - (int)(100 * encoded_size / num_in));


	lz77_decode(decoded_data, encoded_data, encoded_size);

	printf("Decoded data >>>\n");
	output_buffer(decoded_data, num_in);
	printf("<<<\n");

	free(encoded_data);
	free(decoded_data);

	return 0;
}

References

A Universal Algorithm for Sequential Data Compression - Ziv, Jacob; Lempel, Abraham