Lempel–Ziv–Storer–Szymanski

From WikiCoder

Lempel–Ziv–Storer–Szymanski (LZSS) is a lossless data compression algorithm, a direct descendent of of LZ77. It was created in 1982 by James Storer and Thomas Szymanski.

LZSS can be considered in terms of improvements over LZ77.

  1. LZSS is usually used with text, so sliding window is initialized to spaces instead of zeroes
  2. LZSS quit from using triplet in favor of (length, offset pair) and distinguishing literal from encoded pair via single bit
  3. LZSS uses break even point, minimum length of a match to be worth replacting with encoded pair
  4. in implementations, bits that are used to distinguish pair vs literal are grouped together for easier/faster access.
  5. there were further advancements, that huffman encoded literals and/or code pairs, that resulted in compressors known as LZARI, LHarc and LHA, curious reader will find more details in "History of Data Compression in Japan"[1]

The implementation below is similar to classic written by Haruhiko Okumura. It uses ring-buffer to maintain sliding window. The window size is 4k (so offset can be encoded on 12 bits). The length is encoded on 4bits, but because break even is equal to 2, possible length values are in range 2-18.

Headers and helpers

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const size_t WINDOW_SIZE = 4096;
const size_t MATCH_LIMIT = 18;
const size_t BREAK_EVEN = 2;

size_t window_match(const uint8_t* window_ptr, size_t offset, const uint8_t* in_ptr, size_t num_in)
{
	size_t i = 0;
	for ( ; i < MATCH_LIMIT && i < num_in; ++i)
		if (window_ptr[(offset + i) % WINDOW_SIZE] != in_ptr[i])
			break;

	return i;
}

void outputBuffer(const void* buffer_in, size_t size) {
	const uint8_t* buffer = (const uint8_t*)buffer_in;
	for (size_t i = 0; i < size; ++i) {
		if (buffer[i] < 32 && buffer[i] != '\r' && buffer[i] != '\n' && buffer[i] != '\t')
			printf("\\%d", buffer[i]);
		else
			printf("%c", buffer[i]);
	}
}

#define ADVANCE_SLIDING_WINDOW window_position = (window_position + 1) % WINDOW_SIZE
#define SAVE_IN_SLIDING_WINDOW(B) do { \
		sliding_window[window_position] = B; \
		ADVANCE_SLIDING_WINDOW; \
	} while(0)

Encoding

For clarity, the implementation below uses brute-force search inside the sliding window. Various optimizations are possible (indexing letters, using match tree, using Rabin-Karp hashing to speed up search).

size_t lzss_encode(uint8_t* encoded_data_ptr, size_t num_out, const uint8_t* in_ptr, size_t num_in)
{
	uint8_t sliding_window[WINDOW_SIZE];
	memset(sliding_window, ' ', WINDOW_SIZE);
	size_t window_position = 0;

	// temporary space for holding encoded flags and pairs, 0th element is reserved for flags
	uint8_t code_buf[17];
	code_buf[0] = 0;
	uint8_t flag_mask = 1;
	size_t code_buf_position = 1;

	// fill in sliding window
	for (size_t i = 0; i < MATCH_LIMIT && i < num_in; ++i)
		SAVE_IN_SLIDING_WINDOW(in_ptr[i]);
		
	uint8_t* out_ptr = encoded_data_ptr;
	while (num_in > 0) {
		// find longest match inside window, note: skip last MATCH_LIMIT bytes to work adjust for initial fill
		size_t max_match_len = 0;
		size_t max_match_pos = 0;
		for (size_t current_start = 0; current_start < WINDOW_SIZE - MATCH_LIMIT; ++current_start) {
			size_t match_len = window_match(sliding_window, window_position + current_start, in_ptr, num_in);
			if (match_len >= max_match_len) {
				// adjust position due to initial fill, so that decoder can get correct references
				max_match_pos = (MATCH_LIMIT + current_start) % WINDOW_SIZE;
				max_match_len = match_len;
			}
		}
		
		// whole code_buf is ready, output it to stream
		if (!flag_mask) {
			size_t copied_code_size = num_out > code_buf_position ? code_buf_position : num_out;
			memcpy(out_ptr, code_buf, copied_code_size);
			out_ptr += copied_code_size;
			num_out -= copied_code_size;

			code_buf[0] = 0;
			code_buf_position = 1;
			flag_mask = 1;
		}

		// encode literal
		if (max_match_len <= BREAK_EVEN) {
			code_buf[0] |= flag_mask;
			code_buf[code_buf_position++] = *in_ptr;
			
			max_match_len = 1;

		// encode pair
		} else {
			// append 0 bit to code_buf[0], so just skip it
			code_buf[code_buf_position++] = (uint8_t)max_match_pos;
			code_buf[code_buf_position++] = (0xf0 & (max_match_pos >> 4)) | (max_match_len - (BREAK_EVEN + 1));
		}
		flag_mask <<= 1;

		// append new bytes to sliding window
		for (size_t i = 0; i < max_match_len; ++i) {
			if (num_in > MATCH_LIMIT)
				SAVE_IN_SLIDING_WINDOW(*(in_ptr + MATCH_LIMIT));
			else
				ADVANCE_SLIDING_WINDOW;

			--num_in;
			++in_ptr;
		}
	}

	// output leftovers
	size_t copied_code_size = num_out > code_buf_position ? code_buf_position : num_out;
	memcpy(out_ptr, code_buf, copied_code_size);
	out_ptr += copied_code_size;

	return out_ptr - encoded_data_ptr;
}

Decoding

int lzss_decode(uint8_t* decoded_data, const uint8_t* encoded_data, size_t encoded_size)
{
	// fill sliding window with space
	uint8_t sliding_window[WINDOW_SIZE];
	memset(sliding_window, ' ', WINDOW_SIZE);
	size_t window_position = 0;

	uint16_t flags = 0;
	while (encoded_size > 0) {
		// read flags if needed
		if (! (flags & 0x100)) {
			flags = 0xFF00 | *encoded_data++;
			--encoded_size;
			continue;
		}

		// process literal
		if (flags & 1) {
			uint8_t input_byte = *encoded_data++;
			--encoded_size;
			*decoded_data++ = input_byte;
			SAVE_IN_SLIDING_WINDOW(input_byte);	
		} else {
			uint8_t first_byte = *encoded_data++;
			--encoded_size;

			if (!encoded_size) {
				// missing input bytes
				return 1;
			}

			uint8_t second_byte = *encoded_data++;
			--encoded_size;

			uint16_t offset = ((second_byte & 0xf0) << 4) | first_byte;
			uint8_t length = (second_byte & 0xf) + BREAK_EVEN;

			for (size_t i = 0; i <= length; ++i) {
				// note: window_position is updated via SAVE at the end of the loop
				uint8_t input_byte = sliding_window[(window_position + offset) % WINDOW_SIZE];
				*decoded_data++ = input_byte;

				SAVE_IN_SLIDING_WINDOW(input_byte);
			}
		}

		flags >>= 1;
	}

	return 0;
}

Example Usage

const char in_data[] =
"I am Sam,\n"
"Sam I am.\n"
"That Sam-I-am! That Sam-I-am!\n"
"I do not like that Sam-I-am!\n"
"Do you like green eggs and ham?\n"
"I do not like them, Sam-I-am.\n"
"I do not like green eggs and ham.\n";

int main(void)
{
	size_t num_in = sizeof(in_data);
	size_t num_out = 2 * num_in;
	uint8_t* encoded_data = (uint8_t*)malloc(num_out * 9 / 8);
	uint8_t* decoded_data = (uint8_t*)malloc(num_in);

	printf("Original size: %zu\n", num_in);
	printf("Original data >>>\n");
	outputBuffer(in_data, num_in);
	printf("<<<\n");
	printf("\n");

	size_t encoded_size = lzss_encode(encoded_data, num_out, (const uint8_t*)in_data, num_in);
	printf("Encoded size: %zu\n", encoded_size);
	printf("Saved: %zu %%\n", 100 - 100 * encoded_size / num_in);

	if (lzss_decode(decoded_data, encoded_data, encoded_size)) {
		puts("error when decoding stream");
		return 1;
	}

	printf("Decoded data >>>\n");
	outputBuffer(decoded_data, num_in);
	printf("<<<\n");

	free(decoded_data);
	free(encoded_data);

	return 0;
}

Example encoding

Note, that for clarity, the offset in pairs are shown relative to beginning window buffer. Note, how second pair uses 4095 as an offset, because it uses space, that the window was initially filled with, when referencing 5 byte string ␣I␣am

I am Sam,
(5, 3)(4095, 5).
That (4, 4)-I-am! (20, 14)
I do not like t(36, 14)
Do you(58, 6)green eggs and ham?
(49, 17)em,(68, 9).
(110, 15) (91, 18).

References