Lempel–Ziv–Welch

From WikiCoder

Lempel-Ziv-Welch (LZW) is descendent of LZ78. It's been made infamous by a Unisys patent trolling usage in GIF file format. All patents expired in 2004. Nobody knows how to pronounce GIF anyway.

LZW is a series of improvements over LZ78:

  • it initialize the dictionary with all one-byte values (so first 256 entries in dictionary correspond to bytes)
  • it completly removes pairs, outputing only proper indexes (note that index into a dictionary at the same time defines match lenght)
  • initial implementation was using fixed 12-bit codes, later variable-width codes were introduced
  • additional special codes were introduce to reset the dictionary and mark end of data

Encoding

struct lzw {
    int bitcnt;
    int bits, off;
};
static unsigned char* lzw_write(unsigned char *out, struct lzw *s, int code)
{
    s->bits |= code << s->off;
    s->off += s->bitcnt;
    while (s->off >= 8) {
        *out++ = (unsigned char)(s->bits & 0xFF);
        s->bits >>= 8;
        s->off -= 8;
    } return out;
}
int lzw_encode(unsigned char *out, const unsigned char *in, int size)
{
    #define LZW_HASH_SIZE 5003
    int htbl[LZW_HASH_SIZE]; /* {{sym,entry},...} */
    short codetbl[LZW_HASH_SIZE]; /* {entry,...} */
    short free_ent = 0x102; /* 0x100 reset dict, 0x101 eof */
    int maxcode = 511;

    unsigned char *o = out;
    int ent = *in++;
    struct lzw st = {0};
    st.bitcnt=9;

    o = lzw_write(o, &st, 0x100);
    memset(htbl, 0xFF, sizeof(htbl));
    rn: while (--size) {
        int c = *in++;
        int code = (c << 12) + ent; /* {sym,entry} */
        int key = (c << 4) ^ ent; /* hash */
        while (htbl[key] >= 0) {
            if (htbl[key] == code) {
                ent = codetbl[key];
                goto rn;
            }
            ++key;
            key = key >= LZW_HASH_SIZE ? key - LZW_HASH_SIZE : key;
        } o = lzw_write(o, &st, ent);
        ent = c;

        if (free_ent < 4096) {
            if(free_ent > maxcode) {
                ++st.bitcnt;
                if (st.bitcnt == 12)
                    maxcode = 4096;
                else maxcode = (1 << st.bitcnt)-1;
            }
            codetbl[key] = free_ent++;
            htbl[key] = code;
        } else {
            o = lzw_write(o, &st, 0x100);
            memset(htbl, 0xFF, sizeof(htbl));
            free_ent = 0x102;
            maxcode = 511;
            st.bitcnt = 9;
        }
    }
    o = lzw_write(o, &st, ent);
    o = lzw_write(o, &st, 0x101);
    o = lzw_write(o, &st, 0);
    return (int)(o-out);
}

Decoding

struct lzw {
    int bitcnt;
    int bits, off;
};
static const unsigned char* lzw_read(const unsigned char *in, struct lzw *s, int *code)
{
    if (s->off < s->bitcnt) {
        s->bits |= ((*in++) << s->off);
        s->bits |= ((*in++) << (s->off+8));
        s->off += 16;
    }
    *code = s->bits & ((1 << s->bitcnt)-1);
    s->bits >>= s->bitcnt;
    s->off -= s->bitcnt;
    return in;
}
int lzw_decode( unsigned char *out, const unsigned char *in, int size)
{
    const unsigned char *e = in + size, *o = out;
    int dict[LZW_HASH_SIZE];
    short free_ent = 0x102;
    int maxcode = 511;
    int next = 0, last = 0;

    struct lzw st = {0};
    st.bitcnt = 9;

    in = lzw_read(in, &st, &last);
    assert(last == 0x100); /* skip */
    in = lzw_read(in, &st, &last);
   
    while (in <= e) {
        int code, c, ent;
        if (last <= 0xFF) {
            *out++ = (unsigned char)last;
        } else {
            /* unpack dict-entry */
            int idx = last, cnt = 1;
            assert(free_ent > idx);
            while (idx > 0x101) {
                code = dict[idx];
                c = (code >> 12);
                *out++ = (unsigned char)c;
                idx = code & 0xFFF;
                cnt++;
            } *out++ = (unsigned char)idx;

            /* reverse output */
            {unsigned char *s = out - cnt;
            unsigned char *d = out - 1;
            cnt >>= 1;
            while (cnt--) {
                unsigned char t = *s; /* swap */
                *s++ = *d; *d-- = t;
            }}
        }
        /* grow bit width */
        if (free_ent > maxcode) {
            ++st.bitcnt;
            if(st.bitcnt == 12)
                maxcode = 4096;
            else maxcode = (1<<st.bitcnt)-1;
        }
        /* read next entry */
        in = lzw_read(in, &st, &next);
        if (next == 0x100) {
            maxcode = 511;
            st.bitcnt = 9;
            free_ent = 0x102;
            in = lzw_read(in, &st, &last);
            continue;
        } else if (next == 0x101) {
            break;
        }
        /* find first symbol */
        ent = (next < free_ent) ? next: last;
        while (ent > 0x101) {
            code = dict[ent];
            ent = code & 0xFFF;
        }
        /* add new entry into dict */
        code = (ent << 12) + last;
        assert(free_ent < LZW_HASH_SIZE);
        dict[free_ent++] = code;
        last = next;
    }
    return (int)(out-o);
}

References