Run Length Encoding

From WikiCoder

Run Length Encoding (RLE) is a very simple compression method that works well on data with lots of repeating characters. Namely, it transforms the data from ABBCCDDDEEF to 1xA,2xB,2xC,3xD,2xE,1xF - thus naturally if your data has long runs of repeating data - like AAAAAAAAAAAAAAAAB that can be compressed into 16xA,1B - which is significantly less data. However, It has a worst case of doubling the total amount of data. Such as when your data has no repeating characters like ABCDEF which would compress to 1xA,1xB,1xC,1xD,1xF which is exactly double the amount of data. When used properly on specific kinds of data it can be a great technique though as it is extremely fast to encode and decode.

Encoding

 // out_ptr needs to be at least num_in * 2 size 
 // as that is worst case output size for any possible input
 int rle_encode(void *out_ptr, int num_out, const void *in_ptr, int num_in)
 {
   // Verify that the out_ptr length is at least num_in * 2, as that is the worst case output size
   if(num_out < num_in * 2) {
      return -1;
   }
 
   // Convert the input pointers to unsigned char*'s so we can do byte pointer math on them
   unsigned char *out = (unsigned char *)out_ptr;
   unsigned char *in = (unsigned char *)in_ptr;
   int num_repeats = 0;
   unsigned char value = *in++;
   unsigned char last_value = value;
 
   for( ; num_in > 0; value=*in++,--num_in ) {
 
     // If the value is the same as the last value, count it as a repeat.
     if ( value == last_value ) {
       ++num_repeats;
 
       // We can have a maximum repeat of 256, so we need to cut off
       // the sequence at 256 and start a new one.
       if ( num_repeats > 256 ) {
 
         // A repeat count of 0 can never happen. Valid repeat counts are
         // 1 .. 256, but we need to store this in a byte so we subtract
         // 1 to get 0 .. 255.
         out[0] = 256 - 1;
         out[1] = value;
         out += 2;
         num_repeats = 1;
       }
       continue;
     }
 
     // Output a code word of repeat count and value to repeat
     out[0] = (unsigned char)(num_repeats - 1);
     out[1] = last_value;
     out += 2;
 
     // Reset the repeat count to 1
     num_repeats = 1;
     last_value = value;
   }
 
   // Store the last value or repeat sequence.
   out[0] = (unsigned char)(num_repeats - 1);
   out[1] = last_value;
   out += 2;
 
   // Return how many bytes we wrote
   return out - (unsigned char*)out_ptr;
 }

Decoding

 int rle_decode(void *out_ptr, int out_size, const void *in_ptr, int num_in) 
 {
   // Convert the input pointers to unsigned char*'s so we can do byte pointer math on them
   unsigned char *in = (unsigned char*)in_ptr;
   unsigned char *out = (unsigned char*)out_ptr;
   unsigned char *out_end = out + out_size;
 
   // For each input byte
   for(; num_in>0; num_in-=2) {
     // Read the count, value pair from the input
     unsigned int num_repeats = in[0] + 1;
     unsigned char value = in[1];
     in += 2;
 
     // Check to make sure we aren't overflowing our output buffer
     if(out + num_repeats > out_end) {
         break;
     }
 
     // Set the value num_repeats number of times in the output.
     memset(out, value, num_repeats);
     out += num_repeats;
   }
 
   // Return how many bytes we wrote
   return out - (unsigned char*)out_ptr;
 }

Example Usage

 const char *in_data =                                                                                                                                                                                                               
 "WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW\
 WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW\
 WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW\
 WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWW\
 WWWBWWWWWWWWWWWWWWBCD";
 
 int main(void)
 {
  int num_in = strlen(in_data);
  int num_out = num_in*2 + 1;
  char *encoded_data = (char*)malloc(num_out);
  // +1 for the terminating 0 on the string
  char *out_data = (char*)calloc(num_out, 1);
 
  printf("Original data:%s\n", in_data);
 
  int encoded_size = rle_encode(encoded_data, num_out, in_data, num_in);
  // fwrite(encoded_data, 1, encoded_size, stdout); // Encoded data
 
  if(encoded_size == -1) {
      printf("Error encoding data: size of out_data is too small");
      return 1;
  }
 
  rle_decode(out_data, encoded_data, encoded_size);
  printf("Decoded data: %s\n", out_data);
 
  return 0;
 }

References

This code on godbolt