LZW Compressor
 All Classes Files Functions Typedefs
lzw_v2.cpp File Reference

LZW file compressor. More...

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <functional>
#include <ios>
#include <iostream>
#include <istream>
#include <limits>
#include <memory>
#include <ostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

Go to the source code of this file.

Classes

struct  HashCharVector
 Helper hash functor, to be used on character containers. More...
 

Typedefs

using CodeType = std::uint16_t
 Type used to store and retrieve codes.
 

Functions

std::vector< char > operator+ (std::vector< char > vc, char c)
 Helper operator intended to simplify code. More...
 
void compress (std::istream &is, std::ostream &os)
 Compresses the contents of is and writes the result to os. More...
 
void decompress (std::istream &is, std::ostream &os)
 Decompresses the contents of is and writes the result to os. More...
 
void print_usage (const std::string &s="", bool su=true)
 Prints usage information and a custom error message. More...
 
int main (int argc, char *argv[])
 Actual program entry point. More...
 

Variables

const CodeType globals::dms {std::numeric_limits<CodeType>::max()}
 Dictionary Maximum Size (when reached, the dictionary will be reset)
 

Detailed Description

LZW file compressor.

Author
Julius Pettersson
Version
2

This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor. It uses the simpler fixed-width code compression method. It was written with Doxygen comments.

See Also
http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
http://marknelson.us/2011/11/08/lzw-revisited/
http://www.cs.duke.edu/csed/curious/compression/lzw.html
http://warp.povusers.org/EfficientLZW/index.html
http://en.cppreference.com/
http://www.doxygen.org/

Definition in file lzw_v2.cpp.

Function Documentation

void compress ( std::istream &  is,
std::ostream &  os 
)

Compresses the contents of is and writes the result to os.

Parameters
[in]isinput stream
[out]osoutput stream

Definition at line 79 of file lzw_v2.cpp.

Referenced by main().

80 {
81  std::unordered_map<std::vector<char>, CodeType, HashCharVector> dictionary;
82 
83  // "named" lambda function, used to reset the dictionary to its initial contents
84  const auto reset_dictionary = [&dictionary] {
85  dictionary.clear();
86 
87  const long int minc = std::numeric_limits<char>::min();
88  const long int maxc = std::numeric_limits<char>::max();
89 
90  for (long int c = minc; c <= maxc; ++c)
91  {
92  // to prevent Undefined Behavior, resulting from reading and modifying
93  // the dictionary object at the same time
94  const CodeType dictionary_size = dictionary.size();
95 
96  dictionary[{static_cast<char> (c)}] = dictionary_size;
97  }
98  };
99 
100  reset_dictionary();
101 
102  std::vector<char> s; // String
103  char c;
104 
105  while (is.get(c))
106  {
107  // dictionary's maximum size was reached
108  if (dictionary.size() == globals::dms)
109  reset_dictionary();
110 
111  s.push_back(c);
112 
113  if (dictionary.count(s) == 0)
114  {
115  // to prevent Undefined Behavior, resulting from reading and modifying
116  // the dictionary object at the same time
117  const CodeType dictionary_size = dictionary.size();
118 
119  dictionary[s] = dictionary_size;
120  s.pop_back();
121  os.write(reinterpret_cast<const char *> (&dictionary.at(s)), sizeof (CodeType));
122  s = {c};
123  }
124  }
125 
126  if (!s.empty())
127  os.write(reinterpret_cast<const char *> (&dictionary.at(s)), sizeof (CodeType));
128 }
void decompress ( std::istream &  is,
std::ostream &  os 
)

Decompresses the contents of is and writes the result to os.

Parameters
[in]isinput stream
[out]osoutput stream

Definition at line 135 of file lzw_v2.cpp.

Referenced by main().

136 {
137  std::vector<std::vector<char>> dictionary;
138 
139  // "named" lambda function, used to reset the dictionary to its initial contents
140  const auto reset_dictionary = [&dictionary] {
141  dictionary.clear();
142  dictionary.reserve(globals::dms);
143 
144  const long int minc = std::numeric_limits<char>::min();
145  const long int maxc = std::numeric_limits<char>::max();
146 
147  for (long int c = minc; c <= maxc; ++c)
148  dictionary.push_back({static_cast<char> (c)});
149  };
150 
151  reset_dictionary();
152 
153  std::vector<char> s; // String
154  CodeType k; // Key
155 
156  while (is.read(reinterpret_cast<char *> (&k), sizeof (CodeType)))
157  {
158  // dictionary's maximum size was reached
159  if (dictionary.size() == globals::dms)
160  reset_dictionary();
161 
162  if (k > dictionary.size())
163  throw std::runtime_error("invalid compressed code");
164 
165  if (k == dictionary.size())
166  dictionary.push_back(s + s.front());
167  else
168  if (!s.empty())
169  dictionary.push_back(s + dictionary.at(k).front());
170 
171  os.write(&dictionary.at(k).front(), dictionary.at(k).size());
172  s = dictionary.at(k);
173  }
174 
175  if (!is.eof() || is.gcount() != 0)
176  throw std::runtime_error("corrupted compressed file");
177 }
int main ( int  argc,
char *  argv[] 
)

Actual program entry point.

Parameters
argcnumber of command line arguments
[in]argvarray of command line arguments
Return values
EXIT_FAILUREfor failed operation
EXIT_SUCCESSfor successful operation

Definition at line 210 of file lzw_v2.cpp.

References compress(), decompress(), and print_usage().

211 {
212  if (argc != 4)
213  {
214  print_usage("Wrong number of arguments.");
215  return EXIT_FAILURE;
216  }
217 
218  enum class Mode {
219  Compress,
220  Decompress
221  };
222 
223  Mode m;
224 
225  if (std::string(argv[1]) == "-c")
226  m = Mode::Compress;
227  else
228  if (std::string(argv[1]) == "-d")
229  m = Mode::Decompress;
230  else
231  {
232  print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
233  return EXIT_FAILURE;
234  }
235 
236  const std::size_t buffer_size {1024 * 1024};
237 
238  // these custom buffers should be larger than the default ones
239  const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
240  const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
241 
242  std::ifstream input_file;
243  std::ofstream output_file;
244 
245  input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
246  input_file.open(argv[2], std::ios_base::binary);
247 
248  if (!input_file.is_open())
249  {
250  print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
251  return EXIT_FAILURE;
252  }
253 
254  output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
255  output_file.open(argv[3], std::ios_base::binary);
256 
257  if (!output_file.is_open())
258  {
259  print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
260  return EXIT_FAILURE;
261  }
262 
263  try
264  {
265  input_file.exceptions(std::ios_base::badbit);
266  output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
267 
268  if (m == Mode::Compress)
269  compress(input_file, output_file);
270  else
271  if (m == Mode::Decompress)
272  decompress(input_file, output_file);
273  }
274  catch (const std::ios_base::failure &f)
275  {
276  print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
277  return EXIT_FAILURE;
278  }
279  catch (const std::exception &e)
280  {
281  print_usage(std::string("Caught exception: ") + e.what() + '.', false);
282  return EXIT_FAILURE;
283  }
284 
285  return EXIT_SUCCESS;
286 }
std::vector<char> operator+ ( std::vector< char >  vc,
char  c 
)

Helper operator intended to simplify code.

Parameters
vcoriginal vector
celement to be appended
Returns
vector resulting from appending c to vc

Definition at line 68 of file lzw_v2.cpp.

69 {
70  vc.push_back(c);
71  return vc;
72 }
void print_usage ( const std::string &  s = "",
bool  su = true 
)

Prints usage information and a custom error message.

Parameters
scustom error message to be printed
suShow Usage information

Definition at line 184 of file lzw_v2.cpp.

Referenced by main().

185 {
186  if (!s.empty())
187  std::cerr << "\nERROR: " << s << '\n';
188 
189  if (su)
190  {
191  std::cerr << "\nUsage:\n";
192  std::cerr << "\tprogram -flag input_file output_file\n\n";
193  std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
194  std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
195  std::cerr << "Examples:\n";
196  std::cerr << "\tlzw_v2.exe -c license.txt license.lzw\n";
197  std::cerr << "\tlzw_v2.exe -d license.lzw new_license.txt\n";
198  }
199 
200  std::cerr << std::endl;
201 }