master
  1//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
  2//
  3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4// See https://llvm.org/LICENSE.txt for license information.
  5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6//
  7//===----------------------------------------------------------------------===//
  8//
  9// Lempel–Ziv–Welch encoding/decoding
 10//
 11//===----------------------------------------------------------------------===//
 12
 13#ifndef SANITIZER_LZW_H
 14#define SANITIZER_LZW_H
 15
 16#include "sanitizer_dense_map.h"
 17
 18namespace __sanitizer {
 19
 20using LzwCodeType = u32;
 21
 22template <class T, class ItIn, class ItOut>
 23ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
 24  using Substring =
 25      detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
 26
 27  // Sentinel value for substrings of len 1.
 28  static constexpr LzwCodeType kNoPrefix =
 29      Min(DenseMapInfo<Substring>::getEmptyKey().first,
 30          DenseMapInfo<Substring>::getTombstoneKey().first) -
 31      1;
 32  DenseMap<Substring, LzwCodeType> prefix_to_code;
 33  {
 34    // Add all substring of len 1 as initial dictionary.
 35    InternalMmapVector<T> dict_len1;
 36    for (auto it = begin; it != end; ++it)
 37      if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
 38        dict_len1.push_back(*it);
 39
 40    // Slightly helps with later delta encoding.
 41    Sort(dict_len1.data(), dict_len1.size());
 42
 43    // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
 44    // just generate them.
 45    *out = dict_len1.size();
 46    ++out;
 47
 48    for (uptr i = 0; i != dict_len1.size(); ++i) {
 49      // Remap after the Sort.
 50      prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
 51      *out = dict_len1[i];
 52      ++out;
 53    }
 54    CHECK_EQ(prefix_to_code.size(), dict_len1.size());
 55  }
 56
 57  if (begin == end)
 58    return out;
 59
 60  // Main LZW encoding loop.
 61  LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
 62  ++begin;
 63  for (auto it = begin; it != end; ++it) {
 64    // Extend match with the new item.
 65    auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
 66    if (ins.second) {
 67      // This is a new substring, but emit the code for the current match
 68      // (before extend). This allows LZW decoder to recover the dictionary.
 69      *out = match;
 70      ++out;
 71      // Reset the match to a single item, which must be already in the map.
 72      match = prefix_to_code.find({kNoPrefix, *it})->second;
 73    } else {
 74      // Already known, use as the current match.
 75      match = ins.first->second;
 76    }
 77  }
 78
 79  *out = match;
 80  ++out;
 81
 82  return out;
 83}
 84
 85template <class T, class ItIn, class ItOut>
 86ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
 87  if (begin == end)
 88    return out;
 89
 90  // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
 91  InternalMmapVector<T> dict_len1(*begin);
 92  ++begin;
 93
 94  if (begin == end)
 95    return out;
 96
 97  for (auto& v : dict_len1) {
 98    v = *begin;
 99    ++begin;
100  }
101
102  // Substrings of len 2 and up. Indexes are shifted because [0,
103  // dict_len1.size()) stored in dict_len1. Substings get here after being
104  // emitted to the output, so we can use output position.
105  InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
106      code_to_substr;
107
108  // Copies already emitted substrings into the output again.
109  auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
110    if (code < dict_len1.size()) {
111      *out = dict_len1[code];
112      ++out;
113      return out;
114    }
115    const auto& s = code_to_substr[code - dict_len1.size()];
116
117    for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
118    return out;
119  };
120
121  // Returns lens of the substring with the given code.
122  auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
123    if (code < dict_len1.size())
124      return 1;
125    const auto& s = code_to_substr[code - dict_len1.size()];
126    return s.second - s.first;
127  };
128
129  // Main LZW decoding loop.
130  LzwCodeType prev_code = *begin;
131  ++begin;
132  out = copy(prev_code, out);
133  for (auto it = begin; it != end; ++it) {
134    LzwCodeType code = *it;
135    auto start = out;
136    if (code == dict_len1.size() + code_to_substr.size()) {
137      // Special LZW case. The code is not in the dictionary yet. This is
138      // possible only when the new substring is the same as previous one plus
139      // the first item of the previous substring. We can emit that in two
140      // steps.
141      out = copy(prev_code, out);
142      *out = *start;
143      ++out;
144    } else {
145      out = copy(code, out);
146    }
147
148    // Every time encoded emits the code, it also creates substing of len + 1
149    // including the first item of the just emmited substring. Do the same here.
150    uptr len = code_to_len(prev_code);
151    code_to_substr.push_back({start - len, start + 1});
152
153    prev_code = code;
154  }
155  return out;
156}
157
158}  // namespace __sanitizer
159#endif