From 30a283c1b311e97e3968143a4264e1ed4e4a074e Mon Sep 17 00:00:00 2001 From: Jorijn van der Graaf Date: Mon, 11 May 2026 18:37:30 +0200 Subject: [PATCH] asset compression --- implementations/Crafter.Asset-Compression.cpp | 136 + implementations/main.cpp | 32 + interfaces/Crafter.Asset-Compression.cppm | 59 + interfaces/Crafter.Asset-Mesh.cppm | 88 + interfaces/Crafter.Asset-Texture.cppm | 55 + interfaces/Crafter.Asset.cppm | 1 + lib/gdeflate/GDeflate.h | 46 + lib/gdeflate/GDeflateCompress.cpp | 270 ++ lib/gdeflate/GDeflateDecompress.cpp | 170 + lib/gdeflate/LICENSE | 202 + lib/gdeflate/TileStream.h | 83 + lib/gdeflate/Utils.h | 82 + lib/gdeflate/config.h | 29 + lib/gdeflate/libdeflate/COPYING | 36 + lib/gdeflate/libdeflate/common/common_defs.h | 338 ++ lib/gdeflate/libdeflate/common/compiler_gcc.h | 217 ++ lib/gdeflate/libdeflate/common/compiler_msc.h | 80 + lib/gdeflate/libdeflate/lib/adler32.c | 130 + .../libdeflate/lib/adler32_vec_template.h | 124 + .../libdeflate/lib/arm/adler32_impl.h | 125 + .../libdeflate/lib/arm/cpu_features.c | 133 + .../libdeflate/lib/arm/cpu_features.h | 40 + lib/gdeflate/libdeflate/lib/arm/crc32_impl.h | 247 ++ .../libdeflate/lib/arm/matchfinder_impl.h | 86 + lib/gdeflate/libdeflate/lib/bt_matchfinder.h | 363 ++ .../libdeflate/lib/cpu_features_common.h | 88 + lib/gdeflate/libdeflate/lib/crc32.c | 313 ++ lib/gdeflate/libdeflate/lib/crc32_table.h | 526 +++ .../libdeflate/lib/crc32_vec_template.h | 61 + .../libdeflate/lib/decompress_template.h | 421 +++ .../libdeflate/lib/deflate_compress.c | 3360 +++++++++++++++++ .../libdeflate/lib/deflate_compress.h | 13 + .../libdeflate/lib/deflate_constants.h | 102 + .../libdeflate/lib/deflate_decompress.c | 1021 +++++ .../libdeflate/lib/gdeflate_compress.c | 246 ++ .../libdeflate/lib/gdeflate_decompress.c | 134 + .../lib/gdeflate_decompress_template.h | 611 +++ lib/gdeflate/libdeflate/lib/gzip_compress.c | 95 + lib/gdeflate/libdeflate/lib/gzip_constants.h | 45 + lib/gdeflate/libdeflate/lib/gzip_decompress.c | 148 + lib/gdeflate/libdeflate/lib/hc_matchfinder.h | 412 ++ lib/gdeflate/libdeflate/lib/lib_common.h | 67 + .../libdeflate/lib/matchfinder_common.h | 180 + lib/gdeflate/libdeflate/lib/unaligned.h | 228 ++ lib/gdeflate/libdeflate/lib/utils.c | 142 + .../libdeflate/lib/x86/adler32_impl.h | 337 ++ .../libdeflate/lib/x86/cpu_features.c | 152 + .../libdeflate/lib/x86/cpu_features.h | 41 + lib/gdeflate/libdeflate/lib/x86/crc32_impl.h | 92 + .../lib/x86/crc32_pclmul_template.h | 262 ++ .../libdeflate/lib/x86/decompress_impl.h | 35 + .../libdeflate/lib/x86/matchfinder_impl.h | 122 + lib/gdeflate/libdeflate/lib/zlib_compress.c | 87 + lib/gdeflate/libdeflate/lib/zlib_constants.h | 21 + lib/gdeflate/libdeflate/lib/zlib_decompress.c | 108 + lib/gdeflate/libdeflate/libdeflate.h | 540 +++ project.cpp | 63 +- 57 files changed, 13237 insertions(+), 8 deletions(-) create mode 100644 implementations/Crafter.Asset-Compression.cpp create mode 100644 interfaces/Crafter.Asset-Compression.cppm create mode 100644 lib/gdeflate/GDeflate.h create mode 100644 lib/gdeflate/GDeflateCompress.cpp create mode 100644 lib/gdeflate/GDeflateDecompress.cpp create mode 100644 lib/gdeflate/LICENSE create mode 100644 lib/gdeflate/TileStream.h create mode 100644 lib/gdeflate/Utils.h create mode 100644 lib/gdeflate/config.h create mode 100644 lib/gdeflate/libdeflate/COPYING create mode 100644 lib/gdeflate/libdeflate/common/common_defs.h create mode 100644 lib/gdeflate/libdeflate/common/compiler_gcc.h create mode 100644 lib/gdeflate/libdeflate/common/compiler_msc.h create mode 100644 lib/gdeflate/libdeflate/lib/adler32.c create mode 100644 lib/gdeflate/libdeflate/lib/adler32_vec_template.h create mode 100644 lib/gdeflate/libdeflate/lib/arm/adler32_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/arm/cpu_features.c create mode 100644 lib/gdeflate/libdeflate/lib/arm/cpu_features.h create mode 100644 lib/gdeflate/libdeflate/lib/arm/crc32_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/arm/matchfinder_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/bt_matchfinder.h create mode 100644 lib/gdeflate/libdeflate/lib/cpu_features_common.h create mode 100644 lib/gdeflate/libdeflate/lib/crc32.c create mode 100644 lib/gdeflate/libdeflate/lib/crc32_table.h create mode 100644 lib/gdeflate/libdeflate/lib/crc32_vec_template.h create mode 100644 lib/gdeflate/libdeflate/lib/decompress_template.h create mode 100644 lib/gdeflate/libdeflate/lib/deflate_compress.c create mode 100644 lib/gdeflate/libdeflate/lib/deflate_compress.h create mode 100644 lib/gdeflate/libdeflate/lib/deflate_constants.h create mode 100644 lib/gdeflate/libdeflate/lib/deflate_decompress.c create mode 100644 lib/gdeflate/libdeflate/lib/gdeflate_compress.c create mode 100644 lib/gdeflate/libdeflate/lib/gdeflate_decompress.c create mode 100644 lib/gdeflate/libdeflate/lib/gdeflate_decompress_template.h create mode 100644 lib/gdeflate/libdeflate/lib/gzip_compress.c create mode 100644 lib/gdeflate/libdeflate/lib/gzip_constants.h create mode 100644 lib/gdeflate/libdeflate/lib/gzip_decompress.c create mode 100644 lib/gdeflate/libdeflate/lib/hc_matchfinder.h create mode 100644 lib/gdeflate/libdeflate/lib/lib_common.h create mode 100644 lib/gdeflate/libdeflate/lib/matchfinder_common.h create mode 100644 lib/gdeflate/libdeflate/lib/unaligned.h create mode 100644 lib/gdeflate/libdeflate/lib/utils.c create mode 100644 lib/gdeflate/libdeflate/lib/x86/adler32_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/x86/cpu_features.c create mode 100644 lib/gdeflate/libdeflate/lib/x86/cpu_features.h create mode 100644 lib/gdeflate/libdeflate/lib/x86/crc32_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h create mode 100644 lib/gdeflate/libdeflate/lib/x86/decompress_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h create mode 100644 lib/gdeflate/libdeflate/lib/zlib_compress.c create mode 100644 lib/gdeflate/libdeflate/lib/zlib_constants.h create mode 100644 lib/gdeflate/libdeflate/lib/zlib_decompress.c create mode 100644 lib/gdeflate/libdeflate/libdeflate.h diff --git a/implementations/Crafter.Asset-Compression.cpp b/implementations/Crafter.Asset-Compression.cpp new file mode 100644 index 0000000..ad31d9f --- /dev/null +++ b/implementations/Crafter.Asset-Compression.cpp @@ -0,0 +1,136 @@ +/* +Crafter®.Asset +Copyright (C) 2026 Catcrafts® +catcrafts.net + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 3.0 as published by the Free Software Foundation; + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +module; +// Vendored GDeflate (Microsoft DirectStorage reference, Apache-2.0). Headers +// pull libdeflate (MIT) via -Ilib/gdeflate/libdeflate. The C++ wrappers are +// pulled inline because Crafter.Build's cFiles only handles .c TUs — folding +// them into this module impl avoids adding a parallel cppFiles channel. +#include "../lib/gdeflate/GDeflate.h" +#include "../lib/gdeflate/GDeflateCompress.cpp" +#include "../lib/gdeflate/GDeflateDecompress.cpp" + +module Crafter.Asset; +import std; + +namespace Crafter::Compression { + CompressedBlob CompressStreams(std::span> streams) { + CompressedBlob blob; + blob.regions.reserve(streams.size()); + + // First pass: compress each stream into its own buffer so we know the + // exact final size. GDeflate::CompressBound is an upper bound; we'd + // waste capacity if we appended the bound directly into a single + // shared buffer. + std::vector> compressed; + compressed.reserve(streams.size()); + + std::uint64_t totalSize = 0; + for (const std::span& stream : streams) { + if (stream.empty()) { + compressed.emplace_back(); + continue; + } + std::size_t boundSize = GDeflate::CompressBound(stream.size()); + std::vector out(boundSize); + std::size_t actualSize = boundSize; + bool ok = GDeflate::Compress( + reinterpret_cast(out.data()), + &actualSize, + reinterpret_cast(stream.data()), + stream.size(), + GDeflate::MaximumCompressionLevel, + 0); + if (!ok) { + throw std::runtime_error("GDeflate::Compress failed"); + } + out.resize(actualSize); + totalSize += actualSize; + compressed.push_back(std::move(out)); + } + + // Second pass: concatenate and build the region table. + blob.bytes.reserve(totalSize); + for (std::size_t i = 0; i < streams.size(); ++i) { + RegionMeta r { + .srcOffset = blob.bytes.size(), + .compressedSize = compressed[i].size(), + .decompressedSize = streams[i].size(), + }; + blob.regions.push_back(r); + blob.bytes.insert(blob.bytes.end(), compressed[i].begin(), compressed[i].end()); + } + return blob; + } + + void WriteBlob(std::ostream& file, const CompressedBlob& blob) { + std::uint32_t regionCount = static_cast(blob.regions.size()); + file.write(reinterpret_cast(®ionCount), sizeof(regionCount)); + if (regionCount > 0) { + file.write(reinterpret_cast(blob.regions.data()), + regionCount * sizeof(RegionMeta)); + } + std::uint64_t payloadSize = blob.bytes.size(); + file.write(reinterpret_cast(&payloadSize), sizeof(payloadSize)); + if (payloadSize > 0) { + file.write(reinterpret_cast(blob.bytes.data()), payloadSize); + } + } + + CompressedBlob ReadBlob(std::istream& file) { + CompressedBlob blob; + std::uint32_t regionCount = 0; + file.read(reinterpret_cast(®ionCount), sizeof(regionCount)); + blob.regions.resize(regionCount); + if (regionCount > 0) { + file.read(reinterpret_cast(blob.regions.data()), + regionCount * sizeof(RegionMeta)); + } + std::uint64_t payloadSize = 0; + file.read(reinterpret_cast(&payloadSize), sizeof(payloadSize)); + blob.bytes.resize(payloadSize); + if (payloadSize > 0) { + file.read(reinterpret_cast(blob.bytes.data()), payloadSize); + } + return blob; + } + + void DecompressCPU(const CompressedBlob& blob, std::span> outputs) { + if (outputs.size() != blob.regions.size()) { + throw std::runtime_error("DecompressCPU: outputs.size() != regions.size()"); + } + for (std::size_t i = 0; i < blob.regions.size(); ++i) { + const RegionMeta& r = blob.regions[i]; + const std::span& out = outputs[i]; + if (out.size() != r.decompressedSize) { + throw std::runtime_error("DecompressCPU: output size mismatch"); + } + if (r.decompressedSize == 0) continue; + bool ok = GDeflate::Decompress( + reinterpret_cast(out.data()), + r.decompressedSize, + reinterpret_cast(blob.bytes.data() + r.srcOffset), + r.compressedSize, + /*numWorkers=*/1); + if (!ok) { + throw std::runtime_error("GDeflate::Decompress failed"); + } + } + } +} diff --git a/implementations/main.cpp b/implementations/main.cpp index 2bfbefb..008a542 100644 --- a/implementations/main.cpp +++ b/implementations/main.cpp @@ -19,6 +19,35 @@ import std; using namespace Crafter; namespace fs = std::filesystem; +// CPU GDeflate roundtrip sanity test across the size boundaries from the +// implementation plan. Returns 0 on pass, 1 on first byte-mismatch. +static int RunCompressionRoundtrip() { + const std::array sizes = { 1, 65535, 65536, 65537, 16 * 1024 * 1024 }; + std::mt19937_64 rng(0xC0FFEEu); + for (std::size_t n : sizes) { + std::vector input(n); + for (std::size_t i = 0; i < n; ++i) { + // Mix random bytes with a deterministic pattern so the codec is + // exercised on both compressible and noisy regions. + input[i] = static_cast((i * 0x9E3779B97F4A7C15ULL ^ rng()) & 0xFF); + } + std::array, 1> streams = { std::span(input) }; + Compression::CompressedBlob blob = Compression::CompressStreams(streams); + std::vector output(n); + std::array, 1> outputs = { std::span(output) }; + Compression::DecompressCPU(blob, outputs); + if (output != input) { + std::cerr << "[FAIL] roundtrip size=" << n << "\n"; + return 1; + } + std::cout << "[ok] size=" << n + << " compressed=" << blob.bytes.size() + << " ratio=" << (double(blob.bytes.size()) / double(n)) << "\n"; + } + std::cout << "All roundtrips passed.\n"; + return 0; +} + int main(int argc, char** argv) { // Parse arguments: crafter-asset [output_file] [--format u8|f16] fs::path inputPath; @@ -29,6 +58,9 @@ int main(int argc, char** argv) { std::vector positional; for (int i = 1; i < argc; ++i) { std::string arg = argv[i]; + if (arg == "--test-compression") { + return RunCompressionRoundtrip(); + } if (arg == "--format" || arg == "-f") { if (i + 1 >= argc) { std::cerr << "Error: --format requires a value (u8 or f16)\n"; diff --git a/interfaces/Crafter.Asset-Compression.cppm b/interfaces/Crafter.Asset-Compression.cppm new file mode 100644 index 0000000..a144411 --- /dev/null +++ b/interfaces/Crafter.Asset-Compression.cppm @@ -0,0 +1,59 @@ +/* +Crafter®.Asset +Copyright (C) 2026 Catcrafts® +catcrafts.net + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 3.0 as published by the Free Software Foundation; + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +export module Crafter.Asset:Compression; +import std; + +export namespace Crafter::Compression { + // One independently-decompressable GDeflate stream inside CompressedBlob::bytes. + // Layout matches what VK_EXT_memory_decompression's VkDecompressMemoryRegionEXT + // expects, minus the device addresses (which the consumer fills in). + struct RegionMeta { + std::uint64_t srcOffset; + std::uint64_t compressedSize; + std::uint64_t decompressedSize; + }; + + struct CompressedBlob { + std::vector bytes; + std::vector regions; + + std::uint64_t TotalDecompressedSize() const noexcept { + std::uint64_t sum = 0; + for (const RegionMeta& r : regions) sum += r.decompressedSize; + return sum; + } + }; + + // Compresses each input span as its own GDeflate tile-stream; concatenates + // them into one byte buffer with a parallel region table. Streams are + // independent and can be addressed individually by VkDecompressMemoryRegionEXT + // entries on the GPU path. + CompressedBlob CompressStreams(std::span> streams); + + // CPU fallback decoder. outputs.size() must equal blob.regions.size(); + // outputs[i].size() must equal blob.regions[i].decompressedSize. + void DecompressCPU(const CompressedBlob& blob, std::span> outputs); + + // Length-prefixed serialization of a CompressedBlob. Used by per-asset + // SaveCompressed/LoadCompressed implementations after they've written + // their own type-specific header. + void WriteBlob(std::ostream& file, const CompressedBlob& blob); + CompressedBlob ReadBlob(std::istream& file); +} diff --git a/interfaces/Crafter.Asset-Mesh.cppm b/interfaces/Crafter.Asset-Mesh.cppm index 6cd4981..7138ef9 100644 --- a/interfaces/Crafter.Asset-Mesh.cppm +++ b/interfaces/Crafter.Asset-Mesh.cppm @@ -18,6 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ export module Crafter.Asset:Mesh; +import :Compression; import Crafter.Math; import std; namespace fs = std::filesystem; @@ -35,6 +36,44 @@ export namespace Crafter { Vector uv; }; + // GDeflate-compressed counterpart of MeshAsset::Save output. Three + // regions: [vertex, index, data]. dataCount==0 leaves the data region + // empty (zero compressedSize/decompressedSize). dataStride records sizeof(T) + // at compress time so consumers can validate. + struct CompressedMeshAsset { + std::uint32_t vertexCount = 0; + std::uint32_t indexCount = 0; + std::uint32_t dataCount = 0; + std::uint32_t dataStride = 0; + Compression::CompressedBlob blob; + }; + + namespace MeshAssetFormat { + inline constexpr char magic[4] = {'C', 'G', 'D', 'M'}; + inline constexpr std::uint32_t version = 1; + } + + inline CompressedMeshAsset LoadCompressedMesh(fs::path path) { + std::ifstream file(path, std::ios::binary); + char magic[4]; + file.read(magic, 4); + if (std::memcmp(magic, MeshAssetFormat::magic, 4) != 0) { + throw std::runtime_error("LoadCompressedMesh: bad magic on " + path.string()); + } + std::uint32_t version = 0; + file.read(reinterpret_cast(&version), sizeof(version)); + if (version != MeshAssetFormat::version) { + throw std::runtime_error("LoadCompressedMesh: unsupported version on " + path.string()); + } + CompressedMeshAsset out; + file.read(reinterpret_cast(&out.vertexCount), sizeof(out.vertexCount)); + file.read(reinterpret_cast(&out.indexCount), sizeof(out.indexCount)); + file.read(reinterpret_cast(&out.dataCount), sizeof(out.dataCount)); + file.read(reinterpret_cast(&out.dataStride), sizeof(out.dataStride)); + out.blob = Compression::ReadBlob(file); + return out; + } + template struct MeshAsset { std::vector> vertexes; @@ -57,6 +96,29 @@ export namespace Crafter { file.write(reinterpret_cast(datas.data()), dataCount * sizeof(T)); } + void SaveCompressed(fs::path path) const { + std::array, 3> streams = { + std::as_bytes(std::span(vertexes)), + std::as_bytes(std::span(indexes)), + std::as_bytes(std::span(datas)), + }; + Compression::CompressedBlob blob = Compression::CompressStreams(streams); + + std::ofstream file(path, std::ios::binary); + file.write(MeshAssetFormat::magic, 4); + std::uint32_t version = MeshAssetFormat::version; + std::uint32_t vc = static_cast(vertexes.size()); + std::uint32_t ic = static_cast(indexes.size()); + std::uint32_t dc = static_cast(datas.size()); + std::uint32_t stride = static_cast(sizeof(T)); + file.write(reinterpret_cast(&version), sizeof(version)); + file.write(reinterpret_cast(&vc), sizeof(vc)); + file.write(reinterpret_cast(&ic), sizeof(ic)); + file.write(reinterpret_cast(&dc), sizeof(dc)); + file.write(reinterpret_cast(&stride), sizeof(stride)); + Compression::WriteBlob(file, blob); + } + static MeshAsset Load(fs::path path) { MeshAsset mesh; @@ -196,6 +258,32 @@ export namespace Crafter { file.write(reinterpret_cast(vertexes.data()), vertexCount * sizeof(Vector)); file.write(reinterpret_cast(indexes.data()), indexCount * sizeof(std::uint32_t)); } + + void SaveCompressed(fs::path path) const { + // Three regions to keep file format identical to the templated + // variant; the data region is empty (skipped on the GPU path). + std::array, 3> streams = { + std::as_bytes(std::span(vertexes)), + std::as_bytes(std::span(indexes)), + std::span{}, + }; + Compression::CompressedBlob blob = Compression::CompressStreams(streams); + + std::ofstream file(path, std::ios::binary); + file.write(MeshAssetFormat::magic, 4); + std::uint32_t version = MeshAssetFormat::version; + std::uint32_t vc = static_cast(vertexes.size()); + std::uint32_t ic = static_cast(indexes.size()); + std::uint32_t dc = 0; + std::uint32_t stride = 0; + file.write(reinterpret_cast(&version), sizeof(version)); + file.write(reinterpret_cast(&vc), sizeof(vc)); + file.write(reinterpret_cast(&ic), sizeof(ic)); + file.write(reinterpret_cast(&dc), sizeof(dc)); + file.write(reinterpret_cast(&stride), sizeof(stride)); + Compression::WriteBlob(file, blob); + } + static MeshAsset Load(fs::path path) { MeshAsset mesh; diff --git a/interfaces/Crafter.Asset-Texture.cppm b/interfaces/Crafter.Asset-Texture.cppm index 06e836e..b2c96ac 100644 --- a/interfaces/Crafter.Asset-Texture.cppm +++ b/interfaces/Crafter.Asset-Texture.cppm @@ -21,6 +21,7 @@ module; #define STB_IMAGE_IMPLEMENTATION #include "../lib/stb_image.h" export module Crafter.Asset:Texture; +import :Compression; import std; import Crafter.Math; namespace fs = std::filesystem; @@ -38,6 +39,42 @@ export namespace Crafter { OpaqueType opaque; }; + // GDeflate-compressed counterpart of TextureAsset::Save output. + // Single-region blob: the pixel array as one stream. + struct CompressedTextureAsset { + std::uint16_t sizeX = 0; + std::uint16_t sizeY = 0; + OpaqueType opaque = OpaqueType::FullyOpaque; + std::uint32_t pixelStride = 0; + Compression::CompressedBlob blob; + }; + + namespace TextureAssetFormat { + inline constexpr char magic[4] = {'C', 'G', 'D', 'T'}; + inline constexpr std::uint32_t version = 1; + } + + inline CompressedTextureAsset LoadCompressedTexture(fs::path path) { + std::ifstream file(path, std::ios::binary); + char magic[4]; + file.read(magic, 4); + if (std::memcmp(magic, TextureAssetFormat::magic, 4) != 0) { + throw std::runtime_error("LoadCompressedTexture: bad magic on " + path.string()); + } + std::uint32_t version = 0; + file.read(reinterpret_cast(&version), sizeof(version)); + if (version != TextureAssetFormat::version) { + throw std::runtime_error("LoadCompressedTexture: unsupported version on " + path.string()); + } + CompressedTextureAsset out; + file.read(reinterpret_cast(&out.sizeX), sizeof(out.sizeX)); + file.read(reinterpret_cast(&out.sizeY), sizeof(out.sizeY)); + file.read(reinterpret_cast(&out.opaque), sizeof(out.opaque)); + file.read(reinterpret_cast(&out.pixelStride), sizeof(out.pixelStride)); + out.blob = Compression::ReadBlob(file); + return out; + } + template struct TextureAsset { std::uint16_t sizeX; @@ -54,6 +91,24 @@ export namespace Crafter { file.write(reinterpret_cast(pixels.data()), pixels.size() * sizeof(T)); } + void SaveCompressed(fs::path path) const { + std::array, 1> streams = { + std::as_bytes(std::span(pixels)), + }; + Compression::CompressedBlob blob = Compression::CompressStreams(streams); + + std::ofstream file(path, std::ios::binary); + file.write(TextureAssetFormat::magic, 4); + std::uint32_t version = TextureAssetFormat::version; + std::uint32_t stride = static_cast(sizeof(T)); + file.write(reinterpret_cast(&version), sizeof(version)); + file.write(reinterpret_cast(&sizeX), sizeof(sizeX)); + file.write(reinterpret_cast(&sizeY), sizeof(sizeY)); + file.write(reinterpret_cast(&opaque), sizeof(opaque)); + file.write(reinterpret_cast(&stride), sizeof(stride)); + Compression::WriteBlob(file, blob); + } + static TextureAsset Load(fs::path path) { TextureAsset tex; diff --git a/interfaces/Crafter.Asset.cppm b/interfaces/Crafter.Asset.cppm index 6697bc1..9da1d6b 100644 --- a/interfaces/Crafter.Asset.cppm +++ b/interfaces/Crafter.Asset.cppm @@ -19,5 +19,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA export module Crafter.Asset; +export import :Compression; export import :Mesh; export import :Texture; \ No newline at end of file diff --git a/lib/gdeflate/GDeflate.h b/lib/gdeflate/GDeflate.h new file mode 100644 index 0000000..7cb9954 --- /dev/null +++ b/lib/gdeflate/GDeflate.h @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corportaion. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "config.h" + +namespace GDeflate +{ + // See README.MD in libdeflate_1_8 for details on Compression Levels + static const uint32_t MinimumCompressionLevel = 1; + static const uint32_t MaximumCompressionLevel = 12; + + enum Flags + { + COMPRESS_SINGLE_THREAD = 0x200, /*!< Force compression using a single thread. */ + }; + + size_t CompressBound(size_t size); + + bool Compress( + uint8_t* output, + size_t* outputSize, + const uint8_t* in, + size_t inSize, + uint32_t level, + uint32_t flags); + + bool Decompress(uint8_t* output, size_t outputSize, const uint8_t* in, size_t inSize, uint32_t numWorkers); + +} // namespace GDeflate diff --git a/lib/gdeflate/GDeflateCompress.cpp b/lib/gdeflate/GDeflateCompress.cpp new file mode 100644 index 0000000..9a168ea --- /dev/null +++ b/lib/gdeflate/GDeflateCompress.cpp @@ -0,0 +1,270 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "GDeflate.h" +#include "TileStream.h" +#include "Utils.h" +#include "config.h" + +#include +#include "libdeflate/libdeflate.h" +#include + +#include +#include +#include +#include + +template<> +struct std::default_delete +{ + void operator()(libdeflate_gdeflate_compressor* p) const + { + libdeflate_free_gdeflate_compressor(p); + } +}; + +namespace GDeflate +{ + static constexpr uint32_t kMaxWorkers = 31; + static constexpr uint32_t kMinTilesPerWorker = 64; + + struct OutputStreamWrapper + { + uint8_t* ptr = nullptr; + size_t size = 0; + size_t pos = 0; + + template + void StreamOut(T* d, size_t n = 1) + { + const size_t dataSize = sizeof(T) * n; + + if (pos + dataSize > size) + { + printf("Fatal: stream overrun!\n"); + return; + } + + memcpy(ptr + pos, d, dataSize); + pos += dataSize; + } + + bool SetPos(size_t n) + { + if (n > size) + { + printf("Fatal: stream overrun!\n"); + return false; + } + + pos = n; + + return true; + } + }; + + struct CompressionContext + { + const uint8_t* inputPtr; + size_t inputSize; + + struct Tile + { + std::vector data; + size_t uncompressedSize = 0; + }; + + std::vector tiles; + + std::atomic_uint32_t globalIndex; + uint32_t numItems; + + std::atomic_bool failed; + }; + + static bool DoCompress( + uint8_t* output, + size_t* outputSize, + const uint8_t* in, + size_t inSize, + uint32_t level, + uint32_t flags) + { + if (outputSize == nullptr || output == nullptr || in == nullptr || inSize == 0) + return false; + + if (inSize > kDefaultTileSize * TileStream::kMaxTiles) + return false; + + CompressionContext context{}; + + context.inputPtr = in; + context.inputSize = inSize; + context.numItems = static_cast((inSize + kDefaultTileSize - 1) / kDefaultTileSize); + context.tiles.resize(context.numItems); + context.failed = false; + + auto TileCompressionJob = [&context, level]() + { + size_t pageCount = 0; + const size_t scratchSize = libdeflate_gdeflate_compress_bound(nullptr, kDefaultTileSize, &pageCount); + assert(pageCount == 1); + + void* scratch = alloca(scratchSize); + + std::unique_ptr compressor(libdeflate_alloc_gdeflate_compressor(level)); + + while (true) + { + const uint32_t tileIndex = context.globalIndex.fetch_add(1, std::memory_order_relaxed); + + if (tileIndex >= context.numItems) + break; + + const size_t tilePos = tileIndex * kDefaultTileSize; + + size_t remaining = context.inputSize - tilePos; + size_t uncompressedSize = std::min(remaining, kDefaultTileSize); + + auto& tile = context.tiles[tileIndex]; + tile.uncompressedSize = uncompressedSize; + + libdeflate_gdeflate_out_page compressedPage{scratch, scratchSize}; + + size_t result = libdeflate_gdeflate_compress( + compressor.get(), + context.inputPtr + tilePos, + uncompressedSize, + &compressedPage, + 1); + + if (result == 0) + { + context.failed = true; + break; + } + + tile.data.resize(compressedPage.nbytes); + memcpy(tile.data.data(), compressedPage.data, compressedPage.nbytes); + } + }; + + std::thread workers[kMaxWorkers + 1]; + const uint32_t maxWorkers = std::min(kMaxWorkers, std::thread::hardware_concurrency()); + + uint32_t numWorkersLeft = + std::min(maxWorkers, (context.numItems + kMinTilesPerWorker - 1) / kMinTilesPerWorker); + + if (flags & COMPRESS_SINGLE_THREAD) + numWorkersLeft = 0; + + for (auto& worker : workers) + { + if (numWorkersLeft == 0) + break; + + worker = std::thread([TileCompressionJob]() { TileCompressionJob(); }); + + --numWorkersLeft; + } + + TileCompressionJob(); + + for (auto& worker : workers) + { + if (worker.joinable()) + worker.join(); + } + + // Compression failed + if (context.failed) + return false; + + // Compression is done. Prepare the output stream. + + std::vector tilePtrs; + size_t dataPos = 0; + + for (auto const& tile : context.tiles) + { + tilePtrs.push_back(static_cast(dataPos)); + dataPos += tile.data.size(); + } + + // tilePtrs[0] is used to store the size of the last tile; all the other + // elements are offsets to the tile data. + tilePtrs[0] = static_cast(context.tiles.back().data.size()); + + assert(tilePtrs.size() <= TileStream::kMaxTiles); + assert(tilePtrs.size() == context.tiles.size()); + assert(tilePtrs.size() == context.numItems); + + OutputStreamWrapper outputStream; + outputStream.ptr = output; + outputStream.size = *outputSize; + + // calculate uncompressed size + size_t uncompressedSize = tilePtrs.size() * kDefaultTileSize; + size_t tailSize = context.inputSize - (tilePtrs.size() - 1) * kDefaultTileSize; + if (tailSize < kDefaultTileSize) + uncompressedSize -= kDefaultTileSize - tailSize; + + assert(uncompressedSize == inSize); + + TileStream header(uncompressedSize); + + assert(tilePtrs.size() == header.numTiles); + + outputStream.StreamOut(&header); + outputStream.StreamOut(tilePtrs.data(), tilePtrs.size()); + + size_t dataOffset = outputStream.pos; + + for (size_t i = 0; i < tilePtrs.size(); ++i) + { + CompressionContext::Tile const& tile = context.tiles[i]; + uint32_t tileOffset = (i == 0) ? 0 : tilePtrs[i]; + + outputStream.SetPos(dataOffset + tileOffset); + outputStream.StreamOut(tile.data.data(), tile.data.size()); + } + + *outputSize = outputStream.pos; + + return true; + } + + size_t CompressBound(size_t size) + { + size_t numTiles = std::min(size_t(TileStream::kMaxTiles), (size + kDefaultTileSize - 1) / kDefaultTileSize); + numTiles = std::max(size_t(1), numTiles); + + const size_t tileSize = kDefaultTileSize + + // Tile header. Ideally need to make it a part of compressor API. + (sizeof(uint32_t) + 4 * 208 + 4 * 8); + + return numTiles * tileSize + sizeof(TileStream) + sizeof(uint64_t); + } + + bool Compress(uint8_t* output, size_t* outputSize, const uint8_t* in, size_t inSize, uint32_t level, uint32_t flags) + { + return DoCompress(output, outputSize, in, inSize, level, flags); + } + +} // namespace GDeflate diff --git a/lib/gdeflate/GDeflateDecompress.cpp b/lib/gdeflate/GDeflateDecompress.cpp new file mode 100644 index 0000000..96f82c6 --- /dev/null +++ b/lib/gdeflate/GDeflateDecompress.cpp @@ -0,0 +1,170 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TileStream.h" +#include "Utils.h" + +#include "libdeflate/libdeflate.h" + +#include +#include +#include + +template<> +struct std::default_delete +{ + void operator()(libdeflate_gdeflate_decompressor* p) const + { + libdeflate_free_gdeflate_decompressor(p); + } +}; + +namespace GDeflate +{ + static constexpr uint32_t kMaxDecompressWorkers = 31; + + static bool ValidateStream(const TileStream* header) + { + if (!header->IsValid()) + { + printf("Malformed stream encountered.\n"); + return false; + } + + if (header->id != kGDeflateId) + { + printf("Unknown stream format: %d\n", header->id); + return false; + } + + return true; + } + + struct DecompressionContext + { + const uint8_t* inputPtr; + size_t inputSize; + + uint8_t* outputPtr; + size_t outputSize; + + std::atomic_uint32_t globalIndex; + uint32_t numItems; + + std::atomic_bool failed; + }; + + static void TileDecompressionJob(DecompressionContext& context, uint32_t compressorId) + { + std::unique_ptr decompressor(libdeflate_alloc_gdeflate_decompressor()); + + const uint32_t* tileOffsets = reinterpret_cast(context.inputPtr + sizeof(TileStream)); + const uint8_t* inDataPtr = reinterpret_cast(tileOffsets + context.numItems); + + libdeflate_result decompressResult = LIBDEFLATE_SUCCESS; + while (true) + { + const uint32_t tileIndex = context.globalIndex.fetch_add(1, std::memory_order_relaxed); + + if (tileIndex >= context.numItems) + break; + + const size_t tileOffset = tileIndex > 0 ? tileOffsets[tileIndex] : 0; + libdeflate_gdeflate_in_page compressedPage{}; + compressedPage.data = inDataPtr + tileOffset; + compressedPage.nbytes = + tileIndex < context.numItems - 1 ? tileOffsets[tileIndex + 1] - tileOffset : tileOffsets[0]; + + auto outputOffset = tileIndex * kDefaultTileSize; + + decompressResult = libdeflate_gdeflate_decompress( + decompressor.get(), + &compressedPage, + 1, + context.outputPtr + outputOffset, + static_cast(kDefaultTileSize), + nullptr); + + if ( decompressResult != LIBDEFLATE_SUCCESS) + { + context.failed = true; + break; + } + } + } + + bool Decompress(uint8_t* output, size_t outputSize, const uint8_t* in, size_t inSize, uint32_t numWorkers) + { + if (nullptr == output || nullptr == in || 0 == outputSize || 0 == inSize) + return false; + + numWorkers = std::min(kMaxDecompressWorkers, numWorkers); + numWorkers = std::max(1u, numWorkers); + + auto header = reinterpret_cast(in); + + if (!ValidateStream(header)) + return false; + + std::thread workers[kMaxDecompressWorkers]; + + // Run a tile per thread + header = reinterpret_cast(in); + + if (!ValidateStream(header)) + { + return false; + } + + DecompressionContext context{}; + + context.inputPtr = in; + context.inputSize = inSize; + + context.outputPtr = output; + context.outputSize = header->GetUncompressedSize(); + + context.globalIndex = 0; + context.numItems = header->numTiles; + + context.failed = false; + + uint32_t numWorkersLeft = context.numItems > (2 * numWorkers) ? numWorkers : 1; + const uint32_t compressorId = header->id; + + for (auto& worker : workers) + { + if (numWorkersLeft == 1) + break; + + worker = std::thread([&context, compressorId]() { TileDecompressionJob(context, compressorId); }); + + --numWorkersLeft; + } + + TileDecompressionJob(context, compressorId); + + for (auto& worker : workers) + { + if (worker.joinable()) + worker.join(); + } + + return (!context.failed); + } +} // namespace GDeflate diff --git a/lib/gdeflate/LICENSE b/lib/gdeflate/LICENSE new file mode 100644 index 0000000..5961953 --- /dev/null +++ b/lib/gdeflate/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + Copyright (c) Microsoft Corporation. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lib/gdeflate/TileStream.h b/lib/gdeflate/TileStream.h new file mode 100644 index 0000000..c420bd1 --- /dev/null +++ b/lib/gdeflate/TileStream.h @@ -0,0 +1,83 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Utils.h" +#include "config.h" + +#include +#include + +#include + +namespace GDeflate +{ +#pragma pack(push, 1) + struct TileStream + { + static constexpr uint32_t kMaxTiles = (1 << 16) - 1; + + uint8_t id; + uint8_t magic; + + uint16_t numTiles; + + uint32_t tileSizeIdx : 2; // this must be set to 1 + uint32_t lastTileSize : 18; + uint32_t reserved1 : 12; + + TileStream(size_t uncompressedSize) + { + memset(this, 0, sizeof(*this)); + tileSizeIdx = 1; + SetCodecId(kGDeflateId); + SetUncompressedSize(uncompressedSize); + } + + bool IsValid() const + { + return id == (magic ^ 0xff); + } + + size_t GetUncompressedSize() const + { + return numTiles * kDefaultTileSize - (lastTileSize == 0 ? 0 : kDefaultTileSize - lastTileSize); + } + + private: + void SetCodecId(uint8_t inId) + { + id = inId; + magic = inId ^ 0xff; + } + + void SetUncompressedSize(size_t size) + { + numTiles = static_cast(size / kDefaultTileSize); + lastTileSize = static_cast(size - numTiles * kDefaultTileSize); + + numTiles += lastTileSize != 0 ? 1 : 0; + } + }; + +#pragma pack(pop) + + static_assert(sizeof(TileStream) == 8, "Tile stream header size overrun!"); + +} // namespace GDeflate diff --git a/lib/gdeflate/Utils.h b/lib/gdeflate/Utils.h new file mode 100644 index 0000000..6dab640 --- /dev/null +++ b/lib/gdeflate/Utils.h @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corportaion. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace GDeflate +{ + template + static inline T _align(T a) + { + return (a + T(N) - 1) & ~(T(N) - 1); + } + + template + static inline T _divRoundup(T a, T b) + { + return (a + b - 1) / b; + } + + template + static inline uint32_t _lzCount(T a) + { + uint32_t n = 0; + + while (0 == (a & 1) && n < sizeof(T) * 8) + { + a >>= 1; + ++n; + } + + return n; + } + + template + static inline T GetBits(uint32_t*& in, uint32_t& offset, uint32_t numBitsToRead) + { + constexpr uint32_t kBitsPerBucket = sizeof(*in) * 8; + + T bits = 0; + uint32_t numBitsConsumed = 0; + while (numBitsConsumed < numBitsToRead) + { + const uint32_t numBits = + std::min(numBitsToRead - numBitsConsumed, kBitsPerBucket - (offset % kBitsPerBucket)); + + const T mask = std::numeric_limits().max() >> (sizeof(T) * 8 - numBits); + + bits |= (T(*in >> (offset % kBitsPerBucket)) & mask) << numBitsConsumed; + + offset += numBits; + numBitsConsumed += numBits; + + if (0 == offset % kBitsPerBucket) + in++; + } + + return bits; + } +} // namespace GDeflate diff --git a/lib/gdeflate/config.h b/lib/gdeflate/config.h new file mode 100644 index 0000000..99bdea4 --- /dev/null +++ b/lib/gdeflate/config.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace GDeflate +{ + static constexpr uint8_t kGDeflateId = 4; + + static const size_t kDefaultTileSize = 64 * 1024; /*!< Default tile size */ +} // namespace GDeflate \ No newline at end of file diff --git a/lib/gdeflate/libdeflate/COPYING b/lib/gdeflate/libdeflate/COPYING new file mode 100644 index 0000000..81ca509 --- /dev/null +++ b/lib/gdeflate/libdeflate/COPYING @@ -0,0 +1,36 @@ +Copyright 2016 Eric Biggers + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation files +(the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of the Software, +and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: Apache-2.0 + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/lib/gdeflate/libdeflate/common/common_defs.h b/lib/gdeflate/libdeflate/common/common_defs.h new file mode 100644 index 0000000..bd54bdf --- /dev/null +++ b/lib/gdeflate/libdeflate/common/common_defs.h @@ -0,0 +1,338 @@ +/* + * common_defs.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef COMMON_COMMON_DEFS_H +#define COMMON_COMMON_DEFS_H + +#ifdef __GNUC__ +# include "compiler_gcc.h" +#elif defined(_MSC_VER) +# include "compiler_msc.h" +#else +# pragma message("Unrecognized compiler. Please add a header file for your compiler. Compilation will proceed, but performance may suffer!") +#endif + +/* ========================================================================== */ +/* Type definitions */ +/* ========================================================================== */ + +#include /* size_t */ + +#ifndef __bool_true_false_are_defined +# include /* bool */ +#endif + +/* Fixed-width integer types */ +#include +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +/* Concatenation macros */ +#define CONCAT2(x,y) x##y +#define CONCAT(x,y) CONCAT2(x,y) + +/* + * Word type of the target architecture. Use 'size_t' instead of 'unsigned + * long' to account for platforms such as Windows that use 32-bit 'unsigned + * long' on 64-bit architectures. + */ +typedef size_t machine_word_t; + +/* Number of bytes in a word */ +#define WORDBYTES ((int)sizeof(machine_word_t)) + +/* Number of bits in a word */ +#define WORDBITS (8 * WORDBYTES) + +/* ========================================================================== */ +/* Optional compiler features */ +/* ========================================================================== */ + +/* LIBEXPORT - export a function from a shared library */ +#ifndef LIBEXPORT +# define LIBEXPORT +#endif + +/* inline - suggest that a function be inlined */ +#ifndef inline +# define inline +#endif + +/* forceinline - force a function to be inlined, if possible */ +#ifndef forceinline +# define forceinline inline +#endif + +/* restrict - annotate a non-aliased pointer */ +#ifndef restrict +# define restrict +#endif + +/* likely(expr) - hint that an expression is usually true */ +#ifndef likely +# define likely(expr) (expr) +#endif + +/* unlikely(expr) - hint that an expression is usually false */ +#ifndef unlikely +# define unlikely(expr) (expr) +#endif + +/* prefetchr(addr) - prefetch into L1 cache for read */ +#ifndef prefetchr +# define prefetchr(addr) +#endif + +/* prefetchw(addr) - prefetch into L1 cache for write */ +#ifndef prefetchw +# define prefetchw(addr) +#endif + +/* Does the compiler support the 'target' function attribute? */ +#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 +#endif + +/* Which targets are supported with the 'target' function attribute? */ +#ifndef COMPILER_SUPPORTS_BMI2_TARGET +# define COMPILER_SUPPORTS_BMI2_TARGET 0 +#endif +#ifndef COMPILER_SUPPORTS_AVX_TARGET +# define COMPILER_SUPPORTS_AVX_TARGET 0 +#endif +#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET +# define COMPILER_SUPPORTS_AVX512BW_TARGET 0 +#endif + +/* + * Which targets are supported with the 'target' function attribute and have + * intrinsics that work within 'target'-ed functions? + */ +#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0 +#endif +#ifndef COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 0 +#endif + +/* _aligned_attribute(n) - declare that the annotated variable, or variables of + * the annotated type, are to be aligned on n-byte boundaries */ +#ifndef _aligned_attribute +#endif + +/* ========================================================================== */ +/* Miscellaneous macros */ +/* ========================================================================== */ + +#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) <= (b) ? (a) : (b)) +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) + +/* ========================================================================== */ +/* Endianness handling */ +/* ========================================================================== */ + +/* + * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little + * endian or 0 if it is big endian. The macro should be defined in a way such + * that the compiler can evaluate it at compilation time. If not defined, a + * fallback is used. + */ +#ifndef CPU_IS_LITTLE_ENDIAN +static forceinline int CPU_IS_LITTLE_ENDIAN(void) +{ + union { + unsigned int v; + unsigned char b; + } u; + u.v = 1; + return u.b; +} +#endif + +/* bswap16(n) - swap the bytes of a 16-bit integer */ +#ifndef bswap16 +static forceinline u16 bswap16(u16 n) +{ + return (n << 8) | (n >> 8); +} +#endif + +/* bswap32(n) - swap the bytes of a 32-bit integer */ +#ifndef bswap32 +static forceinline u32 bswap32(u32 n) +{ + return ((n & 0x000000FF) << 24) | + ((n & 0x0000FF00) << 8) | + ((n & 0x00FF0000) >> 8) | + ((n & 0xFF000000) >> 24); +} +#endif + +/* bswap64(n) - swap the bytes of a 64-bit integer */ +#ifndef bswap64 +static forceinline u64 bswap64(u64 n) +{ + return ((n & 0x00000000000000FF) << 56) | + ((n & 0x000000000000FF00) << 40) | + ((n & 0x0000000000FF0000) << 24) | + ((n & 0x00000000FF000000) << 8) | + ((n & 0x000000FF00000000) >> 8) | + ((n & 0x0000FF0000000000) >> 24) | + ((n & 0x00FF000000000000) >> 40) | + ((n & 0xFF00000000000000) >> 56); +} +#endif + +#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n)) +#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n)) +#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n)) +#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n)) +#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n)) +#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n)) + +/* ========================================================================== */ +/* Unaligned memory accesses */ +/* ========================================================================== */ + +/* + * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses + * can be performed efficiently on the target platform. + */ +#ifndef UNALIGNED_ACCESS_IS_FAST +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* ========================================================================== */ +/* Bit scan functions */ +/* ========================================================================== */ + +/* + * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least + * significant end) of the *most* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +#ifndef bsr32 +static forceinline unsigned +bsr32(u32 n) +{ + unsigned i = 0; + while ((n >>= 1) != 0) + i++; + return i; +} +#endif + +#ifndef bsr64 +static forceinline unsigned +bsr64(u64 n) +{ + unsigned i = 0; + while ((n >>= 1) != 0) + i++; + return i; +} +#endif + +static forceinline unsigned +bsrw(machine_word_t n) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsr32(n); + else + return bsr64(n); +} + +/* + * Bit Scan Forward (BSF) - find the 0-based index (relative to the least + * significant end) of the *least* significant 1 bit in the input value. The + * input value must be nonzero! + */ + +#ifndef bsf32 +static forceinline unsigned +bsf32(u32 n) +{ + unsigned i = 0; + while ((n & 1) == 0) { + i++; + n >>= 1; + } + return i; +} +#endif + +#ifndef bsf64 +static forceinline unsigned +bsf64(u64 n) +{ + unsigned i = 0; + while ((n & 1) == 0) { + i++; + n >>= 1; + } + return i; +} +#endif + +static forceinline unsigned +bsfw(machine_word_t n) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return bsf32(n); + else + return bsf64(n); +} + +#endif /* COMMON_COMMON_DEFS_H */ diff --git a/lib/gdeflate/libdeflate/common/compiler_gcc.h b/lib/gdeflate/libdeflate/common/compiler_gcc.h new file mode 100644 index 0000000..18147a2 --- /dev/null +++ b/lib/gdeflate/libdeflate/common/compiler_gcc.h @@ -0,0 +1,217 @@ +/* + * compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang + * and the Intel C Compiler (icc). + * + * TODO: icc is not well tested, so some things are currently disabled even + * though they maybe can be enabled on some icc versions. + */ + +#if !defined(__clang__) && !defined(__INTEL_COMPILER) +# define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) +#else +# define GCC_PREREQ(major, minor) 0 +#endif + +/* Note: only check the clang version when absolutely necessary! + * "Vendors" such as Apple can use different version numbers. */ +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif + +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_feature +# define __has_feature(feature) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +#ifdef _WIN32 +# define LIBEXPORT __declspec(dllexport) +#else +# define LIBEXPORT __attribute__((visibility("default"))) +#endif + +#define inline inline +#define forceinline inline __attribute__((always_inline)) +#define restrict __restrict__ +#define likely(expr) __builtin_expect(!!(expr), 1) +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#define prefetchr(addr) __builtin_prefetch((addr), 0) +#define prefetchw(addr) __builtin_prefetch((addr), 1) +#define _aligned_attribute(n) __attribute__((aligned(n))) + +#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ + (GCC_PREREQ(4, 4) || __has_attribute(target)) + +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE + +# if defined(__i386__) || defined(__x86_64__) + +# define COMPILER_SUPPORTS_PCLMUL_TARGET \ + (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128)) + +# define COMPILER_SUPPORTS_AVX_TARGET \ + (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256)) + +# define COMPILER_SUPPORTS_BMI2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)) + +# define COMPILER_SUPPORTS_AVX2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256)) + +# define COMPILER_SUPPORTS_AVX512BW_TARGET \ + (GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512)) + + /* + * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics + * not available in the main target could not be used in 'target' + * attribute functions. Unfortunately clang has no feature test macro + * for this so we have to check its version. + */ +# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) +# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1 +# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \ + COMPILER_SUPPORTS_PCLMUL_TARGET +# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \ + COMPILER_SUPPORTS_AVX2_TARGET +# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS \ + COMPILER_SUPPORTS_AVX512BW_TARGET +# endif + +# elif defined(__arm__) || defined(__aarch64__) + + /* + * Determine whether NEON and crypto intrinsics are supported. + * + * With gcc prior to 6.1, (r230411 for arm32, r226563 for arm64), neither + * was available unless enabled in the main target. + * + * But even after that, to include (which contains both the + * basic NEON intrinsics and the crypto intrinsics) the main target still + * needs to have: + * - gcc: hardware floating point support + * - clang: NEON support (but not necessarily crypto support) + */ +# if (GCC_PREREQ(6, 1) && defined(__ARM_FP)) || \ + (defined(__clang__) && defined(__ARM_NEON)) +# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 1 + /* + * The crypto intrinsics are broken on arm32 with clang, even when using + * -mfpu=crypto-neon-fp-armv8, because clang's puts them + * behind __aarch64__. Undefine __ARM_FEATURE_CRYPTO in that case... + */ +# if defined(__clang__) && defined(__arm__) +# undef __ARM_FEATURE_CRYPTO +# elif __has_builtin(__builtin_neon_vmull_p64) || !defined(__clang__) +# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 1 +# endif +# endif + + /* + * Determine whether CRC32 intrinsics are supported. + * + * With gcc r274827 or later (gcc 10.1+, 9.3+, or 8.4+), or with clang, + * they work as expected. (Well, not quite. There's still a bug, but we + * have to work around it later when including arm_acle.h.) + */ +# if GCC_PREREQ(10, 1) || \ + (GCC_PREREQ(9, 3) && !GCC_PREREQ(10, 0)) || \ + (GCC_PREREQ(8, 4) && !GCC_PREREQ(9, 0)) || \ + (defined(__clang__) && __has_builtin(__builtin_arm_crc32b)) +# define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 1 +# endif + +# endif /* __arm__ || __aarch64__ */ + +#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */ + +/* + * Prior to gcc 5.1 and clang 3.9, emmintrin.h only defined vectors of signed + * integers (e.g. __v4si), not vectors of unsigned integers (e.g. __v4su). But + * we need the unsigned ones in order to avoid signed integer overflow, which is + * undefined behavior. Add the missing definitions for the unsigned ones if + * needed. + */ +#if (GCC_PREREQ(4, 0) && !GCC_PREREQ(5, 1)) || \ + (defined(__clang__) && !CLANG_PREREQ(3, 9, 8020000)) || \ + defined(__INTEL_COMPILER) +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef unsigned int __v4su __attribute__((__vector_size__(16))); +typedef unsigned short __v8hu __attribute__((__vector_size__(16))); +typedef unsigned char __v16qu __attribute__((__vector_size__(16))); +typedef unsigned long long __v4du __attribute__((__vector_size__(32))); +typedef unsigned int __v8su __attribute__((__vector_size__(32))); +typedef unsigned short __v16hu __attribute__((__vector_size__(32))); +typedef unsigned char __v32qu __attribute__((__vector_size__(32))); +#endif + +#ifdef __INTEL_COMPILER +typedef int __v16si __attribute__((__vector_size__(64))); +typedef short __v32hi __attribute__((__vector_size__(64))); +typedef char __v64qi __attribute__((__vector_size__(64))); +#endif + +/* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. */ +#ifdef __BYTE_ORDER__ +# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#endif + +#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +# define bswap16 __builtin_bswap16 +#endif + +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) +# define bswap32 __builtin_bswap32 +#endif + +#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) +# define bswap64 __builtin_bswap64 +#endif + +#if defined(__x86_64__) || defined(__i386__) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \ + /* + * For all compilation purposes, WebAssembly behaves like any other CPU + * instruction set. Even though WebAssembly engine might be running on top + * of different actual CPU architectures, the WebAssembly spec itself + * permits unaligned access and it will be fast on most of those platforms, + * and simulated at the engine level on others, so it's worth treating it + * as a CPU architecture with fast unaligned access. + */ defined(__wasm__) +# define UNALIGNED_ACCESS_IS_FAST 1 +#endif + +#define bsr32(n) (31 - __builtin_clz(n)) +#define bsr64(n) (63 - __builtin_clzll(n)) +#define bsf32(n) __builtin_ctz(n) +#define bsf64(n) __builtin_ctzll(n) + +/* + * Setup rotation macros similar to MSVS intrinsics. + * These should recognized by compilers. + */ +#ifndef _rotr16 +#define _rotr16(x,n) ((x>>n) + (x<<(16-n))) +#endif + +#ifndef _rotr +#define _rotr(x,n) ((x>>n) + (x<<(32-n))) +#endif + +#ifndef _rotr64 +#define _rotr64(x,n) ((x>>n) + (x<<(64-n))) +#endif diff --git a/lib/gdeflate/libdeflate/common/compiler_msc.h b/lib/gdeflate/libdeflate/common/compiler_msc.h new file mode 100644 index 0000000..18cfa12 --- /dev/null +++ b/lib/gdeflate/libdeflate/common/compiler_msc.h @@ -0,0 +1,80 @@ +/* + * compiler_msc.h - definitions for the Microsoft C Compiler + */ + +#include +#include /* for _byteswap_*() */ + +#define LIBEXPORT __declspec(dllexport) + +/* + * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h. + * Beware: the below replacement isn't fully standard, since normally any value + * != 0 should be implicitly cast to a bool with value 1... but that doesn't + * happen if bool is really just an 'int'. + */ +typedef int bool; +#define true 1 +#define false 0 +#define __bool_true_false_are_defined 1 + +/* Define ssize_t */ +#ifdef _WIN64 +typedef long long ssize_t; +#else +typedef int ssize_t; +#endif + +/* Assume a little endian architecture with fast unaligned access */ +#define CPU_IS_LITTLE_ENDIAN() 1 +#define UNALIGNED_ACCESS_IS_FAST 1 + +/* __restrict has nonstandard behavior; don't use it */ +#define restrict + +/* ... but we can use __inline and __forceinline */ +#define inline __inline +#define forceinline __forceinline + +/* Byte swap functions */ +#define bswap16 _byteswap_ushort +#define bswap32 _byteswap_ulong +#define bswap64 _byteswap_uint64 + +/* Bit scan functions (32-bit) */ + +static forceinline unsigned +bsr32(uint32_t n) +{ + _BitScanReverse(&n, n); + return n; +} +#define bsr32 bsr32 + +static forceinline unsigned +bsf32(uint32_t n) +{ + _BitScanForward(&n, n); + return n; +} +#define bsf32 bsf32 + +#ifdef _M_X64 /* Bit scan functions (64-bit) */ + +static forceinline unsigned +bsr64(uint64_t n) +{ + _BitScanReverse64(&n, n); + return n; +} +#define bsr64 bsr64 + +static forceinline unsigned +bsf64(uint64_t n) +{ + _BitScanForward64(&n, n); + return n; +} +#define bsf64 bsf64 + +#endif /* _M_X64 */ diff --git a/lib/gdeflate/libdeflate/lib/adler32.c b/lib/gdeflate/libdeflate/lib/adler32.c new file mode 100644 index 0000000..32ab0ce --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/adler32.c @@ -0,0 +1,130 @@ +/* + * adler32.c - Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" +#include "libdeflate.h" + +/* The Adler-32 divisor, or "base", value. */ +#define DIVISOR 65521 + +/* + * MAX_CHUNK_SIZE is the most bytes that can be processed without the + * possibility of s2 overflowing when it is represented as an unsigned 32-bit + * integer. This value was computed using the following Python script: + * + * divisor = 65521 + * count = 0 + * s1 = divisor - 1 + * s2 = divisor - 1 + * while True: + * s1 += 0xFF + * s2 += s1 + * if s2 > 0xFFFFFFFF: + * break + * count += 1 + * print(count) + * + * Note that to get the correct worst-case value, we must assume that every byte + * has value 0xFF and that s1 and s2 started with the highest possible values + * modulo the divisor. + */ +#define MAX_CHUNK_SIZE 5552 + +typedef u32 (*adler32_func_t)(u32, const u8 *, size_t); + +/* Include architecture-specific implementations if available */ +#undef DEFAULT_IMPL +#undef DISPATCH +#if defined(__arm__) || defined(__aarch64__) +# include "arm/adler32_impl.h" +#elif defined(__i386__) || defined(__x86_64__) +# include "x86/adler32_impl.h" +#endif + +/* Define a generic implementation if needed */ +#ifndef DEFAULT_IMPL +#define DEFAULT_IMPL adler32_generic +static u32 adler32_generic(u32 adler, const u8 *p, size_t size) +{ + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + const u8 * const end = p + size; + + while (p != end) { + size_t chunk_size = MIN(end - p, MAX_CHUNK_SIZE); + const u8 *chunk_end = p + chunk_size; + size_t num_unrolled_iterations = chunk_size / 4; + + while (num_unrolled_iterations--) { + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + s1 += *p++; + s2 += s1; + } + while (p != chunk_end) { + s1 += *p++; + s2 += s1; + } + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + return (s2 << 16) | s1; +} +#endif /* !DEFAULT_IMPL */ + +#ifdef DISPATCH +static u32 dispatch(u32, const u8 *, size_t); + +static volatile adler32_func_t adler32_impl = dispatch; + +/* Choose the fastest implementation at runtime */ +static u32 dispatch(u32 adler, const u8 *buffer, size_t size) +{ + adler32_func_t f = arch_select_adler32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + adler32_impl = f; + return adler32_impl(adler, buffer, size); +} +#else +# define adler32_impl DEFAULT_IMPL /* only one implementation, use it */ +#endif + +LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +libdeflate_adler32(u32 adler, const void *buffer, size_t size) +{ + if (buffer == NULL) /* return initial value */ + return 1; + return adler32_impl(adler, buffer, size); +} diff --git a/lib/gdeflate/libdeflate/lib/adler32_vec_template.h b/lib/gdeflate/libdeflate/lib/adler32_vec_template.h new file mode 100644 index 0000000..4eb8c2a --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/adler32_vec_template.h @@ -0,0 +1,124 @@ +/* + * adler32_vec_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file contains a template for vectorized Adler-32 implementations. + * + * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 + * implementation looks something like this: + * + * do { + * s1 += *p; + * s2 += s1; + * } while (++p != chunk_end); + * + * For vectorized calculation of s1, we only need to sum the input bytes. They + * can be accumulated into multiple counters which are eventually summed + * together. + * + * For vectorized calculation of s2, the basic idea is that for each iteration + * that processes N bytes, we can perform the following vectorizable + * calculation: + * + * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N + * + * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N + * separate counters, then do the multiplications by N...1 just once at the end + * rather than once per iteration. + * + * Also, we must account for how previous bytes will affect s2 by doing the + * following at beginning of each iteration: + * + * s2 += s1 * N + * + * Furthermore, like s1, "s2" can actually be multiple counters which are + * eventually summed together. + */ + +static u32 ATTRIBUTES +FUNCNAME(u32 adler, const u8 *p, size_t size) +{ + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + const u8 * const end = p + size; + const u8 *vend; + const size_t max_chunk_size = + MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) - + (MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) % + IMPL_SEGMENT_SIZE); + + /* Process a byte at a time until the needed alignment is reached */ + if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { + do { + s1 += *p++; + s2 += s1; + } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + /* + * Process "chunks" of bytes using vector instructions. Chunk sizes are + * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never + * overflow before being reduced modulo DIVISOR. For vector processing, + * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and + * may be further limited to IMPL_MAX_CHUNK_SIZE. + */ + STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0); + vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE); + while (p != vend) { + size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size); + + s2 += s1 * chunk_size; + + FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size), + &s1, &s2); + + p += chunk_size; + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + /* Process any remaining bytes */ + if (p != end) { + do { + s1 += *p++; + s2 += s1; + } while (p != end); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + return (s2 << 16) | s1; +} + +#undef FUNCNAME +#undef FUNCNAME_CHUNK +#undef ATTRIBUTES +#undef IMPL_ALIGNMENT +#undef IMPL_SEGMENT_SIZE +#undef IMPL_MAX_CHUNK_SIZE diff --git a/lib/gdeflate/libdeflate/lib/arm/adler32_impl.h b/lib/gdeflate/libdeflate/lib/arm/adler32_impl.h new file mode 100644 index 0000000..17e56c0 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/arm/adler32_impl.h @@ -0,0 +1,125 @@ +/* + * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_ARM_ADLER32_IMPL_H +#define LIB_ARM_ADLER32_IMPL_H + +#include "cpu_features.h" + +/* NEON implementation */ +#undef DISPATCH_NEON +#if !defined(DEFAULT_IMPL) && \ + (defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS)) +# define FUNCNAME adler32_neon +# define FUNCNAME_CHUNK adler32_neon_chunk +# define IMPL_ALIGNMENT 16 +# define IMPL_SEGMENT_SIZE 32 +/* Prevent unsigned overflow of the 16-bit precision byte counters */ +# define IMPL_MAX_CHUNK_SIZE (32 * (0xFFFF / 0xFF)) +# ifdef __ARM_NEON +# define ATTRIBUTES +# define DEFAULT_IMPL adler32_neon +# else +# ifdef __arm__ +# define ATTRIBUTES __attribute__((target("fpu=neon"))) +# else +# define ATTRIBUTES __attribute__((target("+simd"))) +# endif +# define DISPATCH 1 +# define DISPATCH_NEON 1 +# endif +# include +static forceinline ATTRIBUTES void +adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, + u32 *s1, u32 *s2) +{ + uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; + + do { + const uint8x16_t bytes1 = *p++; + const uint8x16_t bytes2 = *p++; + uint16x8_t tmp; + + v_s2 += v_s1; + + /* Vector Pairwise Add Long (u8 => u16) */ + tmp = vpaddlq_u8(bytes1); + + /* Vector Pairwise Add and Accumulate Long (u8 => u16) */ + tmp = vpadalq_u8(tmp, bytes2); + + /* Vector Pairwise Add and Accumulate Long (u16 => u32) */ + v_s1 = vpadalq_u16(v_s1, tmp); + + /* Vector Add Wide (u8 => u16) */ + v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1)); + v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1)); + v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2)); + v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2)); + + } while (p != end); + + /* Vector Shift Left (u32) */ + v_s2 = vqshlq_n_u32(v_s2, 5); + + /* Vector Multiply Accumulate Long (u16 => u32) */ + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 }); + v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 }); + v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 }); + + *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; + *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; +} +# include "../adler32_vec_template.h" +#endif /* NEON implementation */ + +#ifdef DISPATCH +static inline adler32_func_t +arch_select_adler32_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_NEON + if (features & ARM_CPU_FEATURE_NEON) + return adler32_neon; +#endif + return NULL; +} +#endif /* DISPATCH */ + +#endif /* LIB_ARM_ADLER32_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/arm/cpu_features.c b/lib/gdeflate/libdeflate/lib/arm/cpu_features.c new file mode 100644 index 0000000..60b1be3 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/arm/cpu_features.c @@ -0,0 +1,133 @@ +/* + * arm/cpu_features.c - feature detection for ARM processors + * + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * ARM processors don't have a standard way for unprivileged programs to detect + * processor features. But, on Linux we can read the AT_HWCAP and AT_HWCAP2 + * values from /proc/self/auxv. + * + * Ideally we'd use the C library function getauxval(), but it's not guaranteed + * to be available: it was only added to glibc in 2.16, and in Android it was + * added to API level 18 for ARM and level 21 for AArch64. + */ + +#include "../cpu_features_common.h" /* must be included first */ +#include "cpu_features.h" + +#if ARM_CPU_FEATURES_ENABLED + +#include +#include +#include +#include + +#define AT_HWCAP 16 +#define AT_HWCAP2 26 + +volatile u32 _cpu_features = 0; + +static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) +{ + int fd; + unsigned long auxbuf[32]; + int filled = 0; + int i; + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) + return; + + for (;;) { + do { + int ret = read(fd, &((char *)auxbuf)[filled], + sizeof(auxbuf) - filled); + if (ret <= 0) { + if (ret < 0 && errno == EINTR) + continue; + goto out; + } + filled += ret; + } while (filled < 2 * sizeof(long)); + + i = 0; + do { + unsigned long type = auxbuf[i]; + unsigned long value = auxbuf[i + 1]; + + if (type == AT_HWCAP) + *hwcap = value; + else if (type == AT_HWCAP2) + *hwcap2 = value; + i += 2; + filled -= 2 * sizeof(long); + } while (filled >= 2 * sizeof(long)); + + memmove(auxbuf, &auxbuf[i], filled); + } +out: + close(fd); +} + +static const struct cpu_feature arm_cpu_feature_table[] = { + {ARM_CPU_FEATURE_NEON, "neon"}, + {ARM_CPU_FEATURE_PMULL, "pmull"}, + {ARM_CPU_FEATURE_CRC32, "crc32"}, +}; + +void setup_cpu_features(void) +{ + u32 features = 0; + unsigned long hwcap = 0; + unsigned long hwcap2 = 0; + + scan_auxv(&hwcap, &hwcap2); + +#ifdef __arm__ + STATIC_ASSERT(sizeof(long) == 4); + if (hwcap & (1 << 12)) /* HWCAP_NEON */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; +#else + STATIC_ASSERT(sizeof(long) == 8); + if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap & (1 << 4)) /* HWCAP_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; +#endif + + disable_cpu_features_for_testing(&features, arm_cpu_feature_table, + ARRAY_LEN(arm_cpu_feature_table)); + + _cpu_features = features | ARM_CPU_FEATURES_KNOWN; +} + +#endif /* ARM_CPU_FEATURES_ENABLED */ diff --git a/lib/gdeflate/libdeflate/lib/arm/cpu_features.h b/lib/gdeflate/libdeflate/lib/arm/cpu_features.h new file mode 100644 index 0000000..69d7235 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/arm/cpu_features.h @@ -0,0 +1,40 @@ +/* + * arm/cpu_features.h - feature detection for ARM processors + */ + +#ifndef LIB_ARM_CPU_FEATURES_H +#define LIB_ARM_CPU_FEATURES_H + +#include "../lib_common.h" + +#if (defined(__arm__) || defined(__aarch64__)) && \ + defined(__linux__) && \ + COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ + !defined(FREESTANDING) +# define ARM_CPU_FEATURES_ENABLED 1 +#else +# define ARM_CPU_FEATURES_ENABLED 0 +#endif + +#if ARM_CPU_FEATURES_ENABLED + +#define ARM_CPU_FEATURE_NEON 0x00000001 +#define ARM_CPU_FEATURE_PMULL 0x00000002 +#define ARM_CPU_FEATURE_CRC32 0x00000004 + +#define ARM_CPU_FEATURES_KNOWN 0x80000000 + +extern volatile u32 _cpu_features; + +void setup_cpu_features(void); + +static inline u32 get_cpu_features(void) +{ + if (_cpu_features == 0) + setup_cpu_features(); + return _cpu_features; +} + +#endif /* ARM_CPU_FEATURES_ENABLED */ + +#endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/lib/gdeflate/libdeflate/lib/arm/crc32_impl.h b/lib/gdeflate/libdeflate/lib/arm/crc32_impl.h new file mode 100644 index 0000000..238a85a --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/arm/crc32_impl.h @@ -0,0 +1,247 @@ +/* + * arm/crc32_impl.h + * + * Copyright 2017 Jun He + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_ARM_CRC32_IMPL_H +#define LIB_ARM_CRC32_IMPL_H + +#include "cpu_features.h" + +/* Implementation using ARM CRC32 instructions */ +#undef DISPATCH_ARM +#if !defined(DEFAULT_IMPL) && \ + (defined(__ARM_FEATURE_CRC32) || \ + (ARM_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS)) +# ifdef __ARM_FEATURE_CRC32 +# define ATTRIBUTES +# define DEFAULT_IMPL crc32_arm +# else +# ifdef __arm__ +# ifdef __clang__ +# define ATTRIBUTES __attribute__((target("armv8-a,crc"))) +# else +# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc"))) +# endif +# else +# ifdef __clang__ +# define ATTRIBUTES __attribute__((target("crc"))) +# else +# define ATTRIBUTES __attribute__((target("+crc"))) +# endif +# endif +# define DISPATCH 1 +# define DISPATCH_ARM 1 +# endif + +/* + * gcc's (as of 10.1) version of arm_acle.h for arm32, and clang's (as of + * 10.0.1) version of arm_acle.h for both arm32 and arm64, have a bug where they + * only define the CRC32 functions like __crc32b() when __ARM_FEATURE_CRC32 is + * defined. That prevents them from being used via __attribute__((target)) when + * the main target doesn't have CRC32 support enabled. The actual built-ins + * like __builtin_arm_crc32b() are available and work, however; it's just the + * wrappers in arm_acle.h like __crc32b() that erroneously don't get defined. + * Work around this by manually defining __ARM_FEATURE_CRC32. + */ +#ifndef __ARM_FEATURE_CRC32 +# define __ARM_FEATURE_CRC32 1 +#endif +#include + +static u32 ATTRIBUTES +crc32_arm(u32 remainder, const u8 *p, size_t size) +{ + while (size != 0 && (uintptr_t)p & 7) { + remainder = __crc32b(remainder, *p++); + size--; + } + + while (size >= 32) { + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 0))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 1))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 2))); + remainder = __crc32d(remainder, le64_bswap(*((u64 *)p + 3))); + p += 32; + size -= 32; + } + + while (size >= 8) { + remainder = __crc32d(remainder, le64_bswap(*(u64 *)p)); + p += 8; + size -= 8; + } + + while (size != 0) { + remainder = __crc32b(remainder, *p++); + size--; + } + + return remainder; +} +#undef ATTRIBUTES +#endif /* Implementation using ARM CRC32 instructions */ + +/* + * CRC-32 folding with ARM Crypto extension-PMULL + * + * This works the same way as the x86 PCLMUL version. + * See x86/crc32_pclmul_template.h for an explanation. + */ +#undef DISPATCH_PMULL +#if !defined(DEFAULT_IMPL) && \ + (defined(__ARM_FEATURE_CRYPTO) || \ + (ARM_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \ + /* not yet tested on big endian, probably needs changes to work there */ \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define FUNCNAME crc32_pmull +# define FUNCNAME_ALIGNED crc32_pmull_aligned +# ifdef __ARM_FEATURE_CRYPTO +# define ATTRIBUTES +# define DEFAULT_IMPL crc32_pmull +# else +# ifdef __arm__ +# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8"))) +# else +# ifdef __clang__ +# define ATTRIBUTES __attribute__((target("crypto"))) +# else +# define ATTRIBUTES __attribute__((target("+crypto"))) +# endif +# endif +# define DISPATCH 1 +# define DISPATCH_PMULL 1 +# endif + +#include + +static forceinline ATTRIBUTES uint8x16_t +clmul_00(uint8x16_t a, uint8x16_t b) +{ + return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a), + (poly64_t)vget_low_u8(b)); +} + +static forceinline ATTRIBUTES uint8x16_t +clmul_10(uint8x16_t a, uint8x16_t b) +{ + return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a), + (poly64_t)vget_high_u8(b)); +} + +static forceinline ATTRIBUTES uint8x16_t +clmul_11(uint8x16_t a, uint8x16_t b) +{ + return (uint8x16_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b); +} + +static forceinline ATTRIBUTES uint8x16_t +fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers) +{ + return dst ^ clmul_00(src, multipliers) ^ clmul_11(src, multipliers); +} + +static forceinline ATTRIBUTES u32 +crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs) +{ + /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */ + const uint8x16_t multipliers_4 = + (uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 }; + const uint8x16_t multipliers_1 = + (uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E }; + const uint8x16_t final_multiplier = + (uint8x16_t)(uint64x2_t){ 0xB8BC6765 }; + const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF }; + const uint8x16_t barrett_reduction_constants = + (uint8x16_t)(uint64x2_t){ 0x00000001F7011641, + 0x00000001DB710641 }; + const uint8x16_t zeroes = (uint8x16_t){ 0 }; + + const uint8x16_t * const end = p + nr_segs; + const uint8x16_t * const end512 = p + (nr_segs & ~3); + uint8x16_t x0, x1, x2, x3; + + x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder }; + if (nr_segs >= 4) { + x1 = *p++; + x2 = *p++; + x3 = *p++; + + /* Fold 512 bits at a time */ + while (p != end512) { + x0 = fold_128b(*p++, x0, multipliers_4); + x1 = fold_128b(*p++, x1, multipliers_4); + x2 = fold_128b(*p++, x2, multipliers_4); + x3 = fold_128b(*p++, x3, multipliers_4); + } + + /* Fold 512 bits => 128 bits */ + x1 = fold_128b(x1, x0, multipliers_1); + x2 = fold_128b(x2, x1, multipliers_1); + x0 = fold_128b(x3, x2, multipliers_1); + } + + /* Fold 128 bits at a time */ + while (p != end) + x0 = fold_128b(*p++, x0, multipliers_1); + + /* Fold 128 => 96 bits, implicitly appending 32 zeroes */ + x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1); + + /* Fold 96 => 64 bits */ + x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier); + + /* Reduce 64 => 32 bits using Barrett reduction */ + x1 = x0; + x0 = clmul_00(x0 & mask32, barrett_reduction_constants); + x0 = clmul_10(x0 & mask32, barrett_reduction_constants); + return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1); +} +#define IMPL_ALIGNMENT 16 +#define IMPL_SEGMENT_SIZE 16 +#include "../crc32_vec_template.h" +#endif /* PMULL implementation */ + +#ifdef DISPATCH +static inline crc32_func_t +arch_select_crc32_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_ARM + if (features & ARM_CPU_FEATURE_CRC32) + return crc32_arm; +#endif +#ifdef DISPATCH_PMULL + if (features & ARM_CPU_FEATURE_PMULL) + return crc32_pmull; +#endif + return NULL; +} +#endif /* DISPATCH */ + +#endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/arm/matchfinder_impl.h b/lib/gdeflate/libdeflate/lib/arm/matchfinder_impl.h new file mode 100644 index 0000000..da0d2fd --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/arm/matchfinder_impl.h @@ -0,0 +1,86 @@ +/* + * arm/matchfinder_impl.h - ARM implementations of matchfinder functions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_ARM_MATCHFINDER_IMPL_H +#define LIB_ARM_MATCHFINDER_IMPL_H + +#ifdef __ARM_NEON +# include +static forceinline void +matchfinder_init_neon(mf_pos_t *data, size_t size) +{ + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = (int16x8_t) { + MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, + MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, + MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, + }; + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_neon + +static forceinline void +matchfinder_rebase_neon(mf_pos_t *data, size_t size) +{ + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = (int16x8_t) { + (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, + (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, + (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, + (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, + }; + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = vqaddq_s16(p[0], v); + p[1] = vqaddq_s16(p[1], v); + p[2] = vqaddq_s16(p[2], v); + p[3] = vqaddq_s16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_neon + +#endif /* __ARM_NEON */ + +#endif /* LIB_ARM_MATCHFINDER_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/bt_matchfinder.h b/lib/gdeflate/libdeflate/lib/bt_matchfinder.h new file mode 100644 index 0000000..a994a57 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/bt_matchfinder.h @@ -0,0 +1,363 @@ +/* + * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 4 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. + * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_BT_MATCHFINDER_H +#define LIB_BT_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define BT_MATCHFINDER_HASH3_ORDER 16 +#define BT_MATCHFINDER_HASH3_WAYS 2 +#define BT_MATCHFINDER_HASH4_ORDER 16 + +#define BT_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ + (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +/* Representation of a match found by the bt_matchfinder */ +struct lz_match { + + /* The number of bytes matched. */ + u16 length; + + /* The offset back from the current position that was matched. */ + u16 offset; +}; + +struct bt_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; + + /* The hash table which contains the roots of the binary trees for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ + mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; + +} +#ifdef _aligned_attribute +_aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) +#endif +; + +/* Prepare the matchfinder for a new input buffer. */ +static forceinline void +bt_matchfinder_init(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static forceinline void +bt_matchfinder_slide_window(struct bt_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +static forceinline mf_pos_t * +bt_left_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0]; +} + +static forceinline mf_pos_t * +bt_right_child(struct bt_matchfinder *mf, s32 node) +{ + return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1]; +} + +/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() + * and bt_matchfinder_skip_position(). There must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define BT_MATCHFINDER_REQUIRED_NBYTES 5 + +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. @record_matches should be a compile-time constant. */ +static forceinline struct lz_match * +bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf, + const u8 * const restrict in_base, + const ptrdiff_t cur_pos, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const restrict next_hashes, + u32 * const restrict best_len_ret, + struct lz_match * restrict lz_matchptr, + const bool record_matches) +{ + const u8 *in_next = in_base + cur_pos; + u32 depth_remaining = max_search_depth; + const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + u32 next_hashseq; + u32 hash3; + u32 hash4; + s32 cur_node; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + s32 cur_node_2; +#endif + const u8 *matchptr; + mf_pos_t *pending_lt_ptr, *pending_gt_ptr; + u32 best_lt_len, best_gt_len; + u32 len; + u32 best_len = 3; + + STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && + BT_MATCHFINDER_HASH3_WAYS <= 2); + + next_hashseq = get_unaligned_le32(in_next + 1); + + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + cur_node = mf->hash3_tab[hash3][0]; + mf->hash3_tab[hash3][0] = cur_pos; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + cur_node_2 = mf->hash3_tab[hash3][1]; + mf->hash3_tab[hash3][1] = cur_node; +#endif + if (record_matches && cur_node > cutoff) { + u32 seq3 = load_u24_unaligned(in_next); + if (seq3 == load_u24_unaligned(&in_base[cur_node])) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node]; + lz_matchptr++; + } + #if BT_MATCHFINDER_HASH3_WAYS >= 2 + else if (cur_node_2 > cutoff && + seq3 == load_u24_unaligned(&in_base[cur_node_2])) + { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_base[cur_node_2]; + lz_matchptr++; + } + #endif + } + + cur_node = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + pending_lt_ptr = bt_left_child(mf, cur_pos); + pending_gt_ptr = bt_right_child(mf, cur_pos); + + if (cur_node <= cutoff) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + *best_len_ret = best_len; + return lz_matchptr; + } + + best_lt_len = 0; + best_gt_len = 0; + len = 0; + + for (;;) { + matchptr = &in_base[cur_node]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, max_len); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = len; + lz_matchptr->offset = in_next - matchptr; + lz_matchptr++; + } + if (len >= nice_len) { + *pending_lt_ptr = *bt_left_child(mf, cur_node); + *pending_gt_ptr = *bt_right_child(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_node; + pending_lt_ptr = bt_right_child(mf, cur_node); + cur_node = *pending_lt_ptr; + best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; + } else { + *pending_gt_ptr = cur_node; + pending_gt_ptr = bt_left_child(mf, cur_node); + cur_node = *pending_gt_ptr; + best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (cur_node <= cutoff || !--depth_remaining) { + *pending_lt_ptr = MATCHFINDER_INITVAL; + *pending_gt_ptr = MATCHFINDER_INITVAL; + *best_len_ret = best_len; + return lz_matchptr; + } + } +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_base + * Pointer to the next byte in the input buffer to process _at the last + * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. + * @cur_pos + * The current position in the input buffer relative to @in_base (the + * position of the sequence being matched against). + * @max_len + * The maximum permissible match length at this position. Must be >= + * BT_MATCHFINDER_REQUIRED_NBYTES. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @best_len_ret + * If a match of length >= 4 was found, then the length of the longest such + * match is written here; otherwise 3 is written here. (Note: this is + * redundant with the 'struct lz_match' array, but this is easier for the + * compiler to optimize when inlined and the caller immediately does a + * check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and (non-strictly) + * increasing offset. The maximum number of matches that may be found is + * 'nice_len - 2'. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static forceinline struct lz_match * +bt_matchfinder_get_matches(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 max_len, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2], + u32 *best_len_ret, + struct lz_match *lz_matchptr) +{ + return bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + max_len, + nice_len, + max_search_depth, + next_hashes, + best_len_ret, + lz_matchptr, + true); +} + +/* + * Advance the matchfinder, but don't record any matches. + * + * This is very similar to bt_matchfinder_get_matches() because both functions + * must do hashing and tree re-rooting. + */ +static forceinline void +bt_matchfinder_skip_position(struct bt_matchfinder *mf, + const u8 *in_base, + ptrdiff_t cur_pos, + u32 nice_len, + u32 max_search_depth, + u32 next_hashes[2]) +{ + u32 best_len; + bt_matchfinder_advance_one_byte(mf, + in_base, + cur_pos, + nice_len, + nice_len, + max_search_depth, + next_hashes, + &best_len, + NULL, + false); +} + +#endif /* LIB_BT_MATCHFINDER_H */ diff --git a/lib/gdeflate/libdeflate/lib/cpu_features_common.h b/lib/gdeflate/libdeflate/lib/cpu_features_common.h new file mode 100644 index 0000000..570b62d --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/cpu_features_common.h @@ -0,0 +1,88 @@ +/* + * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c + * + * Copyright 2020 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_CPU_FEATURES_COMMON_H +#define LIB_CPU_FEATURES_COMMON_H + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) +# define _GNU_SOURCE 1 /* for strdup() and strtok_r() */ +# include +# include +# include +#endif + +#include "lib_common.h" + +struct cpu_feature { + u32 bit; + const char *name; +}; + +#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) +/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ + char *env_value, *strbuf, *p, *saveptr = NULL; + size_t i; + + env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); + if (!env_value) + return; + strbuf = strdup(env_value); + if (!strbuf) + abort(); + p = strtok_r(strbuf, ",", &saveptr); + while (p) { + for (i = 0; i < feature_table_length; i++) { + if (strcmp(p, feature_table[i].name) == 0) { + *features &= ~feature_table[i].bit; + break; + } + } + if (i == feature_table_length) { + fprintf(stderr, + "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", + p); + abort(); + } + p = strtok_r(NULL, ",", &saveptr); + } + free(strbuf); +} +#else /* TEST_SUPPORT__DO_NOT_USE */ +static inline void +disable_cpu_features_for_testing(u32 *features, + const struct cpu_feature *feature_table, + size_t feature_table_length) +{ +} +#endif /* !TEST_SUPPORT__DO_NOT_USE */ + +#endif /* LIB_CPU_FEATURES_COMMON_H */ diff --git a/lib/gdeflate/libdeflate/lib/crc32.c b/lib/gdeflate/libdeflate/lib/crc32.c new file mode 100644 index 0000000..6adacc5 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/crc32.c @@ -0,0 +1,313 @@ +/* + * crc32.c - CRC-32 checksum algorithm for the gzip format + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * High-level description of CRC + * ============================= + * + * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message" + * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), + * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: + * + * R(x) = M(x)*x^n mod G(x) + * + * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder + * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) + * interpreted as a bitstring of length 'n'. + * + * CRC used in gzip + * ================ + * + * In the gzip format (RFC 1952): + * + * - The bitstring to checksum is formed from the bytes of the uncompressed + * data by concatenating the bits from the bytes in order, proceeding + * from the low-order bit to the high-order bit within each byte. + * + * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + + * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. + * Consequently, the CRC length is 32 bits ("CRC-32"). + * + * - The highest order 32 coefficients of M(x)*x^n are inverted. + * + * - All 32 coefficients of R(x) are inverted. + * + * The two inversions cause added leading and trailing zero bits to affect the + * resulting CRC, whereas with a regular CRC such bits would have no effect on + * the CRC. + * + * Computation and optimizations + * ============================= + * + * We can compute R(x) through "long division", maintaining only 32 bits of + * state at any given time. Multiplication by 'x' can be implemented as + * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the + * highest order bit represents the coefficient of x^0), and both addition and + * subtraction can be implemented as bitwise exclusive OR (since we are working + * in GF(2)). Here is an unoptimized implementation: + * + * static u32 crc32_gzip(const u8 *buffer, size_t size) + * { + * u32 remainder = 0; + * const u32 divisor = 0xEDB88320; + * + * for (size_t i = 0; i < size * 8 + 32; i++) { + * int bit; + * u32 multiple; + * + * if (i < size * 8) + * bit = (buffer[i / 8] >> (i % 8)) & 1; + * else + * bit = 0; // one of the 32 appended 0 bits + * + * if (i < 32) // the first 32 bits are inverted + * bit ^= 1; + * + * if (remainder & 1) + * multiple = divisor; + * else + * multiple = 0; + * + * remainder >>= 1; + * remainder |= (u32)bit << 31; + * remainder ^= multiple; + * } + * + * return ~remainder; + * } + * + * In this implementation, the 32-bit integer 'remainder' maintains the + * remainder of the currently processed portion of the message (with 32 zero + * bits appended) when divided by the generator polynomial. 'remainder' is the + * representation of R(x), and 'divisor' is the representation of G(x) excluding + * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1', + * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero + * x^32 term, then we subtract G(x) from R(x). + * + * We can speed this up by taking advantage of the fact that XOR is commutative + * and associative, so the order in which we combine the inputs into 'remainder' + * is unimportant. And since each message bit we add doesn't affect the choice + * of 'multiple' until 32 bits later, we need not actually add each message bit + * until that point: + * + * static u32 crc32_gzip(const u8 *buffer, size_t size) + * { + * u32 remainder = ~0; + * const u32 divisor = 0xEDB88320; + * + * for (size_t i = 0; i < size * 8; i++) { + * int bit; + * u32 multiple; + * + * bit = (buffer[i / 8] >> (i % 8)) & 1; + * remainder ^= bit; + * if (remainder & 1) + * multiple = divisor; + * else + * multiple = 0; + * remainder >>= 1; + * remainder ^= multiple; + * } + * + * return ~remainder; + * } + * + * With the above implementation we get the effect of 32 appended 0 bits for + * free; they never affect the choice of a divisor, nor would they change the + * value of 'remainder' if they were to be actually XOR'ed in. And by starting + * with a remainder of all 1 bits, we get the effect of complementing the first + * 32 message bits. + * + * The next optimization is to process the input in multi-bit units. Suppose + * that we insert the next 'n' message bits into the remainder. Then we get an + * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n' + * bits is the amount by which the low 32 bits of the remainder will change as a + * result of cancelling out those 'n' bits. Taking n=8 (one byte) and + * precomputing a table containing the CRC of each possible byte, we get + * crc32_slice1() defined below. + * + * As a further optimization, we could increase the multi-bit unit size to 16. + * However, that is inefficient because the table size explodes from 256 entries + * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't + * fit in L1 cache on typical processors. + * + * However, we can actually process 4 bytes at a time using 4 different tables + * with 256 entries each. Logically, we form a 64-bit intermediate remainder + * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled + * out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the + * CRC of those bits with 8 zero bits appended, and so on. This method is + * implemented in crc32_slice4(), defined below. + * + * In crc32_slice8(), this method is extended to 8 bytes at a time. The + * intermediate remainder (which we never actually store explicitly) is 96 bits. + * + * On CPUs that support fast carryless multiplication, CRCs can be computed even + * more quickly via "folding". See e.g. the x86 PCLMUL implementation. + */ + +#include "lib_common.h" +#include "libdeflate.h" + +typedef u32 (*crc32_func_t)(u32, const u8 *, size_t); + +/* Include architecture-specific implementations if available */ +#undef CRC32_SLICE1 +#undef CRC32_SLICE4 +#undef CRC32_SLICE8 +#undef DEFAULT_IMPL +#undef DISPATCH +#if defined(__arm__) || defined(__aarch64__) +# include "arm/crc32_impl.h" +#elif defined(__i386__) || defined(__x86_64__) +# include "x86/crc32_impl.h" +#endif + +/* + * Define a generic implementation (crc32_slice8()) if needed. crc32_slice1() + * may also be needed as a fallback for architecture-specific implementations. + */ + +#ifndef DEFAULT_IMPL +# define CRC32_SLICE8 1 +# define DEFAULT_IMPL crc32_slice8 +#endif + +#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8) +#include "crc32_table.h" +static forceinline u32 +crc32_update_byte(u32 remainder, u8 next_byte) +{ + return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte]; +} +#endif + +#ifdef CRC32_SLICE1 +static u32 +crc32_slice1(u32 remainder, const u8 *buffer, size_t size) +{ + size_t i; + + STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100); + + for (i = 0; i < size; i++) + remainder = crc32_update_byte(remainder, buffer[i]); + return remainder; +} +#endif /* CRC32_SLICE1 */ + +#ifdef CRC32_SLICE4 +static u32 +crc32_slice4(u32 remainder, const u8 *buffer, size_t size) +{ + const u8 *p = buffer; + const u8 *end = buffer + size; + const u8 *end32; + + STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400); + + for (; ((uintptr_t)p & 3) && p != end; p++) + remainder = crc32_update_byte(remainder, *p); + + end32 = p + ((end - p) & ~3); + for (; p != end32; p += 4) { + u32 v = le32_bswap(*(const u32 *)p); + remainder = + crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^ + crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^ + crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^ + crc32_table[0x000 + (u8)((remainder ^ v) >> 24)]; + } + + for (; p != end; p++) + remainder = crc32_update_byte(remainder, *p); + + return remainder; +} +#endif /* CRC32_SLICE4 */ + +#ifdef CRC32_SLICE8 +static u32 +crc32_slice8(u32 remainder, const u8 *buffer, size_t size) +{ + const u8 *p = buffer; + const u8 *end = buffer + size; + const u8 *end64; + + STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800); + + for (; ((uintptr_t)p & 7) && p != end; p++) + remainder = crc32_update_byte(remainder, *p); + + end64 = p + ((end - p) & ~7); + for (; p != end64; p += 8) { + u32 v1 = le32_bswap(*(const u32 *)(p + 0)); + u32 v2 = le32_bswap(*(const u32 *)(p + 4)); + remainder = + crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^ + crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^ + crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^ + crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^ + crc32_table[0x300 + (u8)(v2 >> 0)] ^ + crc32_table[0x200 + (u8)(v2 >> 8)] ^ + crc32_table[0x100 + (u8)(v2 >> 16)] ^ + crc32_table[0x000 + (u8)(v2 >> 24)]; + } + + for (; p != end; p++) + remainder = crc32_update_byte(remainder, *p); + + return remainder; +} +#endif /* CRC32_SLICE8 */ + +#ifdef DISPATCH +static u32 dispatch(u32, const u8 *, size_t); + +static volatile crc32_func_t crc32_impl = dispatch; + +/* Choose the fastest implementation at runtime */ +static u32 dispatch(u32 remainder, const u8 *buffer, size_t size) +{ + crc32_func_t f = arch_select_crc32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + crc32_impl = f; + return crc32_impl(remainder, buffer, size); +} +#else +# define crc32_impl DEFAULT_IMPL /* only one implementation, use it */ +#endif + +LIBDEFLATEEXPORT u32 LIBDEFLATEAPI +libdeflate_crc32(u32 remainder, const void *buffer, size_t size) +{ + if (buffer == NULL) /* return initial value */ + return 0; + return ~crc32_impl(~remainder, buffer, size); +} diff --git a/lib/gdeflate/libdeflate/lib/crc32_table.h b/lib/gdeflate/libdeflate/lib/crc32_table.h new file mode 100644 index 0000000..05421b9 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/crc32_table.h @@ -0,0 +1,526 @@ +/* + * crc32_table.h - data table to accelerate CRC-32 computation + * + * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT. + */ + +#include + +static const uint32_t crc32_table[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, +#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8) + 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, + 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, + 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, + 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, + 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, + 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, + 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, + 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, + 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, + 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, + 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, + 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, + 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, + 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, + 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, + 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, + 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, + 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, + 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, + 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, + 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, + 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, + 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, + 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, + 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, + 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, + 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, + 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, + 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, + 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, + 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, + 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, + 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, + 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, + 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, + 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, + 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, + 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, + 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, + 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, + 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, + 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, + 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, + 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, + 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, + 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, + 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, + 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, + 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, + 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, + 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, + 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, + 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, + 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, + 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, + 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, + 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, + 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, + 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, + 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, + 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, + 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, + 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, + 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, + 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, + 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, + 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, + 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, + 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, + 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, + 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, + 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, + 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, + 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, + 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, + 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, + 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, + 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, + 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, + 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, + 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, + 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, + 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, + 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, + 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, + 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, + 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, + 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, + 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, + 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, + 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, + 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, + 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, + 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, + 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, + 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, + 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, + 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, + 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, + 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, + 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, + 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, + 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, + 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, + 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, + 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, + 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, + 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, + 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, + 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, + 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, + 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, + 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, + 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, + 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, + 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, + 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, + 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, + 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, + 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, + 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, + 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, + 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, + 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, + 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, + 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, + 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, + 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, + 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, + 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, + 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, + 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, + 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, + 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, + 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, + 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, + 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, + 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, + 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, + 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, + 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, + 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, + 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, + 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, + 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, + 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, + 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, + 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, + 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, + 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, + 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, + 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, + 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, + 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, + 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, + 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, + 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, + 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, + 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, + 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, + 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, + 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, + 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, + 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, + 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, + 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, + 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, + 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, + 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, + 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, + 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, + 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, + 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, + 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, + 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, + 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, + 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, + 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, + 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, + 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, + 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, + 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, + 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, + 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, + 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, + 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, + 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, + 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, + 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, + 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, + 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, + 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, +#endif /* CRC32_SLICE4 || CRC32_SLICE8 */ +#if defined(CRC32_SLICE8) + 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, + 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, + 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, + 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, + 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, + 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, + 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, + 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, + 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, + 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, + 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, + 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, + 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, + 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, + 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, + 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, + 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, + 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, + 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, + 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, + 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, + 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, + 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, + 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, + 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, + 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, + 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, + 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, + 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, + 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, + 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, + 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, + 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, + 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, + 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, + 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, + 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, + 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, + 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, + 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, + 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, + 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, + 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, + 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, + 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, + 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, + 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, + 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, + 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, + 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, + 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, + 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, + 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, + 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, + 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, + 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, + 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, + 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, + 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, + 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, + 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, + 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, + 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, + 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, + 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, + 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, + 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, + 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, + 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, + 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, + 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, + 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, + 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, + 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, + 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, + 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, + 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, + 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, + 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, + 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, + 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, + 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, + 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, + 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, + 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, + 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, + 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, + 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, + 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, + 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, + 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, + 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, + 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, + 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, + 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, + 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, + 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, + 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, + 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, + 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, + 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, + 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, + 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, + 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, + 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, + 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, + 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, + 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, + 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, + 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, + 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, + 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, + 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, + 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, + 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, + 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, + 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, + 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, + 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, + 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, + 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, + 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, + 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, + 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, + 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, + 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, + 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, + 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, + 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, + 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, + 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, + 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, + 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, + 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, + 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, + 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, + 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, + 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, + 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, + 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, + 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, + 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, + 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, + 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, + 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, + 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, + 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, + 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, + 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, + 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, + 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, + 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, + 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, + 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, + 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, + 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, + 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, + 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, + 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, + 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, + 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, + 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, + 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, + 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, + 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, + 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, + 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, + 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, + 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, + 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, + 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, + 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, + 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, + 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, + 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, + 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, + 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, + 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, + 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, + 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, + 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, + 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, + 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, + 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, + 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, + 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, + 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, + 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, + 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, + 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, + 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, + 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, + 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, + 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, + 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, + 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, + 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, + 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, + 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, + 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, + 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, + 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, + 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, + 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, + 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, + 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, + 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, + 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, + 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, + 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, + 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, + 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, + 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, + 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, + 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, + 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, + 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, + 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, + 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, + 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, + 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, + 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, + 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, + 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, + 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, + 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, + 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, + 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, + 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, + 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, + 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, + 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, + 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, + 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, + 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, + 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, + 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, + 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, + 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, + 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, + 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, + 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, + 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, + 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, + 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, + 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, + 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, + 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, + 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, + 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, + 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, + 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, + 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, + 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, + 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, + 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, +#endif /* CRC32_SLICE8 */ +}; diff --git a/lib/gdeflate/libdeflate/lib/crc32_vec_template.h b/lib/gdeflate/libdeflate/lib/crc32_vec_template.h new file mode 100644 index 0000000..9a2ad5b --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/crc32_vec_template.h @@ -0,0 +1,61 @@ +/* + * crc32_vec_template.h - template for vectorized CRC-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#define CRC32_SLICE1 1 +static u32 crc32_slice1(u32, const u8 *, size_t); + +/* + * Template for vectorized CRC-32 implementations. + * + * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead + * of crc32_slice8() because only a few bytes need to be processed, so a smaller + * table is preferable. + */ +static u32 ATTRIBUTES +FUNCNAME(u32 remainder, const u8 *p, size_t size) +{ + if ((uintptr_t)p % IMPL_ALIGNMENT) { + size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT); + + remainder = crc32_slice1(remainder, p, n); + p += n; + size -= n; + } + if (size >= IMPL_SEGMENT_SIZE) { + remainder = FUNCNAME_ALIGNED(remainder, (const void *)p, + size / IMPL_SEGMENT_SIZE); + p += size - (size % IMPL_SEGMENT_SIZE); + size %= IMPL_SEGMENT_SIZE; + } + return crc32_slice1(remainder, p, size); +} + +#undef FUNCNAME +#undef FUNCNAME_ALIGNED +#undef ATTRIBUTES +#undef IMPL_ALIGNMENT +#undef IMPL_SEGMENT_SIZE diff --git a/lib/gdeflate/libdeflate/lib/decompress_template.h b/lib/gdeflate/libdeflate/lib/decompress_template.h new file mode 100644 index 0000000..c6bcf9f --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/decompress_template.h @@ -0,0 +1,421 @@ +/* + * decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This is the actual DEFLATE decompression routine, lifted out of + * deflate_decompress.c so that it can be compiled multiple times with different + * target instruction sets. + */ + +static enum libdeflate_result ATTRIBUTES +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + bitbuf_t bitbuf = 0; + unsigned bitsleft = 0; + size_t overrun_count = 0; + unsigned i; + unsigned is_final_block; + unsigned block_type; + u16 len; + u16 nlen; + unsigned num_litlen_syms; + unsigned num_offset_syms; + u16 tmp16; + u32 tmp32; + +next_block: + /* Starting to read the next block. */ + ; + + STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4)); + ENSURE_BITS(1 + 2 + 5 + 5 + 4); + + /* BFINAL: 1 bit */ + is_final_block = POP_BITS(1); + + /* BTYPE: 2 bits */ + block_type = POP_BITS(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block. */ + + /* The order in which precode lengths are stored. */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); + num_litlen_syms = POP_BITS(5) + 257; + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); + num_offset_syms = POP_BITS(5) + 1; + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); + num_explicit_precode_lens = POP_BITS(4) + 4; + + d->static_codes_loaded = false; + + /* Read the precode codeword lengths. */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + for (i = 0; i < num_explicit_precode_lens; i++) { + ENSURE_BITS(3); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + } + + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Expand the literal/length and offset codeword lengths. */ + for (i = 0; i < num_litlen_syms + num_offset_syms; ) { + u32 entry; + unsigned presym; + u8 rep_val; + unsigned rep_count; + + ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); + + /* (The code below assumes that the precode decode table + * does not have any subtables.) */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Read the next precode symbol. */ + entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + presym = entry >> HUFFDEC_RESULT_SHIFT; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* Note: we don't need verify that the repeat count + * doesn't overflow the number of elements, since we + * have enough extra spaces to allow for the worst-case + * overflow (138 zeroes when only 1 length was + * remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); + rep_count = 3 + POP_BITS(2); + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times */ + STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); + rep_count = 3 + POP_BITS(3); + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times */ + STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); + rep_count = 11 + POP_BITS(7); + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + + /* Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. */ + + ALIGN_INPUT(); + + SAFETY_CHECK(in_end - in_next >= 4); + + len = READ_U16(); + nlen = READ_U16(); + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + + /* The main DEFLATE decode loop */ + for (;;) { + u32 entry; + u32 length; + u32 offset; + const u8 *src; + u8 *dst; + + /* Decode a litlen symbol. */ + ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); + entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Litlen subtable required (uncommon case) */ + REMOVE_BITS(LITLEN_TABLEBITS); + entry = d->u.litlen_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + if (entry & HUFFDEC_LITERAL) { + /* Literal */ + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); + continue; + } + + /* Match or end-of-block */ + + entry >>= HUFFDEC_RESULT_SHIFT; + ENSURE_BITS(MAX_ENSURE); + + /* Pop the extra length bits and add them to the length base to + * produce the full length. */ + length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + + POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); + + /* The match destination must not end after the end of the + * output buffer. For efficiency, combine this check with the + * end-of-block check. We're using 0 for the special + * end-of-block length, so subtract 1 and it turn it into + * SIZE_MAX. */ + STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); + if (unlikely((size_t)length - 1 >= out_end - out_next)) { + if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + goto block_done; + } + + /* Decode the match offset. */ + + entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Offset subtable required (uncommon case) */ + REMOVE_BITS(OFFSET_TABLEBITS); + entry = d->offset_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + entry >>= HUFFDEC_RESULT_SHIFT; + + STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + + DEFLATE_MAX_OFFSET_CODEWORD_LEN) && + CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS)); + if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + + DEFLATE_MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); + + /* Pop the extra offset bits and add them to the offset base to + * produce the full offset. */ + offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); + + /* The match source must not begin before the beginning of the + * output buffer. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + + /* + * Copy the match: 'length' bytes at 'out_next - offset' to + * 'out_next', possibly overlapping. If the match doesn't end + * too close to the end of the buffer and offset >= WORDBYTES || + * offset == 1, take a fast path which copies a word at a time + * -- potentially more than the length of the match, but that's + * fine as long as we check for enough extra space. + * + * The remaining cases are not performance-critical so are + * handled by a simple byte-by-byte copy. + */ + + src = out_next - offset; + dst = out_next; + out_next += length; + + if (UNALIGNED_ACCESS_IS_FAST && + /* max overrun is writing 3 words for a min length match */ + likely(out_end - out_next >= + 3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) { + if (offset >= WORDBYTES) { /* words don't overlap? */ + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + do { + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + } while (dst < out_next); + } else if (offset == 1) { + /* RLE encoding of previous byte, common if the + * data contains many repeated bytes */ + machine_word_t v = repeat_byte(*src); + + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + do { + store_word_unaligned(v, dst); + dst += WORDBYTES; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } else { + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } + +block_done: + /* Finished decoding a block. */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + /* Discard any readahead bits and check for excessive overread */ + ALIGN_INPUT(); + + /* Optionally return the actual number of bytes read */ + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + /* Optionally return the actual number of bytes written */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES diff --git a/lib/gdeflate/libdeflate/lib/deflate_compress.c b/lib/gdeflate/libdeflate/lib/deflate_compress.c new file mode 100644 index 0000000..21d4f35 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/deflate_compress.c @@ -0,0 +1,3360 @@ +/* + * deflate_compress.c - a compressor for DEFLATE + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "deflate_compress.h" +#include "deflate_constants.h" +#include "unaligned.h" + +#include "libdeflate.h" + +/* + * By default, the near-optimal parsing algorithm is enabled at compression + * level 8 and above. The near-optimal parsing algorithm produces a compression + * ratio significantly better than the greedy and lazy algorithms implemented + * here, and also the algorithm used by zlib at level 9. However, it is slow. + */ +#define SUPPORT_NEAR_OPTIMAL_PARSING 1 + +/* + * Define to 1 to maintain the full map from match offsets to offset slots. + * This slightly speeds up translations of match offsets to offset slots, but it + * uses 32769 bytes of memory rather than the 512 bytes used by the condensed + * map. The speedup provided by the larger map is most helpful when the + * near-optimal parsing algorithm is being used. + */ +#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING + +#ifdef DEFLATE64 +/* + * DEFLATE64 uses a 65536 byte sliding window; set the matchfinder parameters + * appropriately. + */ +#define MATCHFINDER_WINDOW_ORDER 16 + +/* TODO: condensed map is not supported in deflate64 mode */ +#undef USE_FULL_OFFSET_SLOT_FAST +#define USE_FULL_OFFSET_SLOT_FAST 1 +#else +/* + * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters + * appropriately. + */ +#define MATCHFINDER_WINDOW_ORDER 15 +#endif + +#include "hc_matchfinder.h" +#if SUPPORT_NEAR_OPTIMAL_PARSING +# include "bt_matchfinder.h" +#endif + +/* + * Use wider type for LZ-item in deflate64 mode. + */ +#ifdef DEFLATE64 +typedef u64 lz_item_t; +#else +typedef u32 lz_item_t; +#endif + +/* + * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, + * except if the last block has to be shorter. + */ +#define MIN_BLOCK_LENGTH 10000 + +/* + * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but + * the final length might be slightly longer due to matches extending beyond + * this limit. + */ +#define SOFT_MAX_BLOCK_LENGTH 300000 + +/* + * The number of observed matches or literals that represents sufficient data to + * decide whether the current block should be terminated or not. + */ +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 + + +#if SUPPORT_NEAR_OPTIMAL_PARSING +/* Constants specific to the near-optimal parsing algorithm */ + +/* + * The maximum number of matches the matchfinder can find at a single position. + * Since the matchfinder never finds more than one match for the same length, + * presuming one of each possible length is sufficient for an upper bound. + * (This says nothing about whether it is worthwhile to consider so many + * matches; this is just defining the worst case.) + */ +# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) + +/* + * The number of lz_match structures in the match cache, excluding the extra + * "overflow" entries. This value should be high enough so that nearly the + * time, all matches found in a given block can fit in the match cache. + * However, fallback behavior (immediately terminating the block) on cache + * overflow is still required. + */ +# define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* + * These are the compressor-side limits on the codeword lengths for each Huffman + * code. To make outputting bits slightly faster, some of these limits are + * lower than the limits defined by the DEFLATE format. This does not + * significantly affect the compression ratio, at least for the block lengths we + * use. + */ +#define MAX_LITLEN_CODEWORD_LEN 14 +#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN +#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN + +/* Table: length slot => length slot base value */ +static const unsigned deflate_length_slot_base[] = { + 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , + 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , + 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , + 131 , 163 , 195 , 227 , +#ifndef DEFLATE64 + 258 , +#else + 3 , +#endif +}; + +/* Table: length slot => number of extra length bits */ +static const u8 deflate_extra_length_bits[] = { + 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , + 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , + 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , + 5 , 5 , 5 , 5 , +#ifndef DEFLATE64 + 0 , +#else + 16 , +#endif +}; + +/* Table: offset slot => offset slot base value */ +static const unsigned deflate_offset_slot_base[] = { + 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , + 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , + 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , + 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , +#ifdef DEFLATE64 + 32769, 49153 +#endif +}; + +/* Table: offset slot => number of extra offset bits */ +static const u8 deflate_extra_offset_bits[] = { + 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , + 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , + 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , + 11 , 11 , 12 , 12 , 13 , 13 , +#ifdef DEFLATE64 + 14 , 14 +#endif +}; + +/* Table: length => length slot */ +static u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 0 }; +static u8 deflate_length_slot_inited = 0; + +/* The order in which precode codeword lengths are stored */ +static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 +}; + +/* Codewords for the DEFLATE Huffman codes. */ +struct deflate_codewords { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codeword lengths (in bits) for the DEFLATE Huffman codes. + * A zero length means the corresponding symbol had zero frequency. */ +struct deflate_lens { + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* Codewords and lengths for the DEFLATE Huffman codes. */ +struct deflate_codes { + struct deflate_codewords codewords; + struct deflate_lens lens; +}; + +/* Symbol frequency counters for the DEFLATE Huffman codes. */ +struct deflate_freqs { + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* Costs for the near-optimal parsing algorithm. */ +struct deflate_costs { + + /* The cost to output each possible literal. */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length. */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot. */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; +}; + +/* + * COST_SHIFT is a scaling factor that makes it possible to consider fractional + * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. In reality, each token in DEFLATE requires a whole number of bits + * to output. + */ +#define COST_SHIFT 3 + +/* + * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to + * be needed to output a symbol that was unused in the previous optimization + * pass. Assigning a default cost allows the symbol to be used in the next + * optimization pass. However, the cost should be relatively high because the + * symbol probably won't be used very many times (if at all). + */ +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* + * Represents a run of literals followed by a match or end-of-block. This + * struct is needed to temporarily store items chosen by the parser, since items + * cannot be written until all items for the block have been chosen and the + * block's Huffman codes have been computed. + */ +struct deflate_sequence { + + /* Bits 0..22: the number of literals in this run. This may be 0 and + * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not + * stored explicitly in this structure; instead, they are read directly + * from the uncompressed data. + * + * Bits 23..31: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. + * In deflate64 mode, bits starting from bit 23 will hold the match length + * value. */ + lz_item_t litrunlen_and_length; + + /* If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. */ + u16 offset; + + /* If 'length' doesn't indicate end-of-block, then this is the offset + * symbol of the match which follows the literals. */ + u8 offset_symbol; + + /* If 'length' doesn't indicate end-of-block, then this is the length + * slot of the match which follows the literals. */ + u8 length_slot; +}; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * This structure represents a byte position in the input data and a node in the + * graph of possible match/literal choices for the current block. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + * + * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we + * associate with each node just two pieces of information: + * + * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. + * + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. + */ +struct deflate_optimum_node { + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ +#ifdef DEFLATE64 +#define OPTIMUM_OFFSET_SHIFT 17 +#else +#define OPTIMUM_OFFSET_SHIFT 9 +#endif +#define OPTIMUM_LEN_MASK (((lz_item_t)1 << OPTIMUM_OFFSET_SHIFT) - 1) + lz_item_t item; + +}; + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Block split statistics. See "Block splitting algorithm" below. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) +struct block_split_stats { + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; +}; + +/* The main DEFLATE compressor structure */ +struct libdeflate_compressor { + + /* Pointer to the compress() implementation chosen at allocation time */ + size_t (*impl)(struct libdeflate_compressor *, + const u8 *, size_t, u8 *, size_t); + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* Static Huffman codes */ + struct deflate_codes static_codes; + + /* Block split statistics for the currently pending block */ + struct block_split_stats split_stats; + + /* A table for fast lookups of offset slot by match offset. + * + * If the full table is being used, it is a direct mapping from offset + * to offset slot. + * + * If the condensed table is being used, the first 256 entries map + * directly to the offset slots of offsets 1 through 256. The next 256 + * entries map to the offset slots for the remaining offsets, stepping + * through the offsets with a stride of 128. This relies on the fact + * that each of the remaining offset slots contains at least 128 offsets + * and has an offset base that is a multiple of 128. */ +#if USE_FULL_OFFSET_SLOT_FAST + u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; +#else + u8 offset_slot_fast[512]; +#endif + + /* The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration. */ + unsigned nice_match_length; + + /* The maximum search depth: consider at most this many potential + * matches at each position. */ + unsigned max_search_depth; + + /* The compression level with which this compressor was created. */ + unsigned compression_level; + + /* Anything smaller than this we won't bother trying to compress. */ + unsigned min_size_to_compress; + + /* Temporary space for Huffman code output */ + u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; + unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_precode_items; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chain matchfinder */ + struct hc_matchfinder hc_mf; + + /* The matches and literals that the parser has chosen + * for the current block. The required length of this + * array is limited by the maximum number of matches + * that can ever be chosen for a single block, plus one + * for the special entry at the end. */ + struct deflate_sequence sequences[ + DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, + DEFLATE_MIN_MATCH_LEN) + 1]; + } g; /* (g)reedy */ + + #if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a list of matches found at that position, if any, + * sorted by strictly increasing length. In addition, + * following the matches for each position, there is a + * special 'struct lz_match' whose 'length' member + * contains the number of matches found at that + * position, and whose 'offset' member contains the + * literal at that position. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at &match_cache[CACHE_LENGTH + * - 1], we write MAX_MATCHES_PER_POS matches and a + * match count header, then skip searching for matches + * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write + * the match count header for each. + */ + struct lz_match match_cache[CACHE_LENGTH + + MAX_MATCHES_PER_POS + + DEFLATE_MAX_MATCH_LEN - 1]; + + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if we find a + * match of length DEFLATE_MAX_MATCH_LEN at position + * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of + * length SOFT_MAX_BLOCK_LENGTH - 1 + + * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block + * node. + */ + struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + + DEFLATE_MAX_MATCH_LEN + 1]; + + /* The current cost model being used. */ + struct deflate_costs costs; + + unsigned num_optim_passes; + } n; /* (n)ear-optimal */ + #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + + } p; /* (p)arser */ +}; + +#ifndef GDEFLATE +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For best + * performance, this should have size equal to a machine word. + */ +typedef machine_word_t bitbuf_t; +#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) + +/* Can the specified number of bits always be added to 'bitbuf' after any + * pending bytes have been flushed? */ +#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + bitbuf_t bitbuf; + + /* Number of bits currently held in @bitbuf. */ + unsigned bitcount; + + /* Pointer to the beginning of the output buffer. */ + u8 *begin; + + /* Pointer to the position in the output buffer at which the next byte + * should be written. */ + u8 *next; + + /* Pointer just past the end of the output buffer. */ + u8 *end; +}; + +/* + * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be + * present following os->end, in order to not overrun the buffer when generating + * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) + * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, + * to make the compression algorithm produce the same result on all CPU + * architectures (which is sometimes desirable), we have to unconditionally use + * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. + */ +#define OUTPUT_END_PADDING 8 + +/* Initialize the output bitstream. 'size' is assumed to be at least + * OUTPUT_END_PADDING. */ +static void +deflate_init_output(struct deflate_output_bitstream *os, + void *buffer, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->begin = buffer; + os->next = os->begin; + os->end = os->begin + size - OUTPUT_END_PADDING; +} + +/* Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. */ +static forceinline void +deflate_add_bits(struct deflate_output_bitstream *os, + const bitbuf_t bits, const unsigned num_bits) +{ + os->bitbuf |= bits << os->bitcount; + os->bitcount += num_bits; +} + +/* Flush bits from the bitbuffer variable to the output buffer. */ +static forceinline void +deflate_flush_bits(struct deflate_output_bitstream *os) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + /* Flush a whole word (branchlessly). */ + put_unaligned_leword(os->bitbuf, os->next); + os->bitbuf >>= os->bitcount & ~7; + os->next += MIN(os->end - os->next, os->bitcount >> 3); + os->bitcount &= 7; + } else { + /* Flush a byte at a time. */ + while (os->bitcount >= 8) { + *os->next = os->bitbuf; + if (os->next != os->end) + os->next++; + os->bitcount -= 8; + os->bitbuf >>= 8; + } + } +} + +/* Align the bitstream on a byte boundary. */ +static forceinline void +deflate_align_bitstream(struct deflate_output_bitstream *os) +{ + os->bitcount += -os->bitcount & 7; + deflate_flush_bits(os); +} + +/* + * Flush any remaining bits to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static size_t +deflate_flush_output(struct deflate_output_bitstream *os) +{ + if (os->next == os->end) /* overflow? */ + return 0; + + while ((int)os->bitcount > 0) { + *os->next++ = os->bitbuf; + os->bitcount -= 8; + os->bitbuf >>= 8; + } + + return os->next - os->begin; +} + +/* Nothing to do. */ +static void +gdeflate_reset(struct deflate_output_bitstream *os) +{} + +/* Nothing to do. */ +static unsigned +gdeflate_advance(struct deflate_output_bitstream *os) +{ + return 0; +} + +#else + +/* + * The type for the bitbuffer variable, which temporarily holds bits that are + * being packed into bytes and written to the output buffer. For GDEFLATE it + * has to be at least 64-bits long. + */ +typedef u64 bitbuf_t; +#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) + +/* Can the specified number of bits always be added to 'bitbuf' after any + * pending bytes have been flushed? */ +#define CAN_BUFFER(n) (true) + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct deflate_output_bitstream { + + /* Bits that haven't yet been written to the output buffer. */ + bitbuf_t bitbuf[NUM_STREAMS]; + + /* Number of bits currently held in @bitbuf. */ + int bitcount[NUM_STREAMS]; + + /* Write pointer for current bit-packet. */ + u8* write_ptr[NUM_STREAMS]; + + /* Write pointer for next bit-packet. */ + u8* next_ptr[NUM_STREAMS]; + + /* The number of bits "read" from stream. + * Used to emulate GDeflate reading pattern. */ + unsigned input_bitcount[NUM_STREAMS]; + + /* Current GDeflate stream index. */ + u8 idx; + + /* Pointer to the beginning of the output buffer. */ + u8 *begin; + + /* Pointer to the position in the output buffer at which the next byte + * should be written. */ + u8 *next; + + /* Pointer just past the end of the output buffer. */ + u8 *end; +}; + +/* + * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be + * present following os->end, in order to not overrun the buffer when generating + * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) + * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, + * to make the compression algorithm produce the same result on all CPU + * architectures (which is sometimes desirable), we have to unconditionally use + * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. + */ +#define OUTPUT_END_PADDING 8 + +/* Initialize the output bitstream. 'size' is assumed to be at least + * OUTPUT_END_PADDING. */ +static void +deflate_init_output(struct deflate_output_bitstream *os, + void *buffer, size_t size) +{ + memset(os->bitbuf, 0, sizeof(os->bitbuf)); + memset(os->bitcount, 0, sizeof(os->bitcount)); + os->idx = 0; + os->begin = buffer; + os->next = os->begin; + os->end = os->begin + size - OUTPUT_END_PADDING; + + for (unsigned n = 0; n < NUM_STREAMS; n++) { + os->write_ptr[n] = os->next; + os->next += BITS_PER_PACKET/8; + + /* Clear the output. */ + put_unaligned_le32(0, os->write_ptr[os->idx]); + + /* The next write position is not known yet. */ + os->next_ptr[n] = NULL; + + /* GDeflate starts with a bit packet in each lane. */ + os->input_bitcount[n] = BITS_PER_PACKET; + } +} + +/* Flush a GDeflate packet from the bitbuffer variable to the output buffer. */ +static forceinline void +gdeflate_flush_bitpacket(struct deflate_output_bitstream *os) +{ + put_unaligned_le32((u32)os->bitbuf[os->idx], os->write_ptr[os->idx]); + + os->bitbuf[os->idx] >>= BITS_PER_PACKET; + os->bitcount[os->idx] -= BITS_PER_PACKET; + + /* Advance the write pointer. */ + os->write_ptr[os->idx] = os->next_ptr[os->idx]; + os->next_ptr[os->idx] = NULL; +} + +/* Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. */ +static forceinline void +deflate_add_bits(struct deflate_output_bitstream *os, + const bitbuf_t bits, const unsigned num_bits) +{ + os->bitbuf[os->idx] |= bits << os->bitcount[os->idx]; + os->bitcount[os->idx] += num_bits; + + /* Emulate the number of input bits. */ + os->input_bitcount[os->idx] -= num_bits; + + /* Accumulated more than a watermark - flush a bit packet. */ + if (os->bitcount[os->idx] >= LOW_WATERMARK_BITS) + gdeflate_flush_bitpacket(os); + + /* If the emulated input has less than a watermark bits + * it is time to reserve the next write pointer. */ + if (os->input_bitcount[os->idx] < LOW_WATERMARK_BITS && + os->next_ptr[os->idx] == NULL) { + os->next_ptr[os->idx] = os->next; + os->next += BITS_PER_PACKET/8; + + /* Clear. */ + put_unaligned_le32(0, os->next_ptr[os->idx]); + + /* Bump the input bitcount. */ + os->input_bitcount[os->idx] += BITS_PER_PACKET; + } +} + +/* + * Flush bits from the bitbuffer variable to the output buffer. + * Nothing to do in GDeflate mode. +*/ +static forceinline void +deflate_flush_bits(struct deflate_output_bitstream *os) +{} + +/* Nothing to do. GDeflate bitstreams are not aligned. */ +static forceinline void +deflate_align_bitstream(struct deflate_output_bitstream *os) +{} + +/* Reset GDeflate's stream index. */ +static void +gdeflate_reset(struct deflate_output_bitstream *os) +{ + os->idx = 0; +} + +/* Advance GDeflate's stream index. Returns the carry over. */ +static forceinline unsigned +gdeflate_advance(struct deflate_output_bitstream *os) +{ + os->idx = (os->idx + 1)%NUM_STREAMS; + return os->idx == 0 ? 1 : 0; +} + +/* + * Flush any remaining bits to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static size_t +deflate_flush_output(struct deflate_output_bitstream *os) +{ + if (os->next == os->end) /* overflow? */ + return 0; + + for (int n = 0; n < NUM_STREAMS; n++) { + gdeflate_flush_bitpacket(os); + gdeflate_advance(os); + } + + return os->next - os->begin; +} + +#endif + +/* Given the binary tree node A[subtree_idx] whose children already + * satisfy the maxheap property, swap the node with its greater child + * until it is greater than both its children, so that the maxheap + * property is satisfied in the subtree rooted at A[subtree_idx]. */ +static void +heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(u32 A[], unsigned length) +{ + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't + * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + */ +static void +heap_sort(u32 A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + u32 tmp = A[length]; + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) + +#define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) +/* + * Sort the symbols primarily by frequency and secondarily by symbol + * value. Discard symbols with zero frequency and fill in an array with + * the remaining symbols, along with their frequencies. The low + * NUM_SYMBOL_BITS bits of each array entry will contain the symbol + * value, and the remaining bits will contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet. + * Can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @freqs[num_syms] + * The frequency of each symbol. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. + * This function only fills in the codeword lengths for symbols that + * have zero frequency, which are not well defined per se but will + * be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is + * the number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const u32 freqs[restrict], + u8 lens[restrict], u32 symout[restrict]) +{ + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; + + /* We rely on heapsort, but with an added optimization. Since + * it's common for most symbol frequencies to be low, we first do + * a count sort using a limited number of counters. High + * frequencies will be counted in the last counter, and only they + * will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per 4 symbols seems fast. + * + * Note: I also tested radix sort, but even for large symbol + * counts (> 255) and frequencies bounded at 16 bits (enabling + * radix sort by just two base-256 digits), it didn't seem any + * faster than the method implemented here. + * + * Note: I tested the optimized quicksort implementation from + * glibc (with indirection overhead removed), but it was only + * marginally faster than the simple heapsort implemented here. + * + * Tests were done with building the codes for LZX. Results may + * vary for different compression algorithms...! */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; + + /* Make the counters cumulative, ignoring the zero-th, which + * counted symbols with zero frequency. As a side effect, this + * calculates the number of symbols with nonzero frequency. */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* Sort nonzero-frequency symbols using the counters. At the + * same time, set the codeword lengths of zero-frequency symbols + * to 0. */ + for (sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + if (freq != 0) { + symout[counters[MIN(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build the Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a + * Huffman tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input + * frequency information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input + * and output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, + * sorted in increasing order. Specifically, each entry must contain a + * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low + * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. + * Although these bits will, in fact, contain the symbols that correspond + * to the frequencies, this function is concerned with frequencies only + * and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the + * Huffman tree. These nodes will be stored in the first (sym_count - 1) + * entries of the array. Entry A[sym_count - 2] will represent the root + * node. Each other node will contain the zero-based index of its parent + * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low + * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, + * note that although these low bits will, in fact, contain a symbol + * value, this symbol will have *no relationship* with the Huffman tree + * node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. + */ +static void +build_tree(u32 A[], unsigned sym_count) +{ + /* Index, in 'A', of next lowest frequency symbol that has not + * yet been processed. */ + unsigned i = 0; + + /* Index, in 'A', of next lowest frequency parentless non-leaf + * node; or, if equal to 'e', then no such node exists yet. */ + unsigned b = 0; + + /* Index, in 'A', of next node to allocate as a non-leaf. */ + unsigned e = 0; + + do { + unsigned m, n; + u32 freq_shifted; + + /* Choose the two next lowest frequency entries. */ + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + m = i++; + else + m = b++; + + if (i != sym_count && + (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) + n = i++; + else + n = b++; + + /* Allocate a non-leaf node and link the entries to it. + * + * If we link an entry that we're visiting for the first + * time (via index 'i'), then we're actually linking a + * leaf node and it will have no effect, since the leaf + * will be overwritten with a non-leaf when index 'e' + * catches up to it. But it's not any slower to + * unconditionally set the parent index. + * + * We also compute the frequency of the non-leaf node as + * the sum of its two children's frequencies. */ + + freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); + + A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); + A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; + e++; + } while (sym_count - e > 1); + /* When just one entry remains, it is a "leaf" that was + * linked to some other node. We ignore it, since the + * rest of the array contains the non-leaves which we + * need. (Note that we're assuming the cases with 0 or 1 + * symbols were handled separately.) */ +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), + * determine the number of codewords that should be assigned each + * possible length, taking into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index + * information for the non-leaf nodes of the Huffman tree. Each + * entry in this array is a node; a node's parent always has a + * greater index than that node itself. This function will + * overwrite the parent index information in this array, so + * essentially it will destroy the tree. However, the data in the + * low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one + * less than the number of tree node entries in 'A'. (Or, really 2 + * less than the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be + * returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(u32 A[restrict], unsigned root_idx, + unsigned len_counts[restrict], unsigned max_codeword_len) +{ + unsigned len; + int node; + + /* The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always + * visiting a parent before its children, by simply iterating + * through the array in reverse order. Consequently, we can + * compute the depth of each node in one pass, overwriting the + * parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, + * both children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) + * node during the traversal, we modify this assumption to + * account for the current node *not* being a leaf, but rather + * its two children being leaves. This causes the loss of one + * codeword for the current depth and the addition of two + * codewords for the current depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily + * by simply using the largest length available when a depth + * exceeds max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* Set the depth of this node so that it is available + * when its children (if any) are processed. */ + + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* If needed, decrease the length to meet the + * length-limited constraint. This is not the optimal + * method for generating length-limited Huffman codes! + * But it should be good enough. */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); + } + + /* Account for the fact that we have a non-leaf node at + * the current depth. */ + len_counts[len]--; + len_counts[len + 1] += 2; + } +} + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(u32 A[restrict], u8 lens[restrict], + const unsigned len_counts[restrict], + unsigned max_codeword_len, unsigned num_syms) +{ + u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* Given the number of codewords that will have each length, + * assign codeword lengths to symbols. We do this by assigning + * the lengths in decreasing order to the symbols sorted + * primarily by increasing frequency and secondarily by + * increasing symbol value. */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol + * order. This produces a canonical code. */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} + +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the + * integers in the range [0, num_syms - 1]. This parameter must be + * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of @num_syms entries, each of which specifies the + * frequency of the corresponding symbol. It is valid for some, + * none, or all of the frequencies to be 0. + * + * @lens + * An array of @num_syms entries in which this function will return + * the length, in bits, of the codeword assigned to each symbol. + * Symbols with 0 frequency will not have codewords per se, but + * their entries in this array will be set to 0. No lengths greater + * than @max_codeword_len will be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return + * the codeword for each symbol, right-justified and padded on the + * left with zeroes. Codewords for symbols with 0 frequency will be + * undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or + * approximately (with the algorithm used here) the minimum weighted path + * length from the root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer + * codeword never lexicographically precedes a shorter codeword, and the + * lexicographic ordering of codewords of the same length is the same as + * the lexicographic ordering of the corresponding symbols. A canonical + * Huffman code, or more generally a canonical prefix code, can be + * reconstructed from only a list containing the codeword length of each + * symbol. + * + * The classic algorithm to generate a Huffman code creates a node for + * each symbol, then inserts these nodes into a min-heap keyed by symbol + * frequency. Then, repeatedly, the two lowest-frequency nodes are + * removed from the min-heap and added as the children of a new node + * having frequency equal to the sum of its two children, which is then + * inserted into the min-heap. When only a single node remains in the + * min-heap, it is the root of the Huffman tree. The codeword for each + * symbol is determined by the path needed to reach the corresponding + * node from the root. Descending to the left child appends a 0 bit, + * whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is + * subject to a number of inefficiencies. In practice, it is fastest to + * first sort the symbols by frequency. (This itself can be subject to + * an optimization based on the fact that most frequencies tend to be + * low.) At the same time, we sort secondarily by symbol value, which + * aids the process of generating a canonical code. Then, during tree + * construction, no heap is necessary because both the leaf nodes and the + * unparented non-leaf nodes can be easily maintained in sorted order. + * Consequently, there can never be more than two possibilities for the + * next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually + * don't need the leaf nodes of the tree at all, only the non-leaf nodes. + * This is because for canonical code generation we don't need to know + * where the symbols are in the tree. Rather, we only need to know how + * many leaf nodes have each depth (codeword length). And this + * information can, in fact, be quickly generated from the tree of + * non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in + * the array in which the codewords are to be generated, provided that + * these array slots are large enough to hold a symbol and frequency + * value. + * + * Still furthermore, we don't even need to maintain explicit child + * pointers. We only need the parent pointers, and even those can be + * overwritten in-place with depth information as part of the process of + * extracting codeword lengths from the tree. So in summary, we do NOT + * need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip + * (source file: C/HuffEnc.c), which has been placed in the public domain + * by Igor Pavlov. + */ +static void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[restrict], + u8 lens[restrict], u32 codewords[restrict]) +{ + u32 *A = codewords; + unsigned num_used_syms; + + STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + + /* We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array + * used for this purpose ('A') shares storage with the space in + * which we will eventually return the codewords. */ + + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* 'num_used_syms' is the number of symbols with nonzero + * frequency. This may be less than @num_syms. 'num_used_syms' + * is also the number of entries in 'A' that are valid. Each + * entry consists of a distinct symbol and a nonzero frequency + * packed into a 32-bit integer. */ + + /* Handle special cases where only 0 or 1 symbols were used (had + * nonzero frequency). */ + + if (unlikely(num_used_syms == 0)) { + /* Code is empty. sort_symbols() already set all lengths + * to 0, so there is nothing more to do. */ + return; + } + + if (unlikely(num_used_syms == 1)) { + /* Only one symbol was used, so we only need one + * codeword. But two codewords are needed to form the + * smallest complete Huffman code, which uses codewords 0 + * and 1. Therefore, we choose another symbol to which + * to assign a codeword. We use 0 (if the used symbol is + * not 0) or 1 (if the used symbol is 0). In either + * case, the lesser-valued symbol must be assigned + * codeword 0 so that the resulting code is canonical. */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* Build a stripped-down version of the Huffman tree, sharing the + * array 'A' with the symbol values. Then extract length counts + * from the tree and use them to generate the final codewords. */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} + +/* + * Clear the Huffman symbol frequency counters. + * This must be called when starting a new DEFLATE block. + */ +static void +deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + +/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ +static u32 +deflate_reverse_codeword(u32 codeword, u8 len) +{ + /* The following branchless algorithm is faster than going bit by bit. + * Note: since no codewords are longer than 16 bits, we only need to + * reverse the low 16 bits of the 'u32'. */ + STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); + + /* Flip adjacent 1-bit fields */ + codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); + + /* Flip adjacent 2-bit fields */ + codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); + + /* Flip adjacent 4-bit fields */ + codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); + + /* Flip adjacent 8-bit fields */ + codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); + + /* Return the high 'len' bits of the bit-reversed 16 bit value. */ + return codeword >> (16 - len); +} + +/* Make a canonical Huffman code with bit-reversed codewords. */ +static void +deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const u32 freqs[], u8 lens[], u32 codewords[]) +{ + unsigned sym; + + make_canonical_huffman_code(num_syms, max_codeword_len, + freqs, lens, codewords); + + for (sym = 0; sym < num_syms; sym++) + codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); +} + +/* + * Build the literal/length and offset Huffman codes for a DEFLATE block. + * + * This takes as input the frequency tables for each code and produces as output + * a set of tables that map symbols to codewords and codeword lengths. + */ +static void +deflate_make_huffman_codes(const struct deflate_freqs *freqs, + struct deflate_codes *codes) +{ + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); +} + +/* Initialize c->static_codes. */ +static void +deflate_init_static_codes(struct libdeflate_compressor *c) +{ + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); +} + +/* Return the offset slot for the specified match offset. */ +static forceinline unsigned +deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) +{ +#if USE_FULL_OFFSET_SLOT_FAST + return c->offset_slot_fast[offset]; +#else + if (offset <= 256) + return c->offset_slot_fast[offset - 1]; + else + return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; +#endif +} + +/* Write the header fields common to all DEFLATE block types. */ +static void +deflate_write_block_header(struct deflate_output_bitstream *os, + bool is_final_block, unsigned block_type) +{ + gdeflate_reset(os); + deflate_add_bits(os, is_final_block, 1); + deflate_add_bits(os, block_type, 2); + deflate_flush_bits(os); +} + +static unsigned +deflate_compute_precode_items(const u8 lens[restrict], + const unsigned num_lens, + u32 precode_freqs[restrict], + unsigned precode_items[restrict]) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = MIN((run_end - run_start) - 11, 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. */ + if ((run_end - run_start) >= 3) { + extra_bits = MIN((run_end - run_start) - 3, 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = MIN((run_end - run_start) - 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. */ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; +} + +/* + * Huffman codeword lengths for dynamic Huffman blocks are compressed using a + * separate Huffman code, the "precode", which contains a symbol for each + * possible codeword length in the larger code as well as several special + * symbols to represent repeated codeword lengths (a form of run-length + * encoding). The precode is itself constructed in canonical form, and its + * codeword lengths are represented literally in 19 3-bit fields that + * immediately precede the compressed codeword lengths of the larger code. + */ + +/* Precompute the information needed to output Huffman codes. */ +static void +deflate_precompute_huffman_header(struct libdeflate_compressor *c) +{ + /* Compute how many litlen and offset symbols are needed. */ + + for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->num_litlen_syms > 257; + c->num_litlen_syms--) + if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) + break; + + for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->num_offset_syms > 1; + c->num_offset_syms--) + if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) + break; + + /* If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. */ + + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + + if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + c->num_litlen_syms, + (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + c->num_offset_syms); + } + + /* Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. */ + c->num_precode_items = + deflate_compute_precode_items((u8 *)&c->codes.lens, + c->num_litlen_syms + + c->num_offset_syms, + c->precode_freqs, + c->precode_items); + + /* Build the precode. */ + STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + MAX_PRE_CODEWORD_LEN, + c->precode_freqs, c->precode_lens, + c->precode_codewords); + + /* Count how many precode lengths we actually need to output. */ + for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->num_explicit_lens > 4; + c->num_explicit_lens--) + if (c->precode_lens[deflate_precode_lens_permutation[ + c->num_explicit_lens - 1]] != 0) + break; + + /* Restore the offset codeword lengths if needed. */ + if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + (u8 *)&c->codes.lens + c->num_litlen_syms, + c->num_offset_syms); + } +} + +/* Output the Huffman codes. */ +static void +deflate_write_huffman_header(struct libdeflate_compressor *c, + struct deflate_output_bitstream *os) +{ + unsigned i; + + gdeflate_reset(os); + + deflate_add_bits(os, c->num_litlen_syms - 257, 5); + deflate_add_bits(os, c->num_offset_syms - 1, 5); + deflate_add_bits(os, c->num_explicit_lens - 4, 4); + deflate_flush_bits(os); + + /* Output the lengths of the codewords in the precode. */ + for (i = 0; i < c->num_explicit_lens; i++) { + deflate_add_bits(os, c->precode_lens[ + deflate_precode_lens_permutation[i]], 3); + gdeflate_advance(os); + deflate_flush_bits(os); + } + + gdeflate_reset(os); + + /* Output the encoded lengths of the codewords in the larger code. */ + for (i = 0; i < c->num_precode_items; i++) { + unsigned precode_item = c->precode_items[i]; + unsigned precode_sym = precode_item & 0x1F; + deflate_add_bits(os, c->precode_codewords[precode_sym], + c->precode_lens[precode_sym]); + if (precode_sym >= 16) { + if (precode_sym == 16) + deflate_add_bits(os, precode_item >> 5, 2); + else if (precode_sym == 17) + deflate_add_bits(os, precode_item >> 5, 3); + else + deflate_add_bits(os, precode_item >> 5, 7); + } + STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7)); + gdeflate_advance(os); + deflate_flush_bits(os); + } +} + +/* Output the end-of-block symbol. */ +static void +deflate_write_end_of_block(struct deflate_output_bitstream *os, + const struct deflate_codes *codes) +{ + deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + deflate_flush_bits(os); +} + +#ifdef GDEFLATE + +/* GDeflate deferred copy. */ +struct gdeflate_deferred_copy { + unsigned round; + unsigned offset_bits; + unsigned offset_bitcount; +}; + +/* Initilizes a deferred copy. */ +static void +gdeflate_init_copy(struct gdeflate_deferred_copy * restrict copy, + const struct deflate_codes * restrict codes, + unsigned offset, unsigned offset_slot, unsigned round) +{ + copy->offset_bits = codes->codewords.offset[offset_slot]; + copy->offset_bits |= (offset - + deflate_offset_slot_base[offset_slot]) << + codes->lens.offset[offset_slot]; + copy->offset_bitcount = codes->lens.offset[offset_slot] + + deflate_extra_offset_bits[offset_slot]; + + copy->round = round; +} + +/* Write copies for the previous round. */ +static unsigned +gdeflate_write_prev_round_copies(struct deflate_output_bitstream * restrict os, + struct gdeflate_deferred_copy * restrict copies, + unsigned round) +{ + while (round > 1 && copies[os->idx].round == round - 1) { + copies[os->idx].round = 0; + + deflate_add_bits(os, copies[os->idx].offset_bits, + copies[os->idx].offset_bitcount); + + round += gdeflate_advance(os); + } + + return round; +} + +/* Flush the tail copies. Will write prev and current round copies. */ +static void +gdeflate_write_tail_copies(struct deflate_output_bitstream * restrict os, + struct gdeflate_deferred_copy * restrict copies, + unsigned round) +{ + const unsigned split_point = os->idx % NUM_STREAMS; + + /* Flush previous round copies. */ + for (unsigned n = split_point; n < NUM_STREAMS; n++) { + if (round > 1 && copies[n].round == round - 1) { + os->idx = n; + deflate_add_bits(os, copies[n].offset_bits, + copies[n].offset_bitcount); + } + } + + /* Flush current round copies. */ + for (unsigned n = 0; n < split_point; n++) { + if (copies[n].round == round) { + os->idx = n; + deflate_add_bits(os, copies[n].offset_bits, + copies[n].offset_bitcount); + } + } +} + +static void +gdeflate_write_sequences(struct deflate_output_bitstream * restrict os, + const struct deflate_codes * restrict codes, + const struct deflate_sequence sequences[restrict], + const u8 * restrict in_next) +{ + const struct deflate_sequence *seq = sequences; + struct gdeflate_deferred_copy copies[NUM_STREAMS] = {0}; + unsigned round = 1; + + gdeflate_reset(os); + + for (;;) { + u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; + unsigned length = (unsigned)(seq->litrunlen_and_length >> 23); + unsigned length_slot; + unsigned litlen_symbol; + + if (litrunlen) { + do { + const unsigned lit = *in_next++; + + round = gdeflate_write_prev_round_copies(os, + copies, + round); + + deflate_add_bits(os, codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + + round += gdeflate_advance(os); + } while (--litrunlen); + } + + round = gdeflate_write_prev_round_copies(os, copies, round); + + if (length == 0) + break; + + in_next += length; + + length_slot = seq->length_slot; + litlen_symbol = 257 + length_slot; + + /* Litlen symbol */ + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + /* Extra length bits */ + deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + /* Store offset bits for use later */ + gdeflate_init_copy(copies + os->idx, codes, seq->offset, + seq->offset_symbol, round); + + round += gdeflate_advance(os); + + seq++; + } + + deflate_write_end_of_block(os, codes); + round += gdeflate_advance(os); + + gdeflate_write_tail_copies(os, copies, round); +} +#endif + +static void +deflate_write_sequences(struct deflate_output_bitstream * restrict os, + const struct deflate_codes * restrict codes, + const struct deflate_sequence sequences[restrict], + const u8 * restrict in_next) +{ +#ifdef GDEFLATE + gdeflate_write_sequences(os, codes, sequences, in_next); +#else + const struct deflate_sequence *seq = sequences; + + for (;;) { + u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; + unsigned length = (unsigned)(seq->litrunlen_and_length >> 23); + unsigned length_slot; + unsigned litlen_symbol; + unsigned offset_symbol; + + if (litrunlen) { + #if 1 + while (litrunlen >= 4) { + unsigned lit0 = in_next[0]; + unsigned lit1 = in_next[1]; + unsigned lit2 = in_next[2]; + unsigned lit3 = in_next[3]; + + deflate_add_bits(os, codes->codewords.litlen[lit0], + codes->lens.litlen[lit0]); + if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + + deflate_add_bits(os, codes->codewords.litlen[lit1], + codes->lens.litlen[lit1]); + if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + + deflate_add_bits(os, codes->codewords.litlen[lit2], + codes->lens.litlen[lit2]); + if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + + deflate_add_bits(os, codes->codewords.litlen[lit3], + codes->lens.litlen[lit3]); + deflate_flush_bits(os); + in_next += 4; + litrunlen -= 4; + } + if (litrunlen-- != 0) { + deflate_add_bits(os, codes->codewords.litlen[*in_next], + codes->lens.litlen[*in_next]); + if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + in_next++; + if (litrunlen-- != 0) { + deflate_add_bits(os, codes->codewords.litlen[*in_next], + codes->lens.litlen[*in_next]); + if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + in_next++; + if (litrunlen-- != 0) { + deflate_add_bits(os, codes->codewords.litlen[*in_next], + codes->lens.litlen[*in_next]); + if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + in_next++; + } + } + if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) + deflate_flush_bits(os); + } + #else + do { + unsigned lit = *in_next++; + deflate_add_bits(os, codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + deflate_flush_bits(os); + } while (--litrunlen); + #endif + } + + if (length == 0) + return; + + in_next += length; + + length_slot = seq->length_slot; + litlen_symbol = 257 + length_slot; + + /* Litlen symbol */ + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + /* Extra length bits */ + STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS)); + deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS + + MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + /* Offset symbol */ + offset_symbol = seq->offset_symbol; + deflate_add_bits(os, codes->codewords.offset[offset_symbol], + codes->lens.offset[offset_symbol]); + + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + /* Extra offset bits */ + deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], + deflate_extra_offset_bits[offset_symbol]); + + deflate_flush_bits(os); + + seq++; + } +#endif /* GDEFLATE */ +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING +#ifdef GDEFLATE +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and write out the matches/literals using the specified + * Huffman codes. + * + * Note: this is slightly duplicated with deflate_write_sequences(), the reason + * being that we don't want to waste time translating between intermediate + * match/literal representations. + */ +static void +gdeflate_write_item_list(struct deflate_output_bitstream *os, + const struct deflate_codes *codes, + struct libdeflate_compressor *c, + u32 block_length) +{ + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; + struct gdeflate_deferred_copy copies[NUM_STREAMS] = {0}; + unsigned round = 1; + + gdeflate_reset(os); + + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = (unsigned)(cur_node->item >> OPTIMUM_OFFSET_SHIFT); + unsigned litlen_symbol; + unsigned length_slot; + unsigned offset_slot; + + round = gdeflate_write_prev_round_copies(os, copies, round); + + if (length == 1) { + /* Literal */ + litlen_symbol = offset; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + } else { + /* Match length */ + length_slot = deflate_length_slot[length]; + litlen_symbol = 257 + length_slot; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + offset_slot = deflate_get_offset_slot(c, offset); + + /* Store offset bits for use later */ + gdeflate_init_copy(copies + os->idx, codes, offset, + offset_slot, round); + } + + round += gdeflate_advance(os); + + cur_node += length; + } while (cur_node != end_node); + + round = gdeflate_write_prev_round_copies(os, copies, round); + + deflate_write_end_of_block(os, codes); + round += gdeflate_advance(os); + + gdeflate_write_tail_copies(os, copies, round); +} +#endif /* GDEFLATE */ + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and write out the matches/literals using the specified + * Huffman codes. + * + * Note: this is slightly duplicated with deflate_write_sequences(), the reason + * being that we don't want to waste time translating between intermediate + * match/literal representations. + */ +static void +deflate_write_item_list(struct deflate_output_bitstream *os, + const struct deflate_codes *codes, + struct libdeflate_compressor *c, + u32 block_length) +{ +#ifdef GDEFLATE + gdeflate_write_item_list(os, codes, c, block_length); +#else + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = (unsigned)(cur_node->item >> OPTIMUM_OFFSET_SHIFT); + unsigned litlen_symbol; + unsigned length_slot; + unsigned offset_slot; + + if (length == 1) { + /* Literal */ + litlen_symbol = offset; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + deflate_flush_bits(os); + } else { + /* Match length */ + length_slot = deflate_length_slot[length]; + litlen_symbol = 257 + length_slot; + deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], + codes->lens.litlen[litlen_symbol]); + + deflate_add_bits(os, length - deflate_length_slot_base[length_slot], + deflate_extra_length_bits[length_slot]); + + if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS + + MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + + /* Match offset */ + offset_slot = deflate_get_offset_slot(c, offset); + deflate_add_bits(os, codes->codewords.offset[offset_slot], + codes->lens.offset[offset_slot]); + + if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_OFFSET_BITS)) + deflate_flush_bits(os); + + deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], + deflate_extra_offset_bits[offset_slot]); + + deflate_flush_bits(os); + } + cur_node += length; + } while (cur_node != end_node); +#endif /* GDEFLATE */ +} +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +static void +deflate_write_uncompressed_block(struct deflate_output_bitstream *os, + const u8 *data, u16 len, + bool is_final_block) +{ + deflate_write_block_header(os, is_final_block, + DEFLATE_BLOCKTYPE_UNCOMPRESSED); + deflate_align_bitstream(os); + + if (4 + (u32)len >= os->end - os->next) { + os->next = os->end; + return; + } +#ifndef GDEFLATE + put_unaligned_le16(len, os->next); + os->next += 2; + put_unaligned_le16(~len, os->next); + os->next += 2; + memcpy(os->next, data, len); + os->next += len; +#else + deflate_add_bits(os, len, 16); + + while (len) { + deflate_add_bits(os, *data, 8); + gdeflate_advance(os); + + data++; + len--; + } +#endif +} + +static void +deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, + const u8 *data, size_t data_length, + bool is_final_block) +{ + do { + u16 len = MIN(data_length, UINT16_MAX); + + deflate_write_uncompressed_block(os, data, len, + is_final_block && len == data_length); + data += len; + data_length -= len; + } while (data_length != 0); +} + +/* + * Choose the best type of block to use (dynamic Huffman, static Huffman, or + * uncompressed), then output it. + */ +static void +deflate_flush_block(struct libdeflate_compressor * restrict c, + struct deflate_output_bitstream * restrict os, + const u8 * restrict block_begin, u32 block_length, + bool is_final_block, bool use_item_list) +{ + static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, + }; + + /* Costs are measured in bits */ + u32 dynamic_cost = 0; + u32 static_cost = 0; + u32 uncompressed_cost = 0; + struct deflate_codes *codes; + int block_type; + unsigned sym; + + /* Tally the end-of-block symbol. */ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + /* Build dynamic Huffman codes. */ + deflate_make_huffman_codes(&c->freqs, &c->codes); + + /* Account for the cost of sending dynamic Huffman codes. */ + deflate_precompute_huffman_header(c); + dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + dynamic_cost += c->precode_freqs[sym] * + (extra + c->precode_lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + } + for (sym = 0; sym < 144; sym++) + static_cost += c->freqs.litlen[sym] * 8; + for (; sym < 256; sym++) + static_cost += c->freqs.litlen[sym] * 9; + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[256]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) { + u32 extra = deflate_extra_length_bits[sym - 257]; + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + +#ifdef GDEFLATE + uncompressed_cost = 0; +#else + uncompressed_cost += (-(os->bitcount + 3) & 7) + 16; +#endif + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += 16 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* Choose the cheapest block type. */ + if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { + block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; + codes = &c->codes; + } else if (static_cost < uncompressed_cost) { + block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; + codes = &c->static_codes; + } else { + block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; + } + + /* Now actually output the block. */ + + if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + /* Note: the length being flushed may exceed the maximum length + * of an uncompressed block (65535 bytes). Therefore, more than + * one uncompressed block might be needed. */ + deflate_write_uncompressed_blocks(os, block_begin, block_length, + is_final_block); + } else { + /* Output the block header. */ + deflate_write_block_header(os, is_final_block, block_type); + + /* Output the Huffman codes (dynamic Huffman blocks only). */ + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) + deflate_write_huffman_header(c, os); + + /* Output the literals, matches, and end-of-block symbol. */ + #if SUPPORT_NEAR_OPTIMAL_PARSING + if (use_item_list) + deflate_write_item_list(os, codes, c, block_length); + else + #endif + deflate_write_sequences(os, codes, c->p.g.sequences, + block_begin); + #ifndef GDEFLATE + deflate_write_end_of_block(os, codes); + #endif + } +} + +static forceinline void +deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, + u32 *litrunlen_p) +{ + c->freqs.litlen[literal]++; + ++*litrunlen_p; +} + +static forceinline void +deflate_choose_match(struct libdeflate_compressor *c, + unsigned length, unsigned offset, + u32 *litrunlen_p, struct deflate_sequence **next_seq_p) +{ + struct deflate_sequence *seq = *next_seq_p; + unsigned length_slot = deflate_length_slot[length]; + unsigned offset_slot = deflate_get_offset_slot(c, offset); + + c->freqs.litlen[257 + length_slot]++; + c->freqs.offset[offset_slot]++; + + seq->litrunlen_and_length = ((lz_item_t)length << 23) | *litrunlen_p; + seq->offset = offset; + seq->length_slot = length_slot; + seq->offset_symbol = offset_slot; + + *litrunlen_p = 0; + *next_seq_p = seq + 1; +} + +static forceinline void +deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen) +{ + seq->litrunlen_and_length = litrunlen; /* length = 0 */ +} + +/******************************************************************************/ + +/* + * Block splitting algorithm. The problem is to decide when it is worthwhile to + * start a new block with new Huffman codes. There is a theoretically optimal + * solution: recursively consider every possible block split, considering the + * exact cost of each block, and choose the minimum cost approach. But this is + * far too slow. Instead, as an approximation, we can count symbols and after + * every N symbols, compare the expected distribution of symbols based on the + * previous data with the actual distribution. If they differ "by enough", then + * start a new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". For + * literals we only look at the high bits and low bits, and for matches we only + * look at whether the match is long or not. The assumption is that for typical + * "real" data, places that are good block boundaries will tend to be noticeable + * based only on changes in these aggregate frequencies, without looking for + * subtle differences in individual symbols. For example, a change from ASCII + * bytes to non-ASCII bytes, or from few matches (generally less compressible) + * to many matches (generally more compressible), would be easily noticed based + * on the aggregates. + * + * For determining whether the frequency distributions are "different enough" to + * start a new block, the simply heuristic of splitting when the sum of absolute + * differences exceeds a constant seems to be good enough. We also add a number + * proportional to the block length so that the algorithm is more likely to end + * long blocks than short blocks. This reflects the general expectation that it + * will become increasingly beneficial to start a new block as the current + * block grows longer. + * + * Finally, for an approximation, it is not strictly necessary that the exact + * symbols being used are considered. With "near-optimal parsing", for example, + * the actual symbols that will be used are unknown until after the block + * boundary is chosen and the block has been optimized. Since the final choices + * cannot be used, we can use preliminary "greedy" choices instead. + */ + +/* Initialize the block split statistics when starting a new block. */ +static void +init_block_split_stats(struct block_split_stats *stats) +{ + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; +} + +/* Literal observation. Heuristic: use the top 2 bits and low 1 bits of the + * literal, for 8 possible literal observation types. */ +static forceinline void +observe_literal(struct block_split_stats *stats, u8 lit) +{ + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; +} + +/* Match observation. Heuristic: use one observation type for "short match" and + * one observation type for "long match". */ +static forceinline void +observe_match(struct block_split_stats *stats, unsigned length) +{ + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++; + stats->num_new_observations++; +} + +static bool +do_end_block_check(struct block_split_stats *stats, u32 block_length) +{ + int i; + + if (stats->num_observations > 0) { + + /* Note: to avoid slow divisions, we do not divide by + * 'num_observations', but rather do all math with the numbers + * multiplied by 'num_observations'. */ + u32 total_delta = 0; + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * stats->num_new_observations; + u32 actual = stats->new_observations[i] * stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + total_delta += delta; + } + + /* Ready to end the block? */ + if (total_delta + (block_length / 4096) * stats->num_observations >= + NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations) + return true; + } + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->num_observations += stats->new_observations[i]; + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_new_observations = 0; + return false; +} + +static forceinline bool +should_end_block(struct block_split_stats *stats, + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) +{ + /* Ready to check block split statistics? */ + if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || + in_next - in_block_begin < MIN_BLOCK_LENGTH || + in_end - in_next < MIN_BLOCK_LENGTH) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); +} + +/******************************************************************************/ + +/* + * This is the level 0 "compressor". It always outputs uncompressed blocks. + */ +static size_t +deflate_compress_none(struct libdeflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + struct deflate_output_bitstream os; + + deflate_init_output(&os, out, out_nbytes_avail); + + deflate_write_uncompressed_blocks(&os, in, in_nbytes, true); + + return deflate_flush_output(&os); +} + +/* + * This is the "greedy" DEFLATE compressor. It always chooses the longest match. + */ +static size_t +deflate_compress_greedy(struct libdeflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + deflate_init_output(&os, out, out_nbytes_avail); + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block. */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = + in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); + u32 litrunlen = 0; + struct deflate_sequence *next_seq = c->p.g.sequences; + + init_block_split_stats(&c->split_stats); + deflate_reset_symbol_frequencies(c); + + do { + u32 length; + u32 offset; + + /* Decrease the maximum and nice match lengths if we're + * approaching the end of the input buffer. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } + + length = hc_matchfinder_longest_match(&c->p.g.hc_mf, + &in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &offset); + + if (length >= DEFLATE_MIN_MATCH_LEN) { + /* Match found. */ + deflate_choose_match(c, length, offset, + &litrunlen, &next_seq); + observe_match(&c->split_stats, length); + in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + next_hashes); + } else { + /* No match found. */ + deflate_choose_literal(c, *in_next, &litrunlen); + observe_literal(&c->split_stats, *in_next); + in_next++; + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); + + deflate_finish_sequence(next_seq, litrunlen); + deflate_flush_block(c, &os, in_block_begin, + in_next - in_block_begin, + in_next == in_end, false); + } while (in_next != in_end); + + return deflate_flush_output(&os); +} + +/* + * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to + * see if there's a longer match at the next position. If yes, it outputs a + * literal and continues to the next position. If no, it outputs the match. + */ +static size_t +deflate_compress_lazy(struct libdeflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + deflate_init_output(&os, out, out_nbytes_avail); + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block. */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = + in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); + u32 litrunlen = 0; + struct deflate_sequence *next_seq = c->p.g.sequences; + + init_block_split_stats(&c->split_stats); + deflate_reset_symbol_frequencies(c); + + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } + + /* Find the longest match at the current position. */ + cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, + &in_cur_base, + in_next, + DEFLATE_MIN_MATCH_LEN - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + in_next += 1; + + if (cur_len < DEFLATE_MIN_MATCH_LEN) { + /* No match found. Choose a literal. */ + deflate_choose_literal(c, *(in_next - 1), &litrunlen); + observe_literal(&c->split_stats, *(in_next - 1)); + continue; + } + + have_cur_match: + observe_match(&c->split_stats, cur_len); + + /* We have a match at the current position. */ + + /* If the current match is very long, choose it + * immediately. */ + if (cur_len >= nice_len) { + deflate_choose_match(c, cur_len, cur_offset, + &litrunlen, &next_seq); + in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 1, + next_hashes); + continue; + } + + /* + * Try to find a match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a longer match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. + */ + if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } + next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, + &in_cur_base, + in_next, + cur_len, + max_len, + nice_len, + c->max_search_depth / 2, + next_hashes, + &next_offset); + in_next += 1; + + if (next_len > cur_len) { + /* Found a longer match at the next position. + * Output a literal. Then the next match + * becomes the current match. */ + deflate_choose_literal(c, *(in_next - 2), &litrunlen); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + + /* No longer match at the next position. + * Output the current match. */ + deflate_choose_match(c, cur_len, cur_offset, + &litrunlen, &next_seq); + in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); + + deflate_finish_sequence(next_seq, litrunlen); + deflate_flush_block(c, &os, in_block_begin, + in_next - in_block_begin, + in_next == in_end, false); + } while (in_next != in_end); + + return deflate_flush_output(&os); +} + +#if SUPPORT_NEAR_OPTIMAL_PARSING + +/* + * Follow the minimum-cost path in the graph of possible match/literal choices + * for the current block and compute the frequencies of the Huffman symbols that + * would be needed to output those matches and literals. + */ +static void +deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) +{ + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = (unsigned)(cur_node->item >> OPTIMUM_OFFSET_SHIFT); + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[257 + deflate_length_slot[length]]++; + c->freqs.offset[deflate_get_offset_slot(c, offset)]++; + } + cur_node += length; + } while (cur_node != end_node); +} + +/* Set the current cost model from the codeword lengths specified in @lens. */ +static void +deflate_set_costs_from_codes(struct libdeflate_compressor *c, + const struct deflate_lens *lens) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); + c->p.n.costs.literal[i] = bits << COST_SHIFT; + } + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_length_slot[i]; + unsigned litlen_sym = 257 + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + bits += deflate_extra_length_bits[length_slot]; + c->p.n.costs.length[i] = bits << COST_SHIFT; + } + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); + bits += deflate_extra_offset_bits[i]; + c->p.n.costs.offset_slot[i] = bits << COST_SHIFT; + } +} + +static forceinline u32 +deflate_default_literal_cost(unsigned literal) +{ + STATIC_ASSERT(COST_SHIFT == 3); + /* 66 is 8.25 bits/symbol */ + return 66; +} + +static forceinline u32 +deflate_default_length_slot_cost(unsigned length_slot) +{ + STATIC_ASSERT(COST_SHIFT == 3); + /* 60 is 7.5 bits/symbol */ + return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); +} + +static forceinline u32 +deflate_default_offset_slot_cost(unsigned offset_slot) +{ + STATIC_ASSERT(COST_SHIFT == 3); + /* 39 is 4.875 bits/symbol */ + return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); +} + +/* + * Set default symbol costs for the first block's first optimization pass. + * + * It works well to assume that each symbol is equally probable. This results + * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << + * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding + * alphabet. However, we intentionally bias the parse towards matches rather + * than literals by using a slightly lower default cost for length symbols than + * for literals. This often improves the compression ratio slightly. + */ +static void +deflate_set_default_costs(struct libdeflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->p.n.costs.literal[i] = deflate_default_literal_cost(i); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->p.n.costs.length[i] = deflate_default_length_slot_cost( + deflate_length_slot[i]); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); +} + +static forceinline void +deflate_adjust_cost(u32 *cost_p, u32 default_cost) +{ + *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; +} + +/* + * Adjust the costs when beginning a new block. + * + * Since the current costs have been optimized for the data, it's undesirable to + * throw them away and start over with the default costs. At the same time, we + * don't want to bias the parse by assuming that the next block will be similar + * to the current block. As a compromise, make the costs closer to the + * defaults, but don't simply set them to the defaults. + */ +static void +deflate_adjust_costs(struct libdeflate_compressor *c) +{ + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->p.n.costs.literal[i], + deflate_default_literal_cost(i)); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->p.n.costs.length[i], + deflate_default_length_slot_cost( + deflate_length_slot[i])); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->p.n.costs.offset_slot[i], + deflate_default_offset_slot_cost(i)); +} + +/* + * Find the minimum-cost path through the graph of possible match/literal + * choices for this block. + * + * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which + * represents the node at the beginning of the block, to + * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of + * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. + * + * The algorithm works backwards, starting at the end node and proceeding + * backwards one node at a time. At each node, the minimum cost to reach the + * end node is computed and the match/literal choice that begins that path is + * saved. + */ +static void +deflate_find_min_cost_path(struct libdeflate_compressor *c, + const u32 block_length, + const struct lz_match *cache_ptr) +{ + struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node *cur_node = end_node; + + cur_node->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + + cur_node--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. */ + best_cost_to_end = c->p.n.costs.literal[literal] + + (cur_node + 1)->cost_to_end; + cur_node->item = ((lz_item_t)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. */ + if (num_matches) { + const struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the longest + * match found at this position. For each length, we + * consider only the smallest offset for which that + * length is available. Although this is not guaranteed + * to be optimal due to the possibility of a larger + * offset costing less than a smaller offset to code, + * this is a very useful heuristic. + */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = deflate_get_offset_slot(c, offset); + offset_cost = c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + cur_node->item = ((lz_item_t)offset << OPTIMUM_OFFSET_SHIFT) | len; + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_node->cost_to_end = best_cost_to_end; + } while (cur_node != &c->p.n.optimum_nodes[0]); +} + +/* + * Choose the literal/match sequence to use for the current block. The basic + * algorithm finds a minimum-cost path through the block's graph of + * literal/match choices, given a cost model. However, the cost of each symbol + * is unknown until the Huffman codes have been built, but at the same time the + * Huffman codes depend on the frequencies of chosen symbols. Consequently, + * multiple passes must be used to try to approximate an optimal solution. The + * first pass uses default costs, mixed with the costs from the previous block + * if any. Later passes use the Huffman codeword lengths from the previous pass + * as the costs. + */ +static void +deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length, + const struct lz_match *cache_ptr, bool is_first_block) +{ + unsigned num_passes_remaining = c->p.n.num_optim_passes; + u32 i; + + /* Force the block to really end at the desired length, even if some + * matches extend beyond it. */ + for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + /* Set the initial costs. */ + if (is_first_block) + deflate_set_default_costs(c); + else + deflate_adjust_costs(c); + + for (;;) { + /* Find the minimum cost path for this pass. */ + deflate_find_min_cost_path(c, block_length, cache_ptr); + + /* Compute frequencies of the chosen symbols. */ + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + + if (--num_passes_remaining == 0) + break; + + /* At least one optimization pass remains; update the costs. */ + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_set_costs_from_codes(c, &c->codes.lens); + } +} + +/* + * This is the "near-optimal" DEFLATE compressor. It computes the optimal + * representation of each DEFLATE block using a minimum-cost path search over + * the graph of possible match/literal choices for that block, assuming a + * certain cost for each Huffman symbol. + * + * For several reasons, the end result is not guaranteed to be optimal: + * + * - Nonoptimal choice of blocks + * - Heuristic limitations on which matches are actually considered + * - Symbol costs are unknown until the symbols have already been chosen + * (so iterative optimization must be used) + */ +static size_t +deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, + const u8 * restrict in, size_t in_nbytes, + u8 * restrict out, size_t out_nbytes_avail) +{ + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + struct deflate_output_bitstream os; + const u8 *in_cur_base = in_next; + const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + deflate_init_output(&os, out, out_nbytes_avail); + bt_matchfinder_init(&c->p.n.bt_mf); + + do { + /* Starting a new DEFLATE block. */ + + struct lz_match *cache_ptr = c->p.n.match_cache; + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = + in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); + const u8 *next_observation = in_next; + + init_block_split_stats(&c->split_stats); + + /* + * Find matches until we decide to end the block. We end the + * block if any of the following is true: + * + * (1) Maximum block length has been reached + * (2) Match catch may overflow. + * (3) Block split heuristic says to split now. + */ + do { + struct lz_match *matches; + unsigned best_len; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + MIN(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + + /* Decrease the maximum and nice match lengths if we're + * approaching the end of the input buffer. */ + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in + * 'match_cache'. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + matches = cache_ptr; + best_len = 0; + if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { + cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &best_len, + matches); + } + + if (in_next >= next_observation) { + if (best_len >= 4) { + observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; + } + } + + cache_ptr->length = cache_ptr - matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. + */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) { + --best_len; + do { + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + MIN(in_end - in_next, + MATCHFINDER_WINDOW_SIZE); + } + if (unlikely(max_len > in_end - in_next)) { + max_len = in_end - in_next; + nice_len = MIN(nice_len, max_len); + } + if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { + bt_matchfinder_skip_position(&c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + } while (in_next < in_max_block_end && + cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] && + !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); + + /* All the matches for this block have been cached. Now choose + * the sequence of items to output and flush the block. */ + deflate_optimize_block(c, in_next - in_block_begin, cache_ptr, + in_block_begin == in); + deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin, + in_next == in_end, true); + } while (in_next != in_end); + + return deflate_flush_output(&os); +} + +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + +/* Initialize c->offset_slot_fast. */ +static void +deflate_init_offset_slot_fast(struct libdeflate_compressor *c) +{ + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; + offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) + { + offset = deflate_offset_slot_base[offset_slot]; + #if USE_FULL_OFFSET_SLOT_FAST + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset] = offset_slot; + } while (++offset != offset_end); + #else + if (offset <= 256) { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[offset - 1] = offset_slot; + } while (++offset != offset_end); + } else { + offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; + } while ((offset += (1 << 7)) != offset_end); + } + #endif + } +} + +/* Initialize deflate_length_slot. */ +static void +deflate_init_length_slot(void) +{ + unsigned length_slot; + unsigned length; + unsigned length_end; + + if (deflate_length_slot_inited > 0) + return; + + for (length_slot = 1; + length_slot < ARRAY_LEN(deflate_length_slot_base); + length_slot++) + { + length = deflate_length_slot_base[length_slot]; + length_end = length + (1 << deflate_extra_length_bits[length_slot]); +#ifdef DEFLATE64 + /* the last slot is a special case in deflate64 */ + if (length_slot == ARRAY_LEN(deflate_length_slot_base) - 1) { + length = 258; + length_end = DEFLATE_MAX_MATCH_LEN + 1; + } +#endif + do { + deflate_length_slot[length] = length_slot; + } while (++length != length_end); + } + + deflate_length_slot_inited = 1; +} + +#ifndef HIDE_INTERFACE + +LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI +libdeflate_alloc_compressor(int compression_level) +{ + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + if (compression_level < 0 || compression_level > 12) + return NULL; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 8) + size += sizeof(c->p.n); + else if (compression_level >= 1) + size += sizeof(c->p.g); +#else + if (compression_level >= 1) + size += sizeof(c->p.g); +#endif + + c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->min_size_to_compress = 56 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->impl = deflate_compress_none; + break; + case 1: + c->impl = deflate_compress_greedy; + c->max_search_depth = 2; + c->nice_match_length = 8; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 24; + c->nice_match_length = 24; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 20; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 40; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 8: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 12; + c->nice_match_length = 20; + c->p.n.num_optim_passes = 1; + break; + case 9: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 16; + c->nice_match_length = 26; + c->p.n.num_optim_passes = 2; + break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 30; + c->nice_match_length = 50; + c->p.n.num_optim_passes = 2; + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 60; + c->nice_match_length = 80; + c->p.n.num_optim_passes = 3; + break; + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 133; + c->p.n.num_optim_passes = 4; + break; +#else + case 8: + c->impl = deflate_compress_lazy; + c->max_search_depth = 150; + c->nice_match_length = 200; + break; + default: + c->impl = deflate_compress_lazy; + c->max_search_depth = 200; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#endif + } + + deflate_init_offset_slot_fast(c); + deflate_init_static_codes(c); + deflate_init_length_slot(); + + return c; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_deflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) + return 0; + + /* For extremely small inputs just use a single uncompressed block. */ + if (unlikely(in_nbytes < c->min_size_to_compress)) { + struct deflate_output_bitstream os; + deflate_init_output(&os, out, out_nbytes_avail); + if (in_nbytes == 0) + in = &os; /* Avoid passing NULL to memcpy() */ + deflate_write_uncompressed_block(&os, in, in_nbytes, true); + return deflate_flush_output(&os); + } + + return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); +} + +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_compressor(struct libdeflate_compressor *c) +{ + libdeflate_aligned_free(c); +} + +unsigned int +deflate_get_compression_level(struct libdeflate_compressor *c) +{ + return c->compression_level; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + /* + * The worst case is all uncompressed blocks where one block has length + * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. + * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, + * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. + */ + size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING +#ifdef GDEFLATE + + 2*(NUM_STREAMS*BITS_PER_PACKET)/8 +#endif + ; +} + +#endif diff --git a/lib/gdeflate/libdeflate/lib/deflate_compress.h b/lib/gdeflate/libdeflate/lib/deflate_compress.h new file mode 100644 index 0000000..d97d019 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/deflate_compress.h @@ -0,0 +1,13 @@ +#ifndef LIB_DEFLATE_COMPRESS_H +#define LIB_DEFLATE_COMPRESS_H + +#include "lib_common.h" + +/* DEFLATE compression is private to deflate_compress.c, but we do need to be + * able to query the compression level for zlib and gzip header generation. */ + +struct libdeflate_compressor; + +unsigned int deflate_get_compression_level(struct libdeflate_compressor *c); + +#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/lib/gdeflate/libdeflate/lib/deflate_constants.h b/lib/gdeflate/libdeflate/lib/deflate_constants.h new file mode 100644 index 0000000..a00569f --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/deflate_constants.h @@ -0,0 +1,102 @@ +/* + * deflate_constants.h - constants for the DEFLATE compression format + */ + +#ifndef LIB_DEFLATE_CONSTANTS_H +#define LIB_DEFLATE_CONSTANTS_H + +#ifdef WITH_GDEFLATE + +/* Enable GDeflate mode. */ +#define GDEFLATE + +/* GDeflate is deflate64-based. */ +#define DEFLATE64 + +/* Number of GDeflate streams. */ +#define NUM_STREAMS 32 + +/* The number of bits to keep in input buffer. */ +#define LOW_WATERMARK_BITS 32 + +/* Number of bits per GDeflate bit-packet. */ +#define BITS_PER_PACKET 32 + +/* GDeflate page size. */ +#define GDEFLATE_PAGE_SIZE 65536 + +#endif + +/* Valid block types */ +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 + +/* Minimum and maximum supported match lengths (in bytes) */ +#define DEFLATE_MIN_MATCH_LEN 3 +#ifndef DEFLATE64 +#define DEFLATE_MAX_MATCH_LEN 258 +#else +/* The maximum length of a deflate64 match is 65538, however the bt_matcher + * uses 16-bits to store match lengths so to have the bt_matcher to + * work without overflows the maximum length either needs to be reduced + * to fit into 16-bits, or length storage type changed to a wider type. */ +#define DEFLATE_MAX_MATCH_LEN 65535 +#endif + +/* Minimum and maximum supported match offsets (in bytes) */ +#define DEFLATE_MIN_MATCH_OFFSET 1 +#ifdef DEFLATE64 +#define DEFLATE_MAX_MATCH_OFFSET (32768*2) +#define DEFLATE_MAX_WINDOW_SIZE (32768*2) +#else +#define DEFLATE_MAX_MATCH_OFFSET 32768 +#define DEFLATE_MAX_WINDOW_SIZE 32768 +#endif + +/* Number of symbols in each Huffman code. Note: for the literal/length + * and offset codes, these are actually the maximum values; a given block + * might use fewer symbols. */ +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 + +/* The maximum number of symbols across all codes */ +#define DEFLATE_MAX_NUM_SYMS 288 + +/* Division of symbols in the literal/length code */ +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_NUM_LEN_SYMS 31 + +/* Maximum codeword length, in bits, within each Huffman code */ +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 + +/* The maximum codeword length across all codes */ +#define DEFLATE_MAX_CODEWORD_LEN 15 + +/* Maximum possible overrun when decoding codeword lengths */ +#define DEFLATE_MAX_LENS_OVERRUN 137 + +/* + * Maximum number of extra bits that may be required to represent a match + * length or offset. + */ +#ifdef DEFLATE64 +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 16 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14 +#else +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14 +#endif + +/* The maximum number of bits in which a match can be represented. This + * is the absolute worst case, which assumes the longest possible Huffman + * codewords and the maximum numbers of extra bits. */ +#define DEFLATE_MAX_MATCH_BITS \ + (DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ + DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS) + +#endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/lib/gdeflate/libdeflate/lib/deflate_decompress.c b/lib/gdeflate/libdeflate/lib/deflate_decompress.c new file mode 100644 index 0000000..a9d3b64 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/deflate_decompress.c @@ -0,0 +1,1021 @@ +/* + * deflate_decompress.c - a decompressor for DEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * --------------------------------------------------------------------------- + * + * This is a highly optimized DEFLATE decompressor. When compiled with gcc on + * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2 + * instructions are available). On other architectures it should still be + * significantly faster than zlib, but the difference may be smaller. + * + * Why this is faster than zlib's implementation: + * + * - Word accesses rather than byte accesses when reading input + * - Word accesses rather than byte accesses when copying matches + * - Faster Huffman decoding combined with various DEFLATE-specific tricks + * - Larger bitbuffer variable that doesn't need to be filled as often + * - Other optimizations to remove unnecessary branches + * - Only full-buffer decompression is supported, so the code doesn't need to + * support stopping and resuming decompression. + * - On x86_64, compile a version of the decompression routine using BMI2 + * instructions and use it automatically at runtime when supported. + */ + +#include + +#include "deflate_constants.h" +#include "unaligned.h" + +#include "libdeflate.h" + +/* + * If the expression passed to SAFETY_CHECK() evaluates to false, then the + * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the + * compressed data is invalid. + * + * Theoretically, these checks could be disabled for specialized applications + * where all input to the decompressor will be trusted. + */ +#if 0 +# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") +# define SAFETY_CHECK(expr) (void)(expr) +#else +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +#endif + +/* + * Each TABLEBITS number is the base-2 logarithm of the number of entries in the + * main portion of the corresponding decode table. Each number should be large + * enough to ensure that for typical data, the vast majority of symbols can be + * decoded by a direct lookup of the next TABLEBITS bits of compressed data. + * However, this must be balanced against the fact that a larger table requires + * more memory and requires more time to fill. + * + * Note: you cannot change a TABLEBITS number without also changing the + * corresponding ENOUGH number! + */ +#define PRECODE_TABLEBITS 7 +#define LITLEN_TABLEBITS 10 +#define OFFSET_TABLEBITS 8 + +/* + * Each ENOUGH number is the maximum number of decode table entries that may be + * required for the corresponding Huffman code, including the main table and all + * subtables. Each number depends on three parameters: + * + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the number of main table bits (the TABLEBITS numbers defined above) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * + * The ENOUGH numbers were computed using the utility program 'enough' from + * zlib. This program enumerates all possible relevant Huffman codes to find + * the worst-case usage of decode table entries. + */ +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_ENOUGH 1334 /* enough 288 10 15 */ +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ + +/* + * Type for codeword lengths. + */ +typedef u8 len_t; + +/* + * The main DEFLATE decompressor structure. Since this implementation only + * supports full buffer decompression, this structure does not store the entire + * decompression state, but rather only some arrays that are too large to + * comfortably allocate on the stack. + */ +struct libdeflate_decompressor { + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + len_t lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; +}; + +/***************************************************************************** + * Input bitstream * + *****************************************************************************/ + +/* + * The state of the "input bitstream" consists of the following variables: + * + * - in_next: pointer to the next unread byte in the input buffer + * + * - in_end: pointer just past the end of the input buffer + * + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer. The buffered bits are right-aligned + * (they're the low-order bits). + * + * - bitsleft: number of bits in 'bitbuf' that are valid. + * + * To make it easier for the compiler to optimize the code by keeping variables + * in registers, these are declared as normal variables and manipulated using + * macros. + */ + +/* + * The type for the bitbuffer variable ('bitbuf' described above). For best + * performance, this should have size equal to a machine word. + * + * 64-bit platforms have a significant advantage: they get a bigger bitbuffer + * which they have to fill less often. + * + * For GDEFLATE the bitbuffer has to be at least 64-bits long. + */ +#ifdef GDEFLATE +typedef u64 bitbuf_t; +#else +typedef machine_word_t bitbuf_t; +#endif + +/* + * Number of bits the bitbuffer variable can hold. + * + * This is one less than the obvious value because of the optimized arithmetic + * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range + * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS]. + */ +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) + +/* + * The maximum number of bits that can be ensured in the bitbuffer variable, + * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n). The decoder + * only reads whole bytes from memory, so this is the lowest value of 'bitsleft' + * at which another byte cannot be read without first consuming some bits. + */ +#define MAX_ENSURE (BITBUF_NBITS - 7) + +/* + * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if + * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile + * time constant, then this expression will be a compile-type constant. + * Therefore, CAN_ENSURE() can be used choose between alternative + * implementations at compile time. + */ +#define CAN_ENSURE(n) ((n) <= MAX_ENSURE) + +/* + * Fill the bitbuffer variable, reading one byte at a time. + * + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. This simplifies the decompressor + * because it removes the need to distinguish between real overreads and + * overreads that occur only because of the decompressor's own lookahead. + * + * The disadvantage is that real overreads are not detected immediately. + * However, this is safe because the decompressor is still guaranteed to make + * forward progress when presented never-ending 0 bits. In an existing block + * output will be getting generated, whereas new blocks can only be uncompressed + * (since the type code for uncompressed blocks is 0), for which we check for + * previous overread. But even if we didn't check, uncompressed blocks would + * fail to validate because LEN would not equal ~NLEN. So the decompressor will + * eventually either detect that the output buffer is full, or detect invalid + * input, or finish the final block. + */ +#define FILL_BITS_BYTEWISE() \ +do { \ + if (likely(in_next != in_end)) \ + bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \ + else \ + overrun_count++; \ + bitsleft += 8; \ +} while (bitsleft <= BITBUF_NBITS - 8) + +/* + * Fill the bitbuffer variable by reading the next word from the input buffer + * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits + * were filled. This can be significantly faster than FILL_BITS_BYTEWISE(). + * However, for this to work correctly, the word must be interpreted in + * little-endian format. In addition, the memory access may be unaligned. + * Therefore, this method is most efficient on little-endian architectures that + * support fast unaligned access, such as x86 and x86_64. + * + * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to + * be 1 less than the word size and therefore be all 1 bits. Then the number of + * bits filled is the value of the 0 bits in position >= 3 when changed to 1. + * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000 + * = 40 bits = 5 bytes. This uses only 4 operations to update 'in_next' and + * 'bitsleft': one each of +, ^, >>, and |. (Not counting operations the + * compiler optimizes out.) In contrast, the alternative of: + * + * in_next += (BITBUF_NBITS - bitsleft) >> 3; + * bitsleft += (BITBUF_NBITS - bitsleft) & ~7; + * + * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on + * average refill an extra bit, but uses 5 operations: two +, and one each of + * -, >>, and &. Also the - and & must be completed before 'bitsleft' can be + * updated, while the current solution updates 'bitsleft' with no dependencies. + */ +#define FILL_BITS_WORDWISE() \ +do { \ + /* BITBUF_NBITS must be all 1's in binary, see above */ \ + STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\ + \ + bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ + in_next += (bitsleft ^ BITBUF_NBITS) >> 3; \ + bitsleft |= BITBUF_NBITS & ~7; \ +} while (0) + +/* + * Does the bitbuffer variable currently contain at least 'n' bits? + */ +#define HAVE_BITS(n) (bitsleft >= (n)) + +/* + * Load more bits from the input buffer until the specified number of bits is + * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE + * and CAN_ENSURE(). + */ +#define ENSURE_BITS(n) \ +if (!HAVE_BITS(n)) { \ + if (CPU_IS_LITTLE_ENDIAN() && \ + UNALIGNED_ACCESS_IS_FAST && \ + likely(in_end - in_next >= sizeof(bitbuf_t))) \ + FILL_BITS_WORDWISE(); \ + else \ + FILL_BITS_BYTEWISE(); \ +} + +/* + * Return the next 'n' bits from the bitbuffer variable without removing them. + */ +#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1)) + +/* + * Remove the next 'n' bits from the bitbuffer variable. + */ +#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n)) + +/* + * Remove and return the next 'n' bits from the bitbuffer variable. + */ +#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32) + +/* + * Verify that the input buffer hasn't been overread, then align the input to + * the next byte boundary, discarding any remaining bits in the current byte. + * + * Note that if the bitbuffer variable currently contains more than 7 bits, then + * we must rewind 'in_next', effectively putting those bits back. Only the bits + * in what would be the "current" byte if we were reading one byte at a time can + * be actually discarded. + */ +#define ALIGN_INPUT() \ +do { \ + SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \ + in_next -= (bitsleft >> 3) - overrun_count; \ + overrun_count = 0; \ + bitbuf = 0; \ + bitsleft = 0; \ +} while(0) + +/* + * Read a 16-bit value from the input. This must have been preceded by a call + * to ALIGN_INPUT(), and the caller must have already checked for overrun. + */ +#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16) + +/***************************************************************************** + * Huffman decoding * + *****************************************************************************/ + +/* + * A decode table for order TABLEBITS consists of a main table of (1 << + * TABLEBITS) entries followed by a variable number of subtables. + * + * The decoding algorithm takes the next TABLEBITS bits of compressed data and + * uses them as an index into the decode table. The resulting entry is either a + * "direct entry", meaning that it contains the value desired, or a "subtable + * pointer", meaning that the entry references a subtable that must be indexed + * using more bits of the compressed data to decode the symbol. + * + * Each decode table (a main table along with its subtables, if any) is + * associated with a Huffman code. Logically, the result of a decode table + * lookup is a symbol from the alphabet from which the corresponding Huffman + * code was constructed. A symbol with codeword length n <= TABLEBITS is + * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a + * symbol with codeword length n > TABLEBITS is associated with one or more + * subtable entries. + * + * On top of this basic design, we implement several optimizations: + * + * - We store the length of each codeword directly in each of its decode table + * entries. This allows the codeword length to be produced without indexing + * an additional table. + * + * - When beneficial, we don't store the Huffman symbol itself, but instead data + * generated from it. For example, when decoding an offset symbol in DEFLATE, + * it's more efficient if we can decode the offset base and number of extra + * offset bits directly rather than decoding the offset symbol and then + * looking up both of those values in an additional table or tables. + * + * The size of each decode table entry is 32 bits, which provides slightly + * better performance than 16-bit entries on 32 and 64 bit processers, provided + * that the table doesn't get so large that it takes up too much memory and + * starts generating cache misses. The bits of each decode table entry are + * defined as follows: + * + * - Bits 30 -- 31: flags (see below) + * - Bits 8 -- 29: decode result: a Huffman symbol or related data + * - Bits 0 -- 7: codeword length + */ + +/* + * This flag is set in all main decode table entries that represent subtable + * pointers. + */ +#define HUFFDEC_SUBTABLE_POINTER 0x80000000 + +/* + * This flag is set in all entries in the litlen decode table that represent + * literals. + */ +#define HUFFDEC_LITERAL 0x40000000 + +/* Mask for extracting the codeword length from a decode table entry. */ +#define HUFFDEC_LENGTH_MASK 0xFF + +/* Shift to extract the decode result from a decode table entry. */ +#define HUFFDEC_RESULT_SHIFT 8 + +/* Shift a decode result into its position in the decode table entry. */ +#define HUFFDEC_RESULT_ENTRY(result) ((u32)(result) << HUFFDEC_RESULT_SHIFT) + +/* The decode result for each precode symbol. There is no special optimization + * for the precode; the decode result is simply the symbol value. */ +static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = { +#define ENTRY(presym) HUFFDEC_RESULT_ENTRY(presym) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , +#undef ENTRY +}; + +/* The decode result for each litlen symbol. For literals, this is the literal + * value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length + * base and the number of extra length bits. */ +static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = { + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , +#undef ENTRY + +#define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF +#define HUFFDEC_LENGTH_BASE_SHIFT 8 +#define HUFFDEC_END_OF_BLOCK_LENGTH 0 + +#define ENTRY(length_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ + ((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits)) + + /* End of block */ + ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), + + /* Lengths */ + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), +#ifndef DEFLATE64 + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , +#else + ENTRY(3, 16) , ENTRY(3, 16) , ENTRY(3, 16) , +#endif +#undef ENTRY +}; + +/* The decode result for each offset symbol. This is the offset base and the + * number of extra offset bits. */ +static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = { + +#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 +#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) + +#define ENTRY(offset_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ + ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \ + (offset_base)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , +#undef ENTRY +}; + +/* + * Build a table for fast decoding of symbols from a Huffman code. As input, + * this function takes the codeword length of each symbol which may be used in + * the code. As output, it produces a decode table for the canonical Huffman + * code described by the codeword lengths. The decode table is built with the + * assumption that it will be indexed with "bit-reversed" codewords, where the + * low-order bit is the first bit of the codeword. This format is used for all + * Huffman codes in DEFLATE. + * + * @decode_table + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. + * @lens + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. + * @num_syms + * The number of symbols in the code, including all unused symbols. + * @decode_results + * An array which provides, for each symbol, the actual value to store into + * the decode table. This value will be directly produced as the result of + * decoding that symbol, thereby moving the indirection out of the decode + * loop and into the table initialization. + * @table_bits + * The log base-2 of the number of main table entries to use. + * @max_codeword_len + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * @sorted_syms + * A temporary array of length @num_syms. + * + * Returns %true if successful; %false if the codeword lengths do not form a + * valid Huffman code. + */ +static bool +build_decode_table(u32 decode_table[], + const len_t lens[], + const unsigned num_syms, + const u32 decode_results[], + const unsigned table_bits, + const unsigned max_codeword_len, + u16 *sorted_syms) +{ + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. */ + + /* + * Check whether the lengths form a complete code (exactly fills the + * codespace), an incomplete code (doesn't fill the codespace), or an + * overfull code (overflows the codespace). A codeword of length 'n' + * uses proportion '1/(2^n)' of the codespace. An overfull code is + * nonsensical, so is considered invalid. An incomplete code is + * considered valid only in two specific cases; see below. + */ + + /* overfull code? */ + if (unlikely(codespace_used > (1U << max_codeword_len))) + return false; + + /* incomplete code? */ + if (unlikely(codespace_used < (1U << max_codeword_len))) { + u32 entry; + unsigned i; + + if (codespace_used == 0) { + /* + * An empty code is allowed. This can happen for the + * offset code in DEFLATE, since a dynamic Huffman block + * need not contain any matches. + */ + + /* sym=0, len=1 (arbitrary) */ + entry = decode_results[0] | 1; + } else { + /* + * Allow codes with a single used symbol, with codeword + * length 1. The DEFLATE RFC is unclear regarding this + * case. What zlib's decompressor does is permit this + * for the litlen and offset codes and assume the + * codeword is '0' rather than '1'. We do the same + * except we allow this for precodes too, since there's + * no convincing reason to treat the codes differently. + * We also assign both codewords '0' and '1' to the + * symbol to avoid having to handle '1' specially. + */ + if (codespace_used != (1U << (max_codeword_len - 1)) || + len_counts[1] != 1) + return false; + entry = decode_results[*sorted_syms] | 1; + } + /* + * Note: the decode table still must be fully initialized, in + * case the stream is malformed and contains bits from the part + * of the codespace the incomplete code doesn't use. + */ + for (i = 0; i < (1U << table_bits); i++) + decode_table[i] = entry; + return true; + } + + /* + * The lengths form a complete code. Now, enumerate the codewords in + * lexicographic order and fill the decode table entries for each one. + * + * First, process all codewords with len <= table_bits. Each one gets + * '2^(table_bits-len)' direct entries in the table. + * + * Since DEFLATE uses bit-reversed codewords, these entries aren't + * consecutive but rather are spaced '2^len' entries apart. This makes + * filling them naively somewhat awkward and inefficient, since strided + * stores are less cache-friendly and preclude the use of word or + * vector-at-a-time stores to fill multiple entries per instruction. + * + * To optimize this, we incrementally double the table size. When + * processing codewords with length 'len', the table is treated as + * having only '2^len' entries, so each codeword uses just one entry. + * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. */ + decode_table[codeword] = + decode_results[*sorted_syms++] | len; + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. This entry contains the index of the + * start of the subtable and the number of bits with + * which the subtable is indexed (the log base 2 of the + * number of entries it contains). + */ + decode_table[subtable_prefix] = + HUFFDEC_SUBTABLE_POINTER | + HUFFDEC_RESULT_ENTRY(subtable_start) | + subtable_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = decode_results[*sorted_syms++] | (len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } +} + +/* Build the decode table for the precode. */ +static bool +build_precode_decode_table(struct libdeflate_decompressor *d) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms); +} + +/* Build the decode table for the literal/length code. */ +static bool +build_litlen_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms); +} + +/* Build the decode table for the offset code. */ +static bool +build_offset_decode_table(struct libdeflate_decompressor *d, + unsigned num_litlen_syms, unsigned num_offset_syms) +{ + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms); +} + +static forceinline machine_word_t +repeat_byte(u8 b) +{ + machine_word_t v; + + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDBITS == 64) ? 32 : 0); + return v; +} + +static forceinline void +copy_word_unaligned(const void *src, void *dst) +{ + store_word_unaligned(load_word_unaligned(src), dst); +} + +/***************************************************************************** + * Main decompression routine + *****************************************************************************/ + +typedef enum libdeflate_result (*decompress_func_t) + (struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +#undef DEFAULT_IMPL +#undef DISPATCH +#if defined(__i386__) || defined(__x86_64__) +# include "x86/decompress_impl.h" +#endif + +#ifndef DEFAULT_IMPL +# define FUNCNAME deflate_decompress_default +# define ATTRIBUTES +# ifdef GDEFLATE +# include "gdeflate_decompress_template.h" +# else +# include "decompress_template.h" +# endif +# define DEFAULT_IMPL deflate_decompress_default +#endif + +#ifdef DISPATCH +static enum libdeflate_result +dispatch(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + +static volatile decompress_func_t decompress_impl = dispatch; + +/* Choose the fastest implementation at runtime */ +static enum libdeflate_result +dispatch(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return (*f)(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} +#else +# define decompress_impl DEFAULT_IMPL /* only one implementation, use it */ +#endif + +#ifndef HIDE_INTERFACE + +/* + * This is the main DEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in decompress_template.h. The part here just + * handles calling the appropriate implementation depending on the CPU features + * at runtime. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +} + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} + +LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI +libdeflate_alloc_decompressor(void) +{ + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * But for simplicity, we currently just zero the whole decompressor. + */ + struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); + + if (d == NULL) + return NULL; + memset(d, 0, sizeof(*d)); + return d; +} + +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_decompressor(struct libdeflate_decompressor *d) +{ + libdeflate_free(d); +} + +#endif diff --git a/lib/gdeflate/libdeflate/lib/gdeflate_compress.c b/lib/gdeflate/libdeflate/lib/gdeflate_compress.c new file mode 100644 index 0000000..0c832da --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gdeflate_compress.c @@ -0,0 +1,246 @@ +/* + * gdeflate_compress.c - a compressor for GDEFLATE + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#define WITH_GDEFLATE +#define HIDE_INTERFACE + +#define libdeflate_compressor libdeflate_gdeflate_compressor + +#include "deflate_compress.c" + +LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI +libdeflate_alloc_gdeflate_compressor(int compression_level) +{ + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + if (compression_level < 0 || compression_level > 12) + return NULL; + +#if SUPPORT_NEAR_OPTIMAL_PARSING + if (compression_level >= 8) + size += sizeof(c->p.n); + else if (compression_level >= 1) + size += sizeof(c->p.g); +#else + if (compression_level >= 1) + size += sizeof(c->p.g); +#endif + + c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->min_size_to_compress = 56 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->impl = deflate_compress_none; + break; + case 1: + c->impl = deflate_compress_greedy; + c->max_search_depth = 2; + c->nice_match_length = 8; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 24; + c->nice_match_length = 24; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 20; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 40; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; +#if SUPPORT_NEAR_OPTIMAL_PARSING + case 8: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 12; + c->nice_match_length = 20; + c->p.n.num_optim_passes = 1; + break; + case 9: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 16; + c->nice_match_length = 26; + c->p.n.num_optim_passes = 2; + break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 30; + c->nice_match_length = 50; + c->p.n.num_optim_passes = 2; + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 60; + c->nice_match_length = 80; + c->p.n.num_optim_passes = 3; + break; + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 133; + c->p.n.num_optim_passes = 4; + break; +#else + case 8: + c->impl = deflate_compress_lazy; + c->max_search_depth = 150; + c->nice_match_length = 200; + break; + default: + c->impl = deflate_compress_lazy; + c->max_search_depth = 200; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; +#endif + } + + deflate_init_offset_slot_fast(c); + deflate_init_static_codes(c); + deflate_init_length_slot(); + + return c; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + struct libdeflate_gdeflate_out_page* out_pages, + size_t out_npages) +{ + const u8 * in_bytes = in; + size_t out_nbytes = 0; + size_t npages; + size_t upper_bound = libdeflate_gdeflate_compress_bound(c, + in_nbytes, &npages); + size_t page_upper_bound = upper_bound / npages; + + if (unlikely(out_pages == NULL || out_npages != npages)) + return 0; + + for (size_t page = 0; page < npages; page++) { + size_t comp_page_nbytes; + const size_t page_nbytes = in_nbytes > GDEFLATE_PAGE_SIZE ? + GDEFLATE_PAGE_SIZE : in_nbytes; + + if (unlikely(out_pages[page].nbytes < page_upper_bound)) { + return 0; + } + + comp_page_nbytes = (*c->impl)(c, in_bytes, page_nbytes, + out_pages[page].data, page_upper_bound); + + out_pages[page].nbytes = comp_page_nbytes; + + /* Page did not fit - bail out. */ + if (unlikely(comp_page_nbytes == 0)) + return 0; + + in_bytes += page_nbytes; + in_nbytes -= page_nbytes; + + out_nbytes += comp_page_nbytes; + } + + return out_nbytes; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress_ex(struct libdeflate_compressor *c, + const struct libdeflate_gdeflate_in_page* in_pages, + struct libdeflate_gdeflate_out_page* out_pages, size_t npages) +{ + if (unlikely(out_pages == NULL || in_pages == NULL)) + return 0; + + for (size_t page = 0; page < npages; page++) { + size_t comp_page_nbytes; + + comp_page_nbytes = (*c->impl)(c, in_pages[page].data, in_pages[page].nbytes, + out_pages[page].data, out_pages[page].nbytes); + + out_pages[page].nbytes = comp_page_nbytes; + + /* Page did not fit - bail out. */ + if (unlikely(comp_page_nbytes == 0)) + return page; + } + + return npages; +} + +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_gdeflate_compressor(struct libdeflate_compressor *c) +{ + libdeflate_aligned_free(c); +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes, size_t *out_npages) +{ + const size_t page_bound = libdeflate_deflate_compress_bound(c, GDEFLATE_PAGE_SIZE); + const size_t npages = DIV_ROUND_UP(in_nbytes, GDEFLATE_PAGE_SIZE); + + if (out_npages) + *out_npages = npages; + + return (page_bound + (NUM_STREAMS * BITS_PER_PACKET) / 8) * npages; +} diff --git a/lib/gdeflate/libdeflate/lib/gdeflate_decompress.c b/lib/gdeflate/libdeflate/lib/gdeflate_decompress.c new file mode 100644 index 0000000..fcf4c1d --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gdeflate_decompress.c @@ -0,0 +1,134 @@ +/* + * gdeflate_decompress.c - a decompressor for GDEFLATE + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#define WITH_GDEFLATE +#define HIDE_INTERFACE + +#define libdeflate_decompressor libdeflate_gdeflate_decompressor + +#include "deflate_decompress.c" + +/* + * This is the main GDEFLATE decompression routine. See libdeflate.h for the + * documentation. + * + * Note that the real code is in gdeflate_decompress_template.h. The part here + * just handles calling the appropriate implementation depending on the CPU + * features at runtime. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gdeflate_decompress(struct libdeflate_decompressor * restrict d, + struct libdeflate_gdeflate_in_page *in_pages, + size_t in_npages, void * restrict out, + size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + u8 * restrict out_bytes = out; + + if (unlikely(in_pages == NULL || in_npages == 0)) + return LIBDEFLATE_BAD_DATA; + + for (size_t npage = 0; npage < in_npages; npage++) { + size_t page_out_nbytes_ret, page_in_nbytes_ret; + enum libdeflate_result res; + + res = decompress_impl(d, in_pages[npage].data, + in_pages[npage].nbytes, out_bytes, + out_nbytes_avail, &page_in_nbytes_ret, + &page_out_nbytes_ret); + + if (unlikely(res != LIBDEFLATE_SUCCESS)) + return res; + + out_bytes += page_out_nbytes_ret; + out_nbytes_avail -= page_out_nbytes_ret; + + if (actual_out_nbytes_ret) + *actual_out_nbytes_ret += page_out_nbytes_ret; + } + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gdeflate_decompress_ex(struct libdeflate_gdeflate_decompressor * restrict d, + struct libdeflate_gdeflate_in_page * restrict in_pages, + struct libdeflate_gdeflate_out_page * restrict out_pages, size_t npages) +{ + if (unlikely(in_pages == NULL || out_pages == NULL || npages == 0)) + return LIBDEFLATE_BAD_DATA; + + for (size_t npage = 0; npage < npages; npage++) { + size_t page_out_nbytes_ret, page_in_nbytes_ret; + enum libdeflate_result res; + + res = decompress_impl(d, in_pages[npage].data, + in_pages[npage].nbytes, out_pages[npage].data, + out_pages[npage].nbytes, &page_in_nbytes_ret, + &page_out_nbytes_ret); + + if (unlikely(res != LIBDEFLATE_SUCCESS)) + return res; + } + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI +libdeflate_alloc_gdeflate_decompressor(void) +{ + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * But for simplicity, we currently just zero the whole decompressor. + */ + struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); + + if (d == NULL) + return NULL; + memset(d, 0, sizeof(*d)); + return d; +} + +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_gdeflate_decompressor(struct libdeflate_decompressor *d) +{ + libdeflate_free(d); +} diff --git a/lib/gdeflate/libdeflate/lib/gdeflate_decompress_template.h b/lib/gdeflate/libdeflate/lib/gdeflate_decompress_template.h new file mode 100644 index 0000000..f8f6533 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gdeflate_decompress_template.h @@ -0,0 +1,611 @@ +/* + * gdeflate_decompress_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This is the actual GDEFLATE decompression routine, lifted out of + * gdeflate_decompress.c so that it can be compiled multiple times with + * different target instruction sets. + */ + +/* + * Does the bitbuffer variable currently contain at least 'n' bits? + */ +#undef HAVE_BITS +#define HAVE_BITS(n) (s->bitsleft[s->idx] >= (n)) + +/* + * Load more bits from the input buffer until the specified number of bits is + * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE + * and CAN_ENSURE(). + */ +#undef ENSURE_BITS +#define ENSURE_BITS(n) \ +if (!HAVE_BITS(n)) { \ + s->bitbuf[s->idx] |= (bitbuf_t)(get_unaligned_le32(in_next)) << \ + s->bitsleft[s->idx]; \ + in_next += BITS_PER_PACKET/8; \ + s->bitsleft[s->idx] += BITS_PER_PACKET; \ +} + +/* + * Return the next 'n' bits from the bitbuffer variable without removing them. + */ +#undef BITS +#define BITS(n) ((u32)s->bitbuf[s->idx] & (((u32)1 << (n)) - 1)) + +/* + * Remove the next 'n' bits from the bitbuffer variable. + */ +#undef REMOVE_BITS +#define REMOVE_BITS(n) (s->bitbuf[s->idx] >>= (n), s->bitsleft[s->idx] -= (n)) + +/* + * Setup copy advance method depending on a number of streams used. + */ +#if (NUM_STREAMS == 32) +#define ADVANCE_COPIES() is_copy = _rotr(is_copy, 1) +#else +# pragma message("Invalid number of GDeflate streams used!") +#endif + +/* + * Reset GDeflate stream index. + */ +#define RESET() s->idx = 0 + +/* + * Advance GDeflate stream index. Refill bits if necessary. + */ +#define ADVANCE() do { \ + ENSURE_BITS(LOW_WATERMARK_BITS); \ + s->idx = (s->idx + 1)%NUM_STREAMS; \ + ADVANCE_COPIES(); \ +} while(0) + +/* + * Tells if current GDeflate stream has a deferred copy. + */ +#define IS_COPY() (is_copy&1) + +/* + * Stores a deferred copy in current GDeflate stream. + */ +#define STORE_COPY(len, out) do { \ + s->copies[s->idx].length = len; \ + s->copies[s->idx].out_next = out; \ + is_copy |= 1; \ +} while(0) + +/* + * Marks a copy in current current GDeflate stream as complete. + */ +#define COPY_COMPLETE() (is_copy &= ~1) + +/* + * Prevent multiple type declarations. + */ +#ifndef GDEFLATE_TYPES_DECLARED + +/* + * Setup is_copy type depending on a number of streams used. + */ +#if (NUM_STREAMS == 32) +typedef u32 is_copy_t; +#else +# pragma message("Invalid number of GDeflate streams used!") +#endif + +/* + * GDeflate deferred copy state structure. + */ +struct gdeflate_deferred_copy { + unsigned length; + u8 * out_next; +}; + +/* + * GDeflate state structure. + */ +struct gdeflate_state { + bitbuf_t bitbuf[NUM_STREAMS]; + unsigned bitsleft[NUM_STREAMS]; + struct gdeflate_deferred_copy copies[NUM_STREAMS]; + unsigned idx; +}; + +/* + * Prevent multiple type declarations. + */ +#define GDEFLATE_TYPES_DECLARED + +#endif /* GDEFLATE_TYPES_DECLARED */ + +/* + * Specialize gdeflate_do_copy function name. + */ +#define DO_COPY CONCAT(FUNCNAME, _gdeflate_do_copy) + +/* + * Perform a deferred GDeflate copy. + */ +static forceinline enum libdeflate_result ATTRIBUTES +DO_COPY(struct libdeflate_decompressor * restrict d, + struct gdeflate_state * s, u8 * out, u8 * const out_end) +{ + u32 entry; + u32 offset; + const u8 *src; + u8 *dst; + u32 tmp32; + + /* Pop match params. */ + u32 length = s->copies[s->idx].length; + u8 * out_next = s->copies[s->idx].out_next; + + /* Decode the match offset. */ + + entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Offset subtable required (uncommon case) */ + REMOVE_BITS(OFFSET_TABLEBITS); + entry = d->offset_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + + entry >>= HUFFDEC_RESULT_SHIFT; + + /* Pop the extra offset bits and add them to the offset base to + * produce the full offset. */ + offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + + POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); + + /* The match source must not begin before the beginning of the + * output buffer. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + + /* + * Copy the match: 'length' bytes at 'out_next - offset' to + * 'out_next', possibly overlapping. If the match doesn't end + * too close to the end of the buffer and offset >= WORDBYTES || + * offset == 1, take a fast path which copies a word at a time + * -- potentially more than the length of the match, but that's + * fine as long as we check for enough extra space. + * + * The remaining cases are not performance-critical so are + * handled by a simple byte-by-byte copy. + */ + src = out_next - offset; + dst = out_next; + out_next += length; + + if (UNALIGNED_ACCESS_IS_FAST && + likely(out_end - out_next >= WORDBYTES && length >= WORDBYTES)) { + if (offset >= WORDBYTES) { /* words don't overlap? */ + while (dst < out_next - WORDBYTES) { + copy_word_unaligned(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + } + /* Tail. */ + while (dst < out_next) *dst++ = *src++; + } else if (offset == 1) { + /* RLE encoding of previous byte, common if the + * data contains many repeated bytes */ + machine_word_t v = repeat_byte(*src); + + while (dst < out_next - WORDBYTES) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + /* Tail. */ + while (dst < out_next) *dst++ = (u8)v; + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } else { + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + + return LIBDEFLATE_SUCCESS; +} + +static enum libdeflate_result ATTRIBUTES +FUNCNAME(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) +{ + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + struct gdeflate_state state; + struct gdeflate_state * s = &state; + unsigned i; + unsigned is_final_block; + unsigned block_type; + u16 len; + unsigned num_litlen_syms; + unsigned num_offset_syms; + u32 tmp32; + is_copy_t is_copy = 0; + + /* Starting to read GDeflate stream. */ + RESET(); + for (unsigned n = 0; n < NUM_STREAMS; n++) { + s->bitbuf[n] = 0; + s->bitsleft[n] = 0; + s->copies[n].length = 0; + ADVANCE(); + } + +next_block: + /* Starting to read the next block. */ + RESET(); + + /* BFINAL: 1 bit */ + is_final_block = POP_BITS(1); + + /* BTYPE: 2 bits */ + block_type = POP_BITS(2); + + ENSURE_BITS(LOW_WATERMARK_BITS); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block. */ + + /* The order in which precode lengths are stored. */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + + /* Read the codeword length counts. */ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); + num_litlen_syms = POP_BITS(5) + 257; + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); + num_offset_syms = POP_BITS(5) + 1; + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); + num_explicit_precode_lens = POP_BITS(4) + 4; + + d->static_codes_loaded = false; + + ENSURE_BITS(LOW_WATERMARK_BITS); + + /* Read the precode codeword lengths. */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + for (i = 0; i < num_explicit_precode_lens; i++) { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); + + ADVANCE(); + } + + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + RESET(); + + /* Expand the literal/length and offset codeword lengths. */ + for (i = 0; i < num_litlen_syms + num_offset_syms; ) { + u32 entry; + unsigned presym; + u8 rep_val; + unsigned rep_count; + + /* (The code below assumes that the precode decode table + * does not have any subtables.) */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Read the next precode symbol. */ + entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + + presym = entry >> HUFFDEC_RESULT_SHIFT; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + ADVANCE(); + continue; + } + + /* Run-length encoded codeword lengths */ + + /* Note: we don't need verify that the repeat count + * doesn't overflow the number of elements, since we + * have enough extra spaces to allow for the worst-case + * overflow (138 zeroes when only 1 length was + * remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times */ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); + rep_count = 3 + POP_BITS(2); + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times */ + STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); + rep_count = 3 + POP_BITS(3); + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times */ + STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); + rep_count = 11 + POP_BITS(7); + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + + ADVANCE(); + } + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + + /* Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. */ + + /* Count bits in the bit buffers. */ + u32 num_buffered_bits = 0; + for (u32 n = 0; n < NUM_STREAMS; n++) + num_buffered_bits += s->bitsleft[n]; + + SAFETY_CHECK(in_end - in_next + (num_buffered_bits + 7)/8 >= 2); + + len = POP_BITS(16); + + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next + (num_buffered_bits + 7)/8); + + while (len) { + *out_next++ = POP_BITS(8); + len--; + ADVANCE(); + } + + goto block_done; + + } else { + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. + */ + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); +have_decode_tables: + + RESET(); + + /* The main GDEFLATE decode loop */ + for (;;) { + u32 entry; + u32 length; + + if (likely(!IS_COPY())) { + /* Decode a litlen symbol. */ + entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)]; + if (entry & HUFFDEC_SUBTABLE_POINTER) { + /* Litlen subtable required (uncommon case) */ + REMOVE_BITS(LITLEN_TABLEBITS); + entry = d->u.litlen_decode_table[ + ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + + BITS(entry & HUFFDEC_LENGTH_MASK)]; + } + REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); + if (entry & HUFFDEC_LITERAL) { + /* Literal */ + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); + ADVANCE(); + continue; + } + + /* Match or end-of-block */ + + entry >>= HUFFDEC_RESULT_SHIFT; + + /* Pop the extra length bits and add them to the length base to + * produce the full length. */ + length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + + POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); + + /* The match destination must not end after the end of the + * output buffer. For efficiency, combine this check with the + * end-of-block check. We're using 0 for the special + * end-of-block length, so subtract 1 and it turn it into + * SIZE_MAX. */ + STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); + if (unlikely((size_t)length - 1 >= out_end - out_next)) { + if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + goto block_done; + } + + /* Store copy for use later. */ + STORE_COPY(length, out_next); + + /* Advance output stream. */ + out_next += length; + } else { + enum libdeflate_result res = + DO_COPY(d, s, out, out_end); + + if (unlikely(res)) + return res; + + COPY_COMPLETE(); + } + + ADVANCE(); + } + +block_done: + + /* Run the outstanding deferred copies. */ + + for (unsigned n = 0; n < NUM_STREAMS; n++) { + if (IS_COPY()) { + enum libdeflate_result res = + DO_COPY(d, s, out, out_end); + + if (unlikely(res)) + return res; + + COPY_COMPLETE(); + } + + ADVANCE(); + } + + /* Finished decoding a block. */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + /* Optionally return the actual number of bytes read */ + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + /* Optionally return the actual number of bytes written */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; +} + +#undef FUNCNAME +#undef ATTRIBUTES +#undef IS_COPY +#undef DO_COPY +#undef ADVANCE_COPIES +#undef STORE_COPY +#undef COPY_COMPLETE +#undef RESET +#undef ADVANCE diff --git a/lib/gdeflate/libdeflate/lib/gzip_compress.c b/lib/gdeflate/libdeflate/lib/gzip_compress.c new file mode 100644 index 0000000..3cb8803 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gzip_compress.c @@ -0,0 +1,95 @@ +/* + * gzip_compress.c - compress with a gzip wrapper + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "deflate_compress.h" +#include "gzip_constants.h" +#include "unaligned.h" + +#include "libdeflate.h" + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gzip_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + unsigned compression_level; + u8 xfl; + size_t deflate_size; + + if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) + return 0; + + /* ID1 */ + *out_next++ = GZIP_ID1; + /* ID2 */ + *out_next++ = GZIP_ID2; + /* CM */ + *out_next++ = GZIP_CM_DEFLATE; + /* FLG */ + *out_next++ = 0; + /* MTIME */ + put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); + out_next += 4; + /* XFL */ + xfl = 0; + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + xfl |= GZIP_XFL_FASTEST_COMPRESSION; + else if (compression_level >= 8) + xfl |= GZIP_XFL_SLOWEST_COMPRESSION; + *out_next++ = xfl; + /* OS */ + *out_next++ = GZIP_OS_UNKNOWN; /* OS */ + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - GZIP_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* CRC32 */ + put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); + out_next += 4; + + /* ISIZE */ + put_unaligned_le32((u32)in_nbytes, out_next); + out_next += 4; + + return out_next - (u8 *)out; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + return GZIP_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); +} diff --git a/lib/gdeflate/libdeflate/lib/gzip_constants.h b/lib/gdeflate/libdeflate/lib/gzip_constants.h new file mode 100644 index 0000000..35e4728 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gzip_constants.h @@ -0,0 +1,45 @@ +/* + * gzip_constants.h - constants for the gzip wrapper format + */ + +#ifndef LIB_GZIP_CONSTANTS_H +#define LIB_GZIP_CONSTANTS_H + +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 +#define GZIP_XFL_FASTEST_COMPRESSION 0x04 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 + +#endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/lib/gdeflate/libdeflate/lib/gzip_decompress.c b/lib/gdeflate/libdeflate/lib/gzip_decompress.c new file mode 100644 index 0000000..1b31d8a --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/gzip_decompress.c @@ -0,0 +1,148 @@ +/* + * gzip_decompress.c - decompress with a gzip wrapper + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "gzip_constants.h" +#include "unaligned.h" + +#include "libdeflate.h" + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return LIBDEFLATE_BAD_DATA; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return LIBDEFLATE_BAD_DATA; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + in_next += 1; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return LIBDEFLATE_BAD_DATA; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_le16(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, + actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* CRC32 */ + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + /* ISIZE */ + if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gzip_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} diff --git a/lib/gdeflate/libdeflate/lib/hc_matchfinder.h b/lib/gdeflate/libdeflate/lib/hc_matchfinder.h new file mode 100644 index 0000000..b81d32c --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/hc_matchfinder.h @@ -0,0 +1,412 @@ +/* + * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. + * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * + * The longest_match() and skip_positions() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task and in + * combination are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. + * + * ---------------------------------------------------------------------------- + */ + +#ifndef LIB_HC_MATCHFINDER_H +#define LIB_HC_MATCHFINDER_H + +#include "matchfinder_common.h" + +#define HC_MATCHFINDER_HASH3_ORDER 15 +#define HC_MATCHFINDER_HASH4_ORDER 16 + +#define HC_MATCHFINDER_TOTAL_HASH_SIZE \ + (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ + (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t)) + +struct hc_matchfinder { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ + mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; + +} +#ifdef _aligned_attribute + _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) +#endif +; + +/* Prepare the matchfinder for a new input buffer. */ +static forceinline void +hc_matchfinder_init(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE % + MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE); +} + +static forceinline void +hc_matchfinder_slide_window(struct hc_matchfinder *mf) +{ + STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0); + + matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf)); +} + +/* + * Find the longest match longer than 'best_len' bytes. + * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @cur_pos + * The current position in the input buffer relative to @in_base (the + * position of the sequence being matched against). + * @best_len + * Require a match longer than this length. + * @max_len + * The maximum permissible match length at this position. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @offset_ret + * If a match is found, its offset is returned in this location. + * + * Return the length of the match found, or 'best_len' if no match longer than + * 'best_len' was found. + */ +static forceinline u32 +hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, + const u8 ** const restrict in_base_p, + const u8 * const restrict in_next, + u32 best_len, + const u32 max_len, + const u32 nice_len, + const u32 max_search_depth, + u32 * const restrict next_hashes, + u32 * const restrict offset_ret) +{ + u32 depth_remaining = max_search_depth; + const u8 *best_matchptr = in_next; + mf_pos_t cur_node3, cur_node4; + u32 hash3, hash4; + u32 next_hashseq; + u32 seq4; + const u8 *matchptr; + u32 len; + u32 cur_pos = in_next - *in_base_p; + const u8 *in_base; + mf_pos_t cutoff; + + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + + in_base = *in_base_p; + cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; + + if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ + goto out; + + /* Get the precomputed hash codes. */ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; + + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_hashseq = get_unaligned_le32(in_next + 1); + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (cur_node3 <= cutoff) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_base[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (cur_node4 <= cutoff) + goto out; + + for (;;) { + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_base[cur_node4]; + + if (load_u32_unaligned(matchptr) == seq4) + break; + + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + /* Found a match of length >= 4. Extend it to its full length. */ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 4, max_len); + if (best_len >= nice_len) + goto out; + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } else { + if (cur_node4 <= cutoff || best_len >= nice_len) + goto out; + } + + /* Check for matches of length >= 5. */ + + for (;;) { + for (;;) { + matchptr = &in_base[cur_node4]; + + /* Already found a length 4 match. Try for a longer + * match; start by checking either the last 4 bytes and + * the first 4 bytes, or the last byte. (The last byte, + * the one which would extend the match length by 1, is + * the most important.) */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } + + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif + len = lz_extend(in_next, matchptr, len, max_len); + if (len > best_len) { + /* This is the new longest match. */ + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; + if (cur_node4 <= cutoff || !--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + return best_len; +} + +/* + * Advance the matchfinder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_base_p + * Location of a pointer which points to the place in the input data the + * matchfinder currently stores positions relative to. This may be updated + * by this function. + * @cur_pos + * The current position in the input buffer relative to @in_base. + * @end_pos + * The end position of the input buffer, relative to @in_base. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. + * @count + * The number of bytes to advance. Must be > 0. + * + * Returns @in_next + @count. + */ +static forceinline const u8 * +hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, + const u8 ** const restrict in_base_p, + const u8 *in_next, + const u8 * const in_end, + const u32 count, + u32 * const restrict next_hashes) +{ + u32 cur_pos; + u32 hash3, hash4; + u32 next_hashseq; + u32 remaining = count; + + if (unlikely(count + 5 > in_end - in_next)) + return &in_next[count]; + + cur_pos = in_next - *in_base_p; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + if (cur_pos == MATCHFINDER_WINDOW_SIZE) { + hc_matchfinder_slide_window(mf); + *in_base_p += MATCHFINDER_WINDOW_SIZE; + cur_pos = 0; + } + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + next_hashseq = get_unaligned_le32(++in_next); + hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; + + return in_next; +} + +#endif /* LIB_HC_MATCHFINDER_H */ diff --git a/lib/gdeflate/libdeflate/lib/lib_common.h b/lib/gdeflate/libdeflate/lib/lib_common.h new file mode 100644 index 0000000..2eea56c --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/lib_common.h @@ -0,0 +1,67 @@ +/* + * lib_common.h - internal header included by all library code + */ + +#ifndef LIB_LIB_COMMON_H +#define LIB_LIB_COMMON_H + +#ifdef LIBDEFLATE_H +# error "lib_common.h must always be included before libdeflate.h" + /* because BUILDING_LIBDEFLATE must be set first */ +#endif + +#define BUILDING_LIBDEFLATE + +#include "../common/common_defs.h" + +/* + * Prefix with "_libdeflate_" all global symbols which are not part of the API + * and don't already have a "libdeflate" prefix. This avoids exposing overly + * generic names when libdeflate is built as a static library. + * + * Note that the chosen prefix is not really important and can be changed + * without breaking library users. It was just chosen so that the resulting + * symbol names are unlikely to conflict with those from any other software. + * Also note that this fixup has no useful effect when libdeflate is built as a + * shared library, since these symbols are not exported. + */ +#define SYM_FIXUP(sym) _libdeflate_##sym +#define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level) +#define _cpu_features SYM_FIXUP(_cpu_features) +#define setup_cpu_features SYM_FIXUP(setup_cpu_features) + +void *libdeflate_malloc(size_t size); +void libdeflate_free(void *ptr); + +void *libdeflate_aligned_malloc(size_t alignment, size_t size); +void libdeflate_aligned_free(void *ptr); + +#ifdef FREESTANDING +/* + * With -ffreestanding, may be missing, and we must provide + * implementations of memset(), memcpy(), memmove(), and memcmp(). + * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html + * + * Also, -ffreestanding disables interpreting calls to these functions as + * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call, + * not be optimized to a single load instruction. For performance reasons we + * don't want that. So, declare these functions as macros that expand to the + * corresponding built-ins. This approach is recommended in the gcc man page. + * We still need the actual function definitions in case gcc calls them. + */ +void *memset(void *s, int c, size_t n); +#define memset(s, c, n) __builtin_memset((s), (c), (n)) + +void *memcpy(void *dest, const void *src, size_t n); +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) + +void *memmove(void *dest, const void *src, size_t n); +#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) + +int memcmp(const void *s1, const void *s2, size_t n); +#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) +#else +#include +#endif + +#endif /* LIB_LIB_COMMON_H */ diff --git a/lib/gdeflate/libdeflate/lib/matchfinder_common.h b/lib/gdeflate/libdeflate/lib/matchfinder_common.h new file mode 100644 index 0000000..423529d --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/matchfinder_common.h @@ -0,0 +1,180 @@ +/* + * matchfinder_common.h - common code for Lempel-Ziv matchfinding + */ + +#ifndef LIB_MATCHFINDER_COMMON_H +#define LIB_MATCHFINDER_COMMON_H + +#include "lib_common.h" +#include "unaligned.h" + +#ifndef MATCHFINDER_WINDOW_ORDER +# error "MATCHFINDER_WINDOW_ORDER must be defined!" +#endif + +#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER) + +#ifdef DEFLATE64 +typedef s32 mf_pos_t; +#else +typedef s16 mf_pos_t; +#endif + +#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) + +/* + * Required alignment of the matchfinder buffer pointer and size. The values + * here come from the AVX-2 implementation, which is the worst case. + */ +#define MATCHFINDER_MEM_ALIGNMENT 32 +#define MATCHFINDER_SIZE_ALIGNMENT 128 + +#undef matchfinder_init +#undef matchfinder_rebase +#if defined(_aligned_attribute) && !defined(DEFLATE64) +# if defined(__arm__) || defined(__aarch64__) +# include "arm/matchfinder_impl.h" +# elif defined(__i386__) || defined(__x86_64__) +# include "x86/matchfinder_impl.h" +# endif +#endif + +/* + * Initialize the hash table portion of the matchfinder. + * + * Essentially, this is an optimized memset(). + * + * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and + * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_init +static forceinline void +matchfinder_init(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; +} +#endif + +/* + * Slide the matchfinder by WINDOW_SIZE bytes. + * + * This must be called just after each WINDOW_SIZE bytes have been run through + * the matchfinder. + * + * This will subtract WINDOW_SIZE bytes from each entry in the array specified. + * The effect is that all entries are updated to be relative to the current + * position, rather than the position WINDOW_SIZE bytes prior. + * + * Underflow is detected and replaced with signed saturation. This ensures that + * once the sliding window has passed over a position, that position forever + * remains out of bounds. + * + * The array passed in must contain all matchfinder data that is + * position-relative. Concretely, this will include the hash table as well as + * the table of positions that is used to link together the sequences in each + * hash bucket. Note that in the latter table, the links are 1-ary in the case + * of "hash chains", and 2-ary in the case of "binary trees". In either case, + * the links need to be rebased in the same way. + * + * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and + * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT. + */ +#ifndef matchfinder_rebase +static forceinline void +matchfinder_rebase(mf_pos_t *data, size_t size) +{ + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768 && sizeof(mf_pos_t) == 2) { + /* Branchless version for 32768 byte windows. If the value was + * already negative, clear all bits except the sign bit; this + * changes the value to -32768. Otherwise, set the sign bit; + * this is equivalent to subtracting 32768. */ + for (i = 0; i < num_entries; i++) { + u16 v = data[i]; + u16 sign_bit = v & 0x8000; + v &= sign_bit - ((sign_bit >> 15) ^ 1); + v |= 0x8000; + data[i] = v; + } + return; + } + + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } +} +#endif + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. Discard any + * bits of the product that don't fit in a 32-bit value, but take the + * next-highest @num_bits bits of the product as the hash value, as those have + * the most randomness. + */ +static forceinline u32 +lz_hash(u32 seq, unsigned num_bits) +{ + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); +} + +/* + * Return the number of bytes at @matchptr that match the bytes at @strptr, up + * to a maximum of @max_len. Initially, @start_len bytes are matched. + */ +static forceinline unsigned +lz_extend(const u8 * const strptr, const u8 * const matchptr, + const unsigned start_len, const unsigned max_len) +{ + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDBYTES)) { + + #define COMPARE_WORD_STEP \ + v_word = load_word_unaligned(&matchptr[len]) ^ \ + load_word_unaligned(&strptr[len]); \ + if (v_word != 0) \ + goto word_differs; \ + len += WORDBYTES; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + #undef COMPARE_WORD_STEP + } + + while (len + WORDBYTES <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDBYTES; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + +word_differs: + if (CPU_IS_LITTLE_ENDIAN()) + len += (bsfw(v_word) >> 3); + else + len += (WORDBITS - 1 - bsrw(v_word)) >> 3; + return len; +} + +#endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/lib/gdeflate/libdeflate/lib/unaligned.h b/lib/gdeflate/libdeflate/lib/unaligned.h new file mode 100644 index 0000000..bb48bf8 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/unaligned.h @@ -0,0 +1,228 @@ +/* + * unaligned.h - inline functions for unaligned memory accesses + */ + +#ifndef LIB_UNALIGNED_H +#define LIB_UNALIGNED_H + +#include "lib_common.h" + +/***** Unaligned loads and stores without endianness conversion *****/ + +/* + * memcpy() is portable, and it usually gets optimized appropriately by modern + * compilers. I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled + * to a load or store instruction, not to an actual function call. + * + * We no longer use the "packed struct" approach, as that is nonstandard, has + * unclear semantics, and doesn't receive enough testing + * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994). + * + * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception + * where memcpy() generates inefficient code + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer + * consider that one case important enough to maintain different code for. + * If you run into it, please just use a newer version of gcc (or use clang). + */ + +#define DEFINE_UNALIGNED_TYPE(type) \ +static forceinline type \ +load_##type##_unaligned(const void *p) \ +{ \ + type v; \ + memcpy(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static forceinline void \ +store_##type##_unaligned(type v, void *p) \ +{ \ + memcpy(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16) +DEFINE_UNALIGNED_TYPE(u32) +DEFINE_UNALIGNED_TYPE(u64) +DEFINE_UNALIGNED_TYPE(machine_word_t) + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +/***** Unaligned loads with endianness conversion *****/ + +static forceinline u16 +get_unaligned_le16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[1] << 8) | p[0]; +} + +static forceinline u16 +get_unaligned_be16(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be16_bswap(load_u16_unaligned(p)); + else + return ((u16)p[0] << 8) | p[1]; +} + +static forceinline u32 +get_unaligned_le32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[3] << 24) | ((u32)p[2] << 16) | + ((u32)p[1] << 8) | p[0]; +} + +static forceinline u32 +get_unaligned_be32(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_bswap(load_u32_unaligned(p)); + else + return ((u32)p[0] << 24) | ((u32)p[1] << 16) | + ((u32)p[2] << 8) | p[3]; +} + +static forceinline u64 +get_unaligned_le64(const u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le64_bswap(load_u64_unaligned(p)); + else + return ((u64)p[7] << 56) | ((u64)p[6] << 48) | + ((u64)p[5] << 40) | ((u64)p[4] << 32) | + ((u64)p[3] << 24) | ((u64)p[2] << 16) | + ((u64)p[1] << 8) | p[0]; +} + +static forceinline machine_word_t +get_unaligned_leword(const u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + return get_unaligned_le32(p); + else + return get_unaligned_le64(p); +} + +/***** Unaligned stores with endianness conversion *****/ + +static forceinline void +put_unaligned_le16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(le16_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + } +} + +static forceinline void +put_unaligned_be16(u16 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u16_unaligned(be16_bswap(v), p); + } else { + p[0] = (u8)(v >> 8); + p[1] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(le32_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + } +} + +static forceinline void +put_unaligned_be32(u32 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u32_unaligned(be32_bswap(v), p); + } else { + p[0] = (u8)(v >> 24); + p[1] = (u8)(v >> 16); + p[2] = (u8)(v >> 8); + p[3] = (u8)(v >> 0); + } +} + +static forceinline void +put_unaligned_le64(u64 v, u8 *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_u64_unaligned(le64_bswap(v), p); + } else { + p[0] = (u8)(v >> 0); + p[1] = (u8)(v >> 8); + p[2] = (u8)(v >> 16); + p[3] = (u8)(v >> 24); + p[4] = (u8)(v >> 32); + p[5] = (u8)(v >> 40); + p[6] = (u8)(v >> 48); + p[7] = (u8)(v >> 56); + } +} + +static forceinline void +put_unaligned_leword(machine_word_t v, u8 *p) +{ + STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); + if (WORDBITS == 32) + put_unaligned_le32(v, p); + else + put_unaligned_le64(v, p); +} + +/***** 24-bit loads *****/ + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. + */ +static forceinline u32 +loaded_u32_to_u24(u32 v) +{ + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from the memory location @p into the 24 low-order bits + * of a 32-bit value. The order in which the 3 bytes will be arranged as octets + * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES + * bytes must be available at @p; note that this may be more than 3. + */ +static forceinline u32 +load_u24_unaligned(const u8 *p) +{ +#if UNALIGNED_ACCESS_IS_FAST +# define LOAD_U24_REQUIRED_NBYTES 4 + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else +# define LOAD_U24_REQUIRED_NBYTES 3 + if (CPU_IS_LITTLE_ENDIAN()) + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); + else + return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); +#endif +} + +#endif /* LIB_UNALIGNED_H */ diff --git a/lib/gdeflate/libdeflate/lib/utils.c b/lib/gdeflate/libdeflate/lib/utils.c new file mode 100644 index 0000000..c626af1 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/utils.c @@ -0,0 +1,142 @@ +/* + * utils.c - utility functions for libdeflate + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "lib_common.h" + +#include "libdeflate.h" + +#ifdef FREESTANDING +# define malloc NULL +# define free NULL +#else +# include +#endif + +static void *(*libdeflate_malloc_func)(size_t) = malloc; +static void (*libdeflate_free_func)(void *) = free; + +void * +libdeflate_malloc(size_t size) +{ + return (*libdeflate_malloc_func)(size); +} + +void +libdeflate_free(void *ptr) +{ + (*libdeflate_free_func)(ptr); +} + +void * +libdeflate_aligned_malloc(size_t alignment, size_t size) +{ + void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size); + if (ptr) { + void *orig_ptr = ptr; + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; +} + +void +libdeflate_aligned_free(void *ptr) +{ + if (ptr) + libdeflate_free(((void **)ptr)[-1]); +} + +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), + void (*free_func)(void *)) +{ + libdeflate_malloc_func = malloc_func; + libdeflate_free_func = free_func; +} + +/* + * Implementations of libc functions for freestanding library builds. + * Normal library builds don't use these. Not optimized yet; usually the + * compiler expands these functions and doesn't actually call them anyway. + */ +#ifdef FREESTANDING +#undef memset +void * __attribute__((weak)) +memset(void *s, int c, size_t n) +{ + u8 *p = s; + size_t i; + + for (i = 0; i < n; i++) + p[i] = c; + return s; +} + +#undef memcpy +void * __attribute__((weak)) +memcpy(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + for (i = 0; i < n; i++) + d[i] = s[i]; + return dest; +} + +#undef memmove +void * __attribute__((weak)) +memmove(void *dest, const void *src, size_t n) +{ + u8 *d = dest; + const u8 *s = src; + size_t i; + + if (d <= s) + return memcpy(d, s, n); + + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + return dest; +} + +#undef memcmp +int __attribute__((weak)) +memcmp(const void *s1, const void *s2, size_t n) +{ + const u8 *p1 = s1; + const u8 *p2 = s2; + size_t i; + + for (i = 0; i < n; i++) { + if (p1[i] != p2[i]) + return (int)p1[i] - (int)p2[i]; + } + return 0; +} +#endif /* FREESTANDING */ diff --git a/lib/gdeflate/libdeflate/lib/x86/adler32_impl.h b/lib/gdeflate/libdeflate/lib/x86/adler32_impl.h new file mode 100644 index 0000000..f89bde5 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/adler32_impl.h @@ -0,0 +1,337 @@ +/* + * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_ADLER32_IMPL_H +#define LIB_X86_ADLER32_IMPL_H + +#include "cpu_features.h" + +/* + * The following macros horizontally sum the s1 counters and add them to the + * real s1, and likewise for s2. They do this via a series of reductions, each + * of which halves the vector length, until just one counter remains. + * + * The s1 reductions don't depend on the s2 reductions and vice versa, so for + * efficiency they are interleaved. Also, every other s1 counter is 0 due to + * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than + * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits. + */ + +#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \ +{ \ + __v4su s1_last = (v_s1), s2_last = (v_s2); \ + \ + /* 128 => 32 bits */ \ + s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \ + s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \ + s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \ + \ + *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \ + *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \ +} + +#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \ +{ \ + __v4su s1_128bit, s2_128bit; \ + \ + /* 256 => 128 bits */ \ + s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \ + (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \ + s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \ + (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \ + \ + ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ +} + +#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \ +{ \ + __v8su s1_256bit, s2_256bit; \ + \ + /* 512 => 256 bits */ \ + s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \ + (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \ + s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \ + (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \ + \ + ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \ +} + +/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */ +#undef DISPATCH_AVX512BW +#if !defined(DEFAULT_IMPL) && \ + /* + * clang before v3.9 is missing some AVX-512BW intrinsics including + * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using + * AVX-512BW, even when __AVX512BW__ is defined, conditional on + * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin. + */ \ + COMPILER_SUPPORTS_AVX512BW_TARGET && \ + (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS)) +# define FUNCNAME adler32_avx512bw +# define FUNCNAME_CHUNK adler32_avx512bw_chunk +# define IMPL_ALIGNMENT 64 +# define IMPL_SEGMENT_SIZE 64 +# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE +# ifdef __AVX512BW__ +# define ATTRIBUTES +# define DEFAULT_IMPL adler32_avx512bw +# else +# define ATTRIBUTES __attribute__((target("avx512bw"))) +# define DISPATCH 1 +# define DISPATCH_AVX512BW 1 +# endif +# include +static forceinline ATTRIBUTES void +adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end, + u32 *s1, u32 *s2) +{ + const __m512i zeroes = _mm512_setzero_si512(); + const __v64qi multipliers = (__v64qi){ + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const __v32hi ones = (__v32hi)_mm512_set1_epi16(1); + __v16si v_s1 = (__v16si)zeroes; + __v16si v_s1_sums = (__v16si)zeroes; + __v16si v_s2 = (__v16si)zeroes; + + do { + /* Load the next 64-byte segment */ + __m512i bytes = *p++; + + /* Multiply the bytes by 64...1 (the number of times they need + * to be added to s2) and add adjacent products */ + __v32hi sums = (__v32hi)_mm512_maddubs_epi16( + bytes, (__m512i)multipliers); + + /* Keep sum of all previous s1 counters, for adding to s2 later. + * This allows delaying the multiplication by 64 to the end. */ + v_s1_sums += v_s1; + + /* Add the sum of each group of 8 bytes to the corresponding s1 + * counter */ + v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes); + + /* Add the sum of each group of 4 products of the bytes by + * 64...1 to the corresponding s2 counter */ + v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums, + (__m512i)ones); + } while (p != end); + + /* Finish the s2 counters by adding the sum of the s1 values at the + * beginning of each segment, multiplied by the segment size (64) */ + v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6); + + /* Add the counters to the real s1 and s2 */ + ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* AVX-512BW implementation */ + +/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */ +#undef DISPATCH_AVX2 +#if !defined(DEFAULT_IMPL) && \ + (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS)) +# define FUNCNAME adler32_avx2 +# define FUNCNAME_CHUNK adler32_avx2_chunk +# define IMPL_ALIGNMENT 32 +# define IMPL_SEGMENT_SIZE 32 +# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE +# ifdef __AVX2__ +# define ATTRIBUTES +# define DEFAULT_IMPL adler32_avx2 +# else +# define ATTRIBUTES __attribute__((target("avx2"))) +# define DISPATCH 1 +# define DISPATCH_AVX2 1 +# endif +# include +static forceinline ATTRIBUTES void +adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2) +{ + const __m256i zeroes = _mm256_setzero_si256(); + const __v32qu multipliers = (__v32qu){ + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const __v16hu ones = (__v16hu)_mm256_set1_epi16(1); + __v8su v_s1 = (__v8su)zeroes; + __v8su v_s1_sums = (__v8su)zeroes; + __v8su v_s2 = (__v8su)zeroes; + + do { + /* Load the next 32-byte segment */ + __m256i bytes = *p++; + + /* Multiply the bytes by 32...1 (the number of times they need + * to be added to s2) and add adjacent products */ + __v16hu sums = (__v16hu)_mm256_maddubs_epi16( + bytes, (__m256i)multipliers); + + /* Keep sum of all previous s1 counters, for adding to s2 later. + * This allows delaying the multiplication by 32 to the end. */ + v_s1_sums += v_s1; + + /* Add the sum of each group of 8 bytes to the corresponding s1 + * counter */ + v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes); + + /* Add the sum of each group of 4 products of the bytes by + * 32...1 to the corresponding s2 counter */ + v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones); + } while (p != end); + + /* Finish the s2 counters by adding the sum of the s1 values at the + * beginning of each segment, multiplied by the segment size (32) */ + v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5); + + /* Add the counters to the real s1 and s2 */ + ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* AVX2 implementation */ + +/* SSE2 implementation */ +#undef DISPATCH_SSE2 +#if !defined(DEFAULT_IMPL) && \ + (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS)) +# define FUNCNAME adler32_sse2 +# define FUNCNAME_CHUNK adler32_sse2_chunk +# define IMPL_ALIGNMENT 16 +# define IMPL_SEGMENT_SIZE 32 +/* + * The 16-bit precision byte counters must not be allowed to undergo *signed* + * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16) + * would behave incorrectly. + */ +# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF)) +# ifdef __SSE2__ +# define ATTRIBUTES +# define DEFAULT_IMPL adler32_sse2 +# else +# define ATTRIBUTES __attribute__((target("sse2"))) +# define DISPATCH 1 +# define DISPATCH_SSE2 1 +# endif +# include +static forceinline ATTRIBUTES void +adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2) +{ + const __m128i zeroes = _mm_setzero_si128(); + + /* s1 counters: 32-bit, sum of bytes */ + __v4su v_s1 = (__v4su)zeroes; + + /* s2 counters: 32-bit, sum of s1 values */ + __v4su v_s2 = (__v4su)zeroes; + + /* + * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes + * that eventually need to be multiplied by a number 32...1 for addition + * into s2. + */ + __v8hu v_byte_sums_a = (__v8hu)zeroes; + __v8hu v_byte_sums_b = (__v8hu)zeroes; + __v8hu v_byte_sums_c = (__v8hu)zeroes; + __v8hu v_byte_sums_d = (__v8hu)zeroes; + + do { + /* Load the next 32 bytes */ + const __m128i bytes1 = *p++; + const __m128i bytes2 = *p++; + + /* + * Accumulate the previous s1 counters into the s2 counters. + * Logically, this really should be v_s2 += v_s1 * 32, but we + * can do the multiplication (or left shift) later. + */ + v_s2 += v_s1; + + /* + * s1 update: use "Packed Sum of Absolute Differences" to add + * the bytes horizontally with 8 bytes per sum. Then add the + * sums to the s1 counters. + */ + v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes); + v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes); + + /* + * Also accumulate the bytes into 32 separate counters that have + * 16-bit precision. + */ + v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes); + v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes); + v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes); + v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes); + + } while (p != end); + + /* Finish calculating the s2 counters */ + v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5); + v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a, + (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 }); + v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b, + (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 }); + v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c, + (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 }); + v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d, + (__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 }); + + /* Add the counters to the real s1 and s2 */ + ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); +} +# include "../adler32_vec_template.h" +#endif /* SSE2 implementation */ + +#ifdef DISPATCH +static inline adler32_func_t +arch_select_adler32_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_AVX512BW + if (features & X86_CPU_FEATURE_AVX512BW) + return adler32_avx512bw; +#endif +#ifdef DISPATCH_AVX2 + if (features & X86_CPU_FEATURE_AVX2) + return adler32_avx2; +#endif +#ifdef DISPATCH_SSE2 + if (features & X86_CPU_FEATURE_SSE2) + return adler32_sse2; +#endif + return NULL; +} +#endif /* DISPATCH */ + +#endif /* LIB_X86_ADLER32_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/x86/cpu_features.c b/lib/gdeflate/libdeflate/lib/x86/cpu_features.c new file mode 100644 index 0000000..e3471d4 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/cpu_features.c @@ -0,0 +1,152 @@ +/* + * x86/cpu_features.c - feature detection for x86 processors + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "../cpu_features_common.h" /* must be included first */ +#include "cpu_features.h" + +#if X86_CPU_FEATURES_ENABLED + +volatile u32 _cpu_features = 0; + +/* With old GCC versions we have to manually save and restore the x86_32 PIC + * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ +#if defined(__i386__) && defined(__PIC__) +# define EBX_CONSTRAINT "=&r" +#else +# define EBX_CONSTRAINT "=b" +#endif + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ + __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ + u32 edx, eax; + + /* Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. */ + __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); + + return ((u64)edx << 32) | eax; +} + +#undef BIT +#define BIT(nr) (1UL << (nr)) + +#define XCR0_BIT_SSE BIT(1) +#define XCR0_BIT_AVX BIT(2) +#define XCR0_BIT_OPMASK BIT(5) +#define XCR0_BIT_ZMM_HI256 BIT(6) +#define XCR0_BIT_HI16_ZMM BIT(7) + +#define IS_SET(reg, nr) ((reg) & BIT(nr)) +#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) + +static const struct cpu_feature x86_cpu_feature_table[] = { + {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_PCLMUL, "pclmul"}, + {X86_CPU_FEATURE_AVX, "avx"}, + {X86_CPU_FEATURE_AVX2, "avx2"}, + {X86_CPU_FEATURE_BMI2, "bmi2"}, + {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, +}; + +/* Initialize _cpu_features with bits for interesting processor features. */ +void setup_cpu_features(void) +{ + u32 features = 0; + u32 dummy1, dummy2, dummy3, dummy4; + u32 max_function; + u32 features_1, features_2, features_3, features_4; + bool os_avx_support = false; + bool os_avx512_support = false; + + /* Get maximum supported function */ + cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); + if (max_function < 1) + goto out; + + /* Standard feature flags */ + cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); + + if (IS_SET(features_1, 26)) + features |= X86_CPU_FEATURE_SSE2; + + if (IS_SET(features_2, 1)) + features |= X86_CPU_FEATURE_PCLMUL; + + if (IS_SET(features_2, 27)) { /* OSXSAVE set? */ + u64 xcr0 = read_xcr(0); + + os_avx_support = IS_ALL_SET(xcr0, + XCR0_BIT_SSE | + XCR0_BIT_AVX); + + os_avx512_support = IS_ALL_SET(xcr0, + XCR0_BIT_SSE | + XCR0_BIT_AVX | + XCR0_BIT_OPMASK | + XCR0_BIT_ZMM_HI256 | + XCR0_BIT_HI16_ZMM); + } + + if (os_avx_support && IS_SET(features_2, 28)) + features |= X86_CPU_FEATURE_AVX; + + if (max_function < 7) + goto out; + + /* Extended feature flags */ + cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); + + if (os_avx_support && IS_SET(features_3, 5)) + features |= X86_CPU_FEATURE_AVX2; + + if (IS_SET(features_3, 8)) + features |= X86_CPU_FEATURE_BMI2; + + if (os_avx512_support && IS_SET(features_3, 30)) + features |= X86_CPU_FEATURE_AVX512BW; + +out: + disable_cpu_features_for_testing(&features, x86_cpu_feature_table, + ARRAY_LEN(x86_cpu_feature_table)); + + _cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* X86_CPU_FEATURES_ENABLED */ diff --git a/lib/gdeflate/libdeflate/lib/x86/cpu_features.h b/lib/gdeflate/libdeflate/lib/x86/cpu_features.h new file mode 100644 index 0000000..4c02353 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/cpu_features.h @@ -0,0 +1,41 @@ +/* + * x86/cpu_features.h - feature detection for x86 processors + */ + +#ifndef LIB_X86_CPU_FEATURES_H +#define LIB_X86_CPU_FEATURES_H + +#include "../lib_common.h" + +#if (defined(__i386__) || defined(__x86_64__)) && \ + COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +# define X86_CPU_FEATURES_ENABLED 1 +#else +# define X86_CPU_FEATURES_ENABLED 0 +#endif + +#if X86_CPU_FEATURES_ENABLED + +#define X86_CPU_FEATURE_SSE2 0x00000001 +#define X86_CPU_FEATURE_PCLMUL 0x00000002 +#define X86_CPU_FEATURE_AVX 0x00000004 +#define X86_CPU_FEATURE_AVX2 0x00000008 +#define X86_CPU_FEATURE_BMI2 0x00000010 +#define X86_CPU_FEATURE_AVX512BW 0x00000020 + +#define X86_CPU_FEATURES_KNOWN 0x80000000 + +extern volatile u32 _cpu_features; + +void setup_cpu_features(void); + +static inline u32 get_cpu_features(void) +{ + if (_cpu_features == 0) + setup_cpu_features(); + return _cpu_features; +} + +#endif /* X86_CPU_FEATURES_ENABLED */ + +#endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/lib/gdeflate/libdeflate/lib/x86/crc32_impl.h b/lib/gdeflate/libdeflate/lib/x86/crc32_impl.h new file mode 100644 index 0000000..14a6867 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/crc32_impl.h @@ -0,0 +1,92 @@ +/* + * x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_CRC32_IMPL_H +#define LIB_X86_CRC32_IMPL_H + +#include "cpu_features.h" + +/* + * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32 + * function doesn't use any AVX intrinsics specifically, it can benefit a lot + * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100 + * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled + * in the newer three-operand form rather than the older two-operand form. + * + * Note: this is only needed if __AVX__ is *not* defined, since otherwise the + * "regular" PCLMUL implementation would already be AVX enabled. + */ +#undef DISPATCH_PCLMUL_AVX +#if !defined(DEFAULT_IMPL) && !defined(__AVX__) && \ + X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET && \ + (defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS) +# define FUNCNAME crc32_pclmul_avx +# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned +# define ATTRIBUTES __attribute__((target("pclmul,avx"))) +# define DISPATCH 1 +# define DISPATCH_PCLMUL_AVX 1 +# include "crc32_pclmul_template.h" +#endif + +/* PCLMUL implementation */ +#undef DISPATCH_PCLMUL +#if !defined(DEFAULT_IMPL) && \ + (defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)) +# define FUNCNAME crc32_pclmul +# define FUNCNAME_ALIGNED crc32_pclmul_aligned +# ifdef __PCLMUL__ +# define ATTRIBUTES +# define DEFAULT_IMPL crc32_pclmul +# else +# define ATTRIBUTES __attribute__((target("pclmul"))) +# define DISPATCH 1 +# define DISPATCH_PCLMUL 1 +# endif +# include "crc32_pclmul_template.h" +#endif + +#ifdef DISPATCH +static inline crc32_func_t +arch_select_crc32_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_PCLMUL_AVX + if ((features & X86_CPU_FEATURE_PCLMUL) && + (features & X86_CPU_FEATURE_AVX)) + return crc32_pclmul_avx; +#endif +#ifdef DISPATCH_PCLMUL + if (features & X86_CPU_FEATURE_PCLMUL) + return crc32_pclmul; +#endif + return NULL; +} +#endif /* DISPATCH */ + +#endif /* LIB_X86_CRC32_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h b/lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h new file mode 100644 index 0000000..a5eda9b --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h @@ -0,0 +1,262 @@ +/* + * x86/crc32_pclmul_template.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +/* + * CRC-32 folding with PCLMULQDQ. + * + * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits, + * producing an abbreviated message which is congruent the original message + * modulo the generator polynomial G(x). + * + * Folding each 512 bits is implemented as eight 64-bit folds, each of which + * uses one carryless multiplication instruction. It's expected that CPUs may + * be able to execute some of these multiplications in parallel. + * + * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x) + * be 95 bits from a constant distance D later in the message. The relevant + * portion of the message can be written as: + * + * M(x) = A(x)*x^D + B(x) + * + * ... where + and * represent addition and multiplication, respectively, of + * polynomials over GF(2). Note that when implemented on a computer, these + * operations are equivalent to XOR and carryless multiplication, respectively. + * + * For the purpose of CRC calculation, only the remainder modulo the generator + * polynomial G(x) matters: + * + * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x) + * + * Since the modulo operation can be applied anywhere in a sequence of additions + * and multiplications without affecting the result, this is equivalent to: + * + * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x) + * + * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e. + * a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless + * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit + * product. Then, adding (XOR-ing) the product to B(x) produces a polynomial + * with the same length as B(x) but with the same remainder as 'A(x)*x^D + + * B(x)'. This is the basic fold operation with 64 bits. + * + * Note that the carryless multiplication instruction PCLMULQDQ actually takes + * two 64-bit inputs and produces a 127-bit product in the low-order bits of a + * 128-bit XMM register. This works fine, but care must be taken to account for + * "bit endianness". With the CRC version implemented here, bits are always + * ordered such that the lowest-order bit represents the coefficient of highest + * power of x and the highest-order bit represents the coefficient of the lowest + * power of x. This is backwards from the more intuitive order. Still, + * carryless multiplication works essentially the same either way. It just must + * be accounted for that when we XOR the 95-bit product in the low-order 95 bits + * of a 128-bit XMM register into 128-bits of later data held in another XMM + * register, we'll really be XOR-ing the product into the mathematically higher + * degree end of those later bits, not the lower degree end as may be expected. + * + * So given that caveat and the fact that we process 512 bits per iteration, the + * 'D' values we need for the two 64-bit halves of each 128 bits of data are: + * + * D = (512 + 95) - 64 for the higher-degree half of each 128 bits, + * i.e. the lower order bits in the XMM register + * + * D = (512 + 95) - 128 for the lower-degree half of each 128 bits, + * i.e. the higher order bits in the XMM register + * + * The required 'x^D mod G(x)' values were precomputed. + * + * When <= 512 bits remain in the message, we finish up by folding across + * smaller distances. This works similarly; the distance D is just different, + * so different constant multipliers must be used. Finally, once the remaining + * message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction + * (explained later). + * + * For more information see the original paper from Intel: + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * December 2009 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + */ +static u32 ATTRIBUTES +FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs) +{ + /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */ + const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 }; + const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 }; + const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E }; + const __v2di final_multiplier = (__v2di){ 0xB8BC6765 }; + const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; + const __v2di barrett_reduction_constants = + (__v2di){ 0x00000001F7011641, 0x00000001DB710641 }; + + const __m128i * const end = p + nr_segs; + const __m128i * const end512 = p + (nr_segs & ~3); + __m128i x0, x1, x2, x3; + + /* + * Account for the current 'remainder', i.e. the CRC of the part of the + * message already processed. Explanation: rewrite the message + * polynomial M(x) in terms of the first part A(x), the second part + * B(x), and the length of the second part in bits |B(x)| >= 32: + * + * M(x) = A(x)*x^|B(x)| + B(x) + * + * Then the CRC of M(x) is: + * + * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) + * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) + * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) + * + * Note: all arithmetic is modulo G(x), the generator polynomial; that's + * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). + * + * So the CRC of the full message is the CRC of the second part of the + * message where the first 32 bits of the second part of the message + * have been XOR'ed with the CRC of the first part of the message. + */ + x0 = *p++; + x0 ^= (__m128i)(__v4si){ remainder }; + + if (p > end512) /* only 128, 256, or 384 bits of input? */ + goto _128_bits_at_a_time; + x1 = *p++; + x2 = *p++; + x3 = *p++; + + /* Fold 512 bits at a time */ + for (; p != end512; p += 4) { + __m128i y0, y1, y2, y3; + + y0 = p[0]; + y1 = p[1]; + y2 = p[2]; + y3 = p[3]; + + /* + * Note: the immediate constant for PCLMULQDQ specifies which + * 64-bit halves of the 128-bit vectors to multiply: + * + * 0x00 means low halves (higher degree polynomial terms for us) + * 0x11 means high halves (lower degree polynomial terms for us) + */ + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00); + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11); + + x0 = y0; + x1 = y1; + x2 = y2; + x3 = y3; + } + + /* Fold 512 bits => 128 bits */ + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00); + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11); + x0 = x3; + +_128_bits_at_a_time: + while (p != end) { + /* Fold 128 bits into next 128 bits */ + x1 = *p++; + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00); + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11); + x0 = x1; + } + + /* Now there are just 128 bits left, stored in 'x0'. */ + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). + */ + x0 = _mm_srli_si128(x0, 8) ^ + _mm_clmulepi64_si128(x0, multipliers_1, 0x10); + + /* Fold 96 => 64 bits */ + x0 = _mm_srli_si128(x0, 4) ^ + _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00); + + /* + * Finally, reduce 64 => 32 bits using Barrett reduction. + * + * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to + * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): + * + * R(x) = (A(x)*x^32 + B(x)) mod G(x) + * = (A(x)*x^32) mod G(x) + B(x) + * + * Then, by the Division Algorithm there exists a unique q(x) such that: + * + * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) + * + * Since the left-hand side is of maximum degree 31, the right-hand side + * must be too. This implies that we can apply 'mod x^32' to the + * right-hand side without changing its value: + * + * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 + * + * Note that '+' is equivalent to '-' in polynomials over GF(2). + * + * We also know that: + * + * / A(x)*x^32 \ + * q(x) = floor ( --------- ) + * \ G(x) / + * + * To compute this efficiently, we can multiply the top and bottom by + * x^32 and move the division by G(x) to the top: + * + * / A(x) * floor(x^64 / G(x)) \ + * q(x) = floor ( ------------------------- ) + * \ x^32 / + * + * Note that floor(x^64 / G(x)) is a constant. + * + * So finally we have: + * + * / A(x) * floor(x^64 / G(x)) \ + * R(x) = B(x) + G(x)*floor ( ------------------------- ) + * \ x^32 / + */ + x1 = x0; + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00); + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10); + return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4)); +} + +#define IMPL_ALIGNMENT 16 +#define IMPL_SEGMENT_SIZE 16 +#include "../crc32_vec_template.h" diff --git a/lib/gdeflate/libdeflate/lib/x86/decompress_impl.h b/lib/gdeflate/libdeflate/lib/x86/decompress_impl.h new file mode 100644 index 0000000..dfe1a79 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/decompress_impl.h @@ -0,0 +1,35 @@ +#ifndef LIB_X86_DECOMPRESS_IMPL_H +#define LIB_X86_DECOMPRESS_IMPL_H + +#include "cpu_features.h" + +/* Include the BMI2-optimized version? */ +#undef DISPATCH_BMI2 +#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \ + COMPILER_SUPPORTS_BMI2_TARGET +# define FUNCNAME deflate_decompress_bmi2 +# define ATTRIBUTES __attribute__((target("bmi2"))) +# define DISPATCH 1 +# define DISPATCH_BMI2 1 +#ifdef GDEFLATE +# include "../gdeflate_decompress_template.h" +#else +# include "../decompress_template.h" +#endif +#endif + +#ifdef DISPATCH +static inline decompress_func_t +arch_select_decompress_func(void) +{ + u32 features = get_cpu_features(); + +#ifdef DISPATCH_BMI2 + if (features & X86_CPU_FEATURE_BMI2) + return deflate_decompress_bmi2; +#endif + return NULL; +} +#endif /* DISPATCH */ + +#endif /* LIB_X86_DECOMPRESS_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h b/lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h new file mode 100644 index 0000000..99fbebe --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h @@ -0,0 +1,122 @@ +/* + * x86/matchfinder_impl.h - x86 implementations of matchfinder functions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_MATCHFINDER_IMPL_H +#define LIB_X86_MATCHFINDER_IMPL_H + +#ifdef __AVX2__ +# include +static forceinline void +matchfinder_init_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_avx2 + +static forceinline void +matchfinder_rebase_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm256_adds_epi16(p[0], v); + p[1] = _mm256_adds_epi16(p[1], v); + p[2] = _mm256_adds_epi16(p[2], v); + p[3] = _mm256_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_avx2 + +#elif defined(__SSE2__) +# include +static forceinline void +matchfinder_init_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_sse2 + +static forceinline void +matchfinder_rebase_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm_adds_epi16(p[0], v); + p[1] = _mm_adds_epi16(p[1], v); + p[2] = _mm_adds_epi16(p[2], v); + p[3] = _mm_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_sse2 +#endif /* __SSE2__ */ + +#endif /* LIB_X86_MATCHFINDER_IMPL_H */ diff --git a/lib/gdeflate/libdeflate/lib/zlib_compress.c b/lib/gdeflate/libdeflate/lib/zlib_compress.c new file mode 100644 index 0000000..ab00751 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/zlib_compress.c @@ -0,0 +1,87 @@ +/* + * zlib_compress.c - compress with a zlib wrapper + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "deflate_compress.h" +#include "unaligned.h" +#include "zlib_constants.h" + +#include "libdeflate.h" + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_zlib_compress(struct libdeflate_compressor *c, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) +{ + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = deflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_be16(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); + out_next += 4; + + return out_next - (u8 *)out; +} + +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, + size_t in_nbytes) +{ + return ZLIB_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); +} diff --git a/lib/gdeflate/libdeflate/lib/zlib_constants.h b/lib/gdeflate/libdeflate/lib/zlib_constants.h new file mode 100644 index 0000000..f304310 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/zlib_constants.h @@ -0,0 +1,21 @@ +/* + * zlib_constants.h - constants for the zlib wrapper format + */ + +#ifndef LIB_ZLIB_CONSTANTS_H +#define LIB_ZLIB_CONSTANTS_H + +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) + +#define ZLIB_CM_DEFLATE 8 + +#define ZLIB_CINFO_32K_WINDOW 7 + +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 + +#endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/lib/gdeflate/libdeflate/lib/zlib_decompress.c b/lib/gdeflate/libdeflate/lib/zlib_decompress.c new file mode 100644 index 0000000..0f6c714 --- /dev/null +++ b/lib/gdeflate/libdeflate/lib/zlib_decompress.c @@ -0,0 +1,108 @@ +/* + * zlib_decompress.c - decompress with a zlib wrapper + * + * Originally public domain; changes after 2016-09-07 are copyrighted. + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "unaligned.h" +#include "zlib_constants.h" + +#include "libdeflate.h" + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u16 hdr; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < ZLIB_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* 2 byte header: CMF and FLG */ + hdr = get_unaligned_be16(in_next); + in_next += 2; + + /* FCHECK */ + if ((hdr % 31) != 0) + return LIBDEFLATE_BAD_DATA; + + /* CM */ + if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + + /* CINFO */ + if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) + return LIBDEFLATE_BAD_DATA; + + /* FDICT */ + if ((hdr >> 5) & 1) + return LIBDEFLATE_BAD_DATA; + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - ZLIB_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* ADLER32 */ + if (libdeflate_adler32(1, out, actual_out_nbytes) != + get_unaligned_be32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; +} + +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_zlib_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_zlib_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} diff --git a/lib/gdeflate/libdeflate/libdeflate.h b/lib/gdeflate/libdeflate/libdeflate.h new file mode 100644 index 0000000..284ffa2 --- /dev/null +++ b/lib/gdeflate/libdeflate/libdeflate.h @@ -0,0 +1,540 @@ +/* + * libdeflate.h - public header for libdeflate + */ + +#ifndef LIBDEFLATE_H +#define LIBDEFLATE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBDEFLATE_VERSION_MAJOR 1 +#define LIBDEFLATE_VERSION_MINOR 8 +#define LIBDEFLATE_VERSION_STRING "1.8" + +#include +#include + +/* + * On Windows, if you want to link to the DLL version of libdeflate, then + * #define LIBDEFLATE_DLL. Note that the calling convention is "stdcall". + */ +#ifdef LIBDEFLATE_DLL +# ifdef BUILDING_LIBDEFLATE +# define LIBDEFLATEEXPORT LIBEXPORT +# elif defined(_WIN32) || defined(__CYGWIN__) +# define LIBDEFLATEEXPORT __declspec(dllimport) +# endif +#endif +#ifndef LIBDEFLATEEXPORT +# define LIBDEFLATEEXPORT +#endif + +#if defined(_WIN32) && !defined(_WIN64) +# define LIBDEFLATEAPI_ABI __stdcall +#else +# define LIBDEFLATEAPI_ABI +#endif + +#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \ + defined(_WIN32) && !defined(_WIN64) + /* + * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4. + * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX + * code when called from an MSVC-compiled application. + */ +# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer)) +#else +# define LIBDEFLATEAPI_STACKALIGN +#endif + +#define LIBDEFLATEAPI LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_compressor; + +/* + * libdeflate_alloc_compressor() allocates a new compressor that supports + * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). + * + * Note: for compression, the sliding window size is defined at compilation time + * to 32768, the largest size permissible in the DEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI +libdeflate_alloc_compressor(int compression_level); + +/* + * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of + * data. The function attempts to compress 'in_nbytes' bytes of data located at + * 'in' and write the results to 'out', which has space for 'out_nbytes_avail' + * bytes. The return value is the compressed size in bytes, or 0 if the data + * could not be compressed to 'out_nbytes_avail' bytes or fewer. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_deflate_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the + * number of bytes of compressed data that may be produced by compressing any + * buffer of length less than or equal to 'in_nbytes' using + * libdeflate_deflate_compress() with the specified compressor. Mathematically, + * this bound will necessarily be a number greater than or equal to 'in_nbytes'. + * It may be an overestimate of the true upper bound. The return value is + * guaranteed to be the same for all invocations with the same compressor and + * same 'in_nbytes'. + * + * As a special case, 'compressor' may be NULL. This causes the bound to be + * taken across *any* libdeflate_compressor that could ever be allocated with + * this build of the library, with any options. + * + * Note that this function is not necessary in many applications. With + * block-based compression, it is usually preferable to separately store the + * uncompressed size of each block and to store any blocks that did not compress + * to less than their original size uncompressed. In that scenario, there is no + * need to know the worst-case compressed size, since the maximum number of + * bytes of compressed data that may be used would always be one less than the + * input length. You can just pass a buffer of that size to + * libdeflate_deflate_compress() and store the data uncompressed if + * libdeflate_deflate_compress() returns 0, indicating that the compressed data + * did not fit into the provided output buffer. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper + * format. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_zlib_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_zlib_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper + * format. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gzip_compress(struct libdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail); + +/* + * Like libdeflate_deflate_compress_bound(), but assumes the data will be + * compressed with libdeflate_gzip_compress() rather than with + * libdeflate_deflate_compress(). + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor, + size_t in_nbytes); + +/* + * libdeflate_free_compressor() frees a compressor that was allocated with + * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is + * taken. + */ +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_compressor(struct libdeflate_compressor *compressor); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_decompressor; + +/* + * libdeflate_alloc_decompressor() allocates a new decompressor that can be used + * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI +libdeflate_alloc_decompressor(void); + +/* + * Result of a call to libdeflate_deflate_decompress(), + * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress(). + */ +enum libdeflate_result { + /* Decompression was successful. */ + LIBDEFLATE_SUCCESS = 0, + + /* Decompressed failed because the compressed data was invalid, corrupt, + * or otherwise unsupported. */ + LIBDEFLATE_BAD_DATA = 1, + + /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have + * decompressed to fewer than 'out_nbytes_avail' bytes. */ + LIBDEFLATE_SHORT_OUTPUT = 2, + + /* The data would have decompressed to more than 'out_nbytes_avail' + * bytes. */ + LIBDEFLATE_INSUFFICIENT_SPACE = 3, +}; + +/* + * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream + * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The + * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' + * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. + * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If + * a nonzero result code is returned, then the contents of the output buffer are + * undefined. + * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * + * libdeflate_deflate_decompress() can be used in cases where the actual + * uncompressed size is known (recommended) or unknown (not recommended): + * + * - If the actual uncompressed size is known, then pass the actual + * uncompressed size as 'out_nbytes_avail' and pass NULL for + * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail + * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the + * specified number of bytes. + * + * - If the actual uncompressed size is unknown, then provide a non-NULL + * 'actual_out_nbytes_ret' and provide a buffer with some size + * 'out_nbytes_avail' that you think is large enough to hold all the + * uncompressed data. In this case, if the data decompresses to less than + * or equal to 'out_nbytes_avail' bytes, then + * libdeflate_deflate_decompress() will write the actual uncompressed size + * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, + * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was + * not large enough but no other problems were encountered, or another + * nonzero result code if decompression failed for another reason. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format + * instead of raw DEFLATE. + * + * Decompression will stop at the end of the zlib stream, even if it is shorter + * than 'in_nbytes'. If you need to know exactly where the zlib stream ended, + * use libdeflate_zlib_decompress_ex(). + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first zlib-compressed stream in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format + * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + +/* + * libdeflate_free_decompressor() frees a decompressor that was allocated with + * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action + * is taken. + */ +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); + +/* ========================================================================== */ +/* Checksums */ +/* ========================================================================== */ + +/* + * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of + * data and returns the updated checksum. When starting a new checksum, the + * required initial value for 'adler' is 1. This value is also returned when + * 'buffer' is specified as NULL. + */ +LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); + + +/* + * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data + * and returns the updated checksum. When starting a new checksum, the required + * initial value for 'crc' is 0. This value is also returned when 'buffer' is + * specified as NULL. + */ +LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI +libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); + +/* ========================================================================== */ +/* Custom memory allocator */ +/* ========================================================================== */ + +/* + * Install a custom memory allocator which libdeflate will use for all memory + * allocations. 'malloc_func' is a function that must behave like malloc(), and + * 'free_func' is a function that must behave like free(). + * + * There must not be any libdeflate_compressor or libdeflate_decompressor + * structures in existence when calling this function. + */ +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_set_memory_allocator(void *(*malloc_func)(size_t), + void (*free_func)(void *)); + +/* ========================================================================== */ +/* NVIDIA GDEFLATE-related API */ +/* ========================================================================== */ + +/* ========================================================================== */ +/* Compression */ +/* ========================================================================== */ + +struct libdeflate_gdeflate_compressor; + +struct libdeflate_gdeflate_out_page { + /* Buffer for compressed GDEFLATE page data. */ + void *data; + /* Buffer size in bytes. If compression succeeded this field will + * contain the size of compressed GDEFLATE page. + */ + size_t nbytes; +}; + +struct libdeflate_gdeflate_in_page { + /* Compressed GDEFLATE page data. */ + const void* data; + /* Size in bytes of compressed GDEFLATE page. */ + size_t nbytes; +}; + +/* + * libdeflate_alloc_gdeflate_compressor() allocates a new compressor that + * supports GDEFLATE compression. 'compression_level' is the compression + * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = + * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means + * "no compression", specifically "create a valid stream, but only emit + * uncompressed blocks" (this will expand the data slightly). + * + * The return value is a pointer to the new compressor, or NULL if out of memory + * or if the compression level is invalid (i.e. outside the range [0, 12]). + * + * Note: for compression, the sliding window size is defined at compilation time + * to 65536, the largest size permissible in the GDEFLATE format. It cannot be + * changed at runtime. + * + * A single compressor is not safe to use by multiple threads concurrently. + * However, different threads may use different compressors concurrently. + */ +LIBDEFLATEEXPORT struct libdeflate_gdeflate_compressor * LIBDEFLATEAPI +libdeflate_alloc_gdeflate_compressor(int compression_level); + +/* + * libdeflate_gdeflate_compress() performs raw GDEFLATE compression on a buffer + * of data. The function attempts to compress 'in_nbytes' bytes of data located + * at 'in' and writes page results to 'out_pages', which has preallocated 'data' + * with 'nbytes' space for each page. To determine the number of pages + * 'out_npages' the input data will be split into and the upper bound on + * compressed data use the libdeflate_gdeflate_compress_bound() function. The + * return value is the compressed size in bytes, or 0 if the data could not be + * compressed. If compression succeeded the size of each compressed page will + * be written to 'nbytes' field of 'out_pages'. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress(struct libdeflate_gdeflate_compressor *compressor, + const void *in, size_t in_nbytes, + struct libdeflate_gdeflate_out_page *out_pages, + size_t out_npages); + +/* + * libdeflate_gdeflate_compress_ex() performs raw GDEFLATE compression on + * set of pages. The function attempts to compress 'npages' from 'in_pages' + * and writes page results to 'out_pages', which has preallocated 'data' + * with 'nbytes' space for each page. To determine the number of pages + * 'out_npages' the input data will be split into and the upper bound on + * compressed data use the libdeflate_gdeflate_compress_bound() function. The + * return value is the number of processed pages. If compression succeeded + * the size of each compressed page will be written to 'nbytes' field of 'out_pages'. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress_ex(struct libdeflate_gdeflate_compressor *compressor, + const struct libdeflate_gdeflate_in_page* in_pages, + struct libdeflate_gdeflate_out_page* out_pages, size_t npages); + +/* + * libdeflate_gdeflate_compress_bound() returns a worst-case upper bound on the + * number of bytes of compressed data that may be produced by compressing any + * buffer of length less than or equal to 'in_nbytes' using + * libdeflate_gdeflate_compress() with the specified compressor. Mathematically, + * this bound will necessarily be a number greater than or equal to 'in_nbytes'. + * It may be an overestimate of the true upper bound. The return value is + * guaranteed to be the same for all invocations with the same compressor and + * same 'in_nbytes'. The 'out_npages' will contain the number of GDEFLATE pages + * the input data will be split into. This number should be used to preallocate + * the page array for libdeflate_gdeflate_compress(). The upper bound on the + * number of compressed data in a page can be found by dividing the function + * result by the 'out_npages' value. + * + * As a special case, 'compressor' may be NULL. This causes the bound to be + * taken across *any* libdeflate_compressor that could ever be allocated with + * this build of the library, with any options. + */ +LIBDEFLATEEXPORT size_t LIBDEFLATEAPI +libdeflate_gdeflate_compress_bound(struct libdeflate_gdeflate_compressor *comp, + size_t in_nbytes, size_t *out_npages); + +/* + * libdeflate_free_gdeflate_compressor() frees a compressor that was allocated + * with libdeflate_alloc_gdeflate_compressor(). If a NULL pointer is passed in, + * no action is taken. + */ +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_gdeflate_compressor(struct libdeflate_gdeflate_compressor *comp); + +/* ========================================================================== */ +/* Decompression */ +/* ========================================================================== */ + +struct libdeflate_gdeflate_decompressor; + +/* + * libdeflate_alloc_gdeflate_decompressor() allocates a new decompressor that + * can be used for GDEFLATE decompression. The return value is a pointer to + * the new decompressor, or NULL if out of memory. + * + * This function takes no parameters, and the returned decompressor is valid for + * decompressing data that was compressed at any compression level and with any + * sliding window size. + * + * A single decompressor is not safe to use by multiple threads concurrently. + * However, different threads may use different decompressors concurrently. + */ +LIBDEFLATEEXPORT struct libdeflate_gdeflate_decompressor * LIBDEFLATEAPI +libdeflate_alloc_gdeflate_decompressor(void); + +/* + * libdeflate_gdeflate_decompress() decompresses the GDEFLATE-compressed pages + * from the 'in_pages' array with 'in_npages' members. The uncompressed data is + * written to 'out', a buffer with size 'out_nbytes_avail' bytes. + * If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. + * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If + * a nonzero result code is returned, then the contents of the output buffer are + * undefined. + * + * libdeflate_gdeflate_decompress() can be used only in cases where the actual + * uncompressed size is known. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gdeflate_decompress(struct libdeflate_gdeflate_decompressor *decomp, + struct libdeflate_gdeflate_in_page *in_pages, + size_t in_npages, void *out, + size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret); + +/* + * libdeflate_gdeflate_decompress_ex() decompresses the GDEFLATE-compressed pages + * from the 'in_pages' array with 'npages' members. The uncompressed data is + * written to 'out_pages'. + * If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. + * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If + * a nonzero result code is returned, then the contents of the output buffer are + * undefined. + * + * libdeflate_gdeflate_decompress_ex() can be used only in cases where the actual + * uncompressed size is known. + */ +LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI +libdeflate_gdeflate_decompress_ex(struct libdeflate_gdeflate_decompressor *decomp, + struct libdeflate_gdeflate_in_page *in_pages, + struct libdeflate_gdeflate_out_page* out_pages, size_t npages); + +/* + * libdeflate_free_gdeflate_decompressor() frees a decompressor that was + * allocated with libdeflate_alloc_gdeflate_decompressor(). If a NULL pointer + * is passed in, no action is taken. + */ +LIBDEFLATEEXPORT void LIBDEFLATEAPI +libdeflate_free_gdeflate_decompressor(struct libdeflate_gdeflate_decompressor *decomp); + +#ifdef __cplusplus +} +#endif + +#endif /* LIBDEFLATE_H */ diff --git a/project.cpp b/project.cpp index a97f7b4..fdb8efa 100644 --- a/project.cpp +++ b/project.cpp @@ -4,29 +4,76 @@ namespace fs = std::filesystem; using namespace Crafter; extern "C" Configuration CrafterBuildProject(std::span args) { - GitProjectSpec mathSpec{ - .source = { .url = "https://forgejo.catcrafts.net/Catcrafts/Crafter.Math.git" }, - .args = std::vector(args.begin(), args.end()), - }; - Configuration* math = GitProject(mathSpec); + std::vector depArgs(args.begin(), args.end()); + + bool useLocal = false; + for (std::string_view a : args) { + if (a == "--local") { useLocal = true; break; } + } + + Configuration* math = useLocal + ? LocalProject({ + .projectFile = "../Crafter.Math/project.cpp", + .args = depArgs, + }) + : GitProject({ + .source = { .url = "https://forgejo.catcrafts.net/Catcrafts/Crafter.Math.git" }, + .args = depArgs, + }); Configuration cfg; cfg.path = "./"; cfg.name = "Crafter.Asset"; + // Default to library so consumers (Crafter.Graphics, examples) link against + // libCrafter.Asset.a — the Compression module's non-template entry points + // need to be in a real archive, not just PCM-inline. `--exe` (via the + // example's main.cpp) flips us to Executable to produce crafter-asset. + cfg.type = ConfigurationType::LibraryStatic; ApplyStandardArgs(cfg, args); + bool wantExe = false; + for (std::string_view a : args) { + if (a == "--exe") { wantExe = true; break; } + } + if (wantExe) cfg.type = ConfigurationType::Executable; cfg.outputName = (cfg.type == ConfigurationType::Executable) ? "crafter-asset" : "Crafter.Asset"; cfg.dependencies = { math }; - std::array ifaces = { + // Vendored GDeflate (Apache-2.0, Microsoft DirectStorage reference) + + // libdeflate (MIT, NVIDIA fork). The C++ wrappers live inline in the + // Compression module impl; libdeflate's .c sources are compiled here. + cfg.compileFlags.push_back("-Ilib/gdeflate/libdeflate"); + const std::array libdeflateSources = { + "lib/gdeflate/libdeflate/lib/adler32", + "lib/gdeflate/libdeflate/lib/crc32", + "lib/gdeflate/libdeflate/lib/deflate_compress", + "lib/gdeflate/libdeflate/lib/deflate_decompress", + "lib/gdeflate/libdeflate/lib/gdeflate_compress", + "lib/gdeflate/libdeflate/lib/gdeflate_decompress", + "lib/gdeflate/libdeflate/lib/gzip_compress", + "lib/gdeflate/libdeflate/lib/gzip_decompress", + "lib/gdeflate/libdeflate/lib/utils", + "lib/gdeflate/libdeflate/lib/zlib_compress", + "lib/gdeflate/libdeflate/lib/zlib_decompress", + "lib/gdeflate/libdeflate/lib/x86/cpu_features", + }; + for (const fs::path& p : libdeflateSources) cfg.cFiles.push_back(p); + + std::array ifaces = { "interfaces/Crafter.Asset", + "interfaces/Crafter.Asset-Compression", "interfaces/Crafter.Asset-Texture", "interfaces/Crafter.Asset-Mesh", }; if (cfg.type == ConfigurationType::Executable) { - std::array impls = { "implementations/main" }; + std::array impls = { + "implementations/Crafter.Asset-Compression", + "implementations/main", + }; cfg.GetInterfacesAndImplementations(ifaces, impls); } else { - std::array impls = {}; + std::array impls = { + "implementations/Crafter.Asset-Compression", + }; cfg.GetInterfacesAndImplementations(ifaces, impls); }