asset compression

This commit is contained in:
Jorijn van der Graaf 2026-05-11 18:37:30 +02:00
commit 30a283c1b3
57 changed files with 13237 additions and 8 deletions

View file

@ -0,0 +1,136 @@
/*
Crafter®.Asset
Copyright (C) 2026 Catcrafts®
catcrafts.net
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 3.0 as published by the Free Software Foundation;
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
module;
// Vendored GDeflate (Microsoft DirectStorage reference, Apache-2.0). Headers
// pull libdeflate (MIT) via -Ilib/gdeflate/libdeflate. The C++ wrappers are
// pulled inline because Crafter.Build's cFiles only handles .c TUs — folding
// them into this module impl avoids adding a parallel cppFiles channel.
#include "../lib/gdeflate/GDeflate.h"
#include "../lib/gdeflate/GDeflateCompress.cpp"
#include "../lib/gdeflate/GDeflateDecompress.cpp"
module Crafter.Asset;
import std;
namespace Crafter::Compression {
// Compresses every input stream independently with GDeflate and packs the
// results back to back into a single CompressedBlob.
//
// streams: one span per stream; an empty span yields an empty region
//          (compressedSize == decompressedSize == 0).
// Returns: the concatenated payload plus one RegionMeta per input stream, in
//          input order.
// Throws:  std::runtime_error when GDeflate::Compress reports failure.
CompressedBlob CompressStreams(std::span<const std::span<const std::byte>> streams) {
    const std::size_t streamCount = streams.size();

    // Stage 1: encode each stream into its own buffer. CompressBound is only
    // an upper bound, so the exact payload size is unknown until every stream
    // has been compressed and its buffer trimmed.
    std::vector<std::vector<std::byte>> encoded;
    encoded.reserve(streamCount);
    std::uint64_t payloadBytes = 0;
    for (std::size_t idx = 0; idx < streamCount; ++idx) {
        const std::span<const std::byte> source = streams[idx];
        // Capacity was reserved above, so this reference stays valid.
        std::vector<std::byte>& buffer = encoded.emplace_back();
        if (source.empty()) {
            continue;
        }
        buffer.resize(GDeflate::CompressBound(source.size()));
        std::size_t written = buffer.size();
        const bool succeeded = GDeflate::Compress(
            reinterpret_cast<std::uint8_t*>(buffer.data()),
            &written,
            reinterpret_cast<const std::uint8_t*>(source.data()),
            source.size(),
            GDeflate::MaximumCompressionLevel,
            0);
        if (!succeeded) {
            throw std::runtime_error("GDeflate::Compress failed");
        }
        buffer.resize(written);
        payloadBytes += written;
    }

    // Stage 2: stitch the buffers together and record where each one landed.
    CompressedBlob result;
    result.regions.reserve(streamCount);
    result.bytes.reserve(payloadBytes);
    for (std::size_t idx = 0; idx < streamCount; ++idx) {
        const std::vector<std::byte>& chunk = encoded[idx];
        result.regions.push_back(RegionMeta {
            .srcOffset = result.bytes.size(),
            .compressedSize = chunk.size(),
            .decompressedSize = streams[idx].size(),
        });
        result.bytes.insert(result.bytes.end(), chunk.begin(), chunk.end());
    }
    return result;
}
void WriteBlob(std::ostream& file, const CompressedBlob& blob) {
std::uint32_t regionCount = static_cast<std::uint32_t>(blob.regions.size());
file.write(reinterpret_cast<const char*>(&regionCount), sizeof(regionCount));
if (regionCount > 0) {
file.write(reinterpret_cast<const char*>(blob.regions.data()),
regionCount * sizeof(RegionMeta));
}
std::uint64_t payloadSize = blob.bytes.size();
file.write(reinterpret_cast<const char*>(&payloadSize), sizeof(payloadSize));
if (payloadSize > 0) {
file.write(reinterpret_cast<const char*>(blob.bytes.data()), payloadSize);
}
}
// Deserializes a CompressedBlob written by WriteBlob:
//   u32 regionCount | RegionMeta[regionCount] | u64 payloadSize | payload bytes
//
// Throws std::runtime_error when the stream ends early or when a region
// refers to bytes outside the payload, so callers never receive a blob whose
// region table could drive an out-of-bounds read.
// NOTE: the counts come straight from the file; an absurd value makes the
// resize calls throw (std::bad_alloc / std::length_error) rather than
// returning normally.
CompressedBlob ReadBlob(std::istream& file) {
    CompressedBlob blob;

    std::uint32_t regionCount = 0;
    file.read(reinterpret_cast<char*>(&regionCount), sizeof(regionCount));
    if (!file) {
        throw std::runtime_error("ReadBlob: truncated region count");
    }

    blob.regions.resize(regionCount);
    if (regionCount > 0) {
        file.read(reinterpret_cast<char*>(blob.regions.data()),
                  static_cast<std::streamsize>(regionCount * sizeof(RegionMeta)));
        if (!file) {
            throw std::runtime_error("ReadBlob: truncated region table");
        }
    }

    std::uint64_t payloadSize = 0;
    file.read(reinterpret_cast<char*>(&payloadSize), sizeof(payloadSize));
    if (!file) {
        throw std::runtime_error("ReadBlob: truncated payload size");
    }

    blob.bytes.resize(payloadSize);
    if (payloadSize > 0) {
        file.read(reinterpret_cast<char*>(blob.bytes.data()),
                  static_cast<std::streamsize>(payloadSize));
        if (!file) {
            throw std::runtime_error("ReadBlob: truncated payload");
        }
    }

    // Reject region tables that point outside the payload. The comparison is
    // arranged so that srcOffset + compressedSize cannot overflow.
    for (const RegionMeta& region : blob.regions) {
        if (region.srcOffset > payloadSize ||
            region.compressedSize > payloadSize - region.srcOffset) {
            throw std::runtime_error("ReadBlob: region exceeds payload");
        }
    }
    return blob;
}
// CPU fallback decoder: inflates every region of `blob` into the matching
// caller-provided output span.
//
// blob:    compressed payload and region table.
// outputs: one destination span per region; outputs[i].size() must equal
//          blob.regions[i].decompressedSize.
// Throws std::runtime_error on a count/size mismatch, when a region points
// outside blob.bytes, or when GDeflate::Decompress reports failure.
void DecompressCPU(const CompressedBlob& blob, std::span<const std::span<std::byte>> outputs) {
    if (outputs.size() != blob.regions.size()) {
        throw std::runtime_error("DecompressCPU: outputs.size() != regions.size()");
    }
    const std::uint64_t availableBytes = blob.bytes.size();
    for (std::size_t i = 0; i < blob.regions.size(); ++i) {
        const RegionMeta& r = blob.regions[i];
        const std::span<std::byte>& out = outputs[i];
        if (out.size() != r.decompressedSize) {
            throw std::runtime_error("DecompressCPU: output size mismatch");
        }
        // Empty regions carry no stream at all; nothing to decode.
        if (r.decompressedSize == 0) continue;

        // The region table may come from an untrusted file. Make sure the
        // compressed range lies inside blob.bytes before handing a raw
        // pointer to the decoder (overflow-safe form of offset + size <= total).
        if (r.srcOffset > availableBytes ||
            r.compressedSize > availableBytes - r.srcOffset) {
            throw std::runtime_error("DecompressCPU: region exceeds blob bytes");
        }

        const bool ok = GDeflate::Decompress(
            reinterpret_cast<std::uint8_t*>(out.data()),
            r.decompressedSize,
            reinterpret_cast<const std::uint8_t*>(blob.bytes.data() + r.srcOffset),
            r.compressedSize,
            /*numWorkers=*/1);
        if (!ok) {
            throw std::runtime_error("GDeflate::Decompress failed");
        }
    }
}
}

View file

@ -19,6 +19,35 @@ import std;
using namespace Crafter; using namespace Crafter;
namespace fs = std::filesystem; namespace fs = std::filesystem;
// CPU GDeflate roundtrip sanity test across the size boundaries from the
// implementation plan. Returns 0 when every size roundtrips, 1 on the first
// byte mismatch or codec exception.
static int RunCompressionRoundtrip() {
    const std::array<std::size_t, 5> sizes = { 1, 65535, 65536, 65537, 16 * 1024 * 1024 };
    // Length of each alternating run of patterned / random bytes.
    constexpr std::size_t runLength = 4096;
    std::mt19937_64 rng(0xC0FFEEu);
    for (std::size_t n : sizes) {
        std::vector<std::byte> input(n);
        for (std::size_t i = 0; i < n; ++i) {
            // Alternate slowly-changing (compressible) runs with random
            // (incompressible) runs. XOR-ing every byte with rng() would make
            // the whole buffer pure noise, so the compressible path would
            // never be exercised.
            const bool noisy = ((i / runLength) % 2) == 1;
            const std::uint64_t value = noisy ? rng() : (i / 16);
            input[i] = static_cast<std::byte>(value & 0xFF);
        }

        std::vector<std::byte> output(n);
        std::size_t compressedSize = 0;
        try {
            std::array<std::span<const std::byte>, 1> streams = { std::span<const std::byte>(input) };
            Compression::CompressedBlob blob = Compression::CompressStreams(streams);
            std::array<std::span<std::byte>, 1> outputs = { std::span<std::byte>(output) };
            Compression::DecompressCPU(blob, outputs);
            compressedSize = blob.bytes.size();
        } catch (const std::exception& e) {
            // Both codec entry points throw on failure; report it as a test
            // failure instead of terminating the process.
            std::cerr << "[FAIL] roundtrip size=" << n << ": " << e.what() << "\n";
            return 1;
        }

        if (output != input) {
            std::cerr << "[FAIL] roundtrip size=" << n << "\n";
            return 1;
        }
        std::cout << "[ok] size=" << n
                  << " compressed=" << compressedSize
                  << " ratio=" << (double(compressedSize) / double(n)) << "\n";
    }
    std::cout << "All roundtrips passed.\n";
    return 0;
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
// Parse arguments: crafter-asset <input_file> [output_file] [--format u8|f16] // Parse arguments: crafter-asset <input_file> [output_file] [--format u8|f16]
fs::path inputPath; fs::path inputPath;
@ -29,6 +58,9 @@ int main(int argc, char** argv) {
std::vector<std::string> positional; std::vector<std::string> positional;
for (int i = 1; i < argc; ++i) { for (int i = 1; i < argc; ++i) {
std::string arg = argv[i]; std::string arg = argv[i];
if (arg == "--test-compression") {
return RunCompressionRoundtrip();
}
if (arg == "--format" || arg == "-f") { if (arg == "--format" || arg == "-f") {
if (i + 1 >= argc) { if (i + 1 >= argc) {
std::cerr << "Error: --format requires a value (u8 or f16)\n"; std::cerr << "Error: --format requires a value (u8 or f16)\n";

View file

@ -0,0 +1,59 @@
/*
Crafter®.Asset
Copyright (C) 2026 Catcrafts®
catcrafts.net
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 3.0 as published by the Free Software Foundation;
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
export module Crafter.Asset:Compression;
import std;
export namespace Crafter::Compression {
// One independently-decompressable GDeflate stream inside CompressedBlob::bytes.
// Layout matches what VK_EXT_memory_decompression's VkDecompressMemoryRegionEXT
// expects, minus the device addresses (which the consumer fills in).
// WriteBlob/ReadBlob copy arrays of this struct to and from disk as raw bytes,
// so changing the fields changes the on-disk format.
struct RegionMeta {
// Byte offset of the stream's first byte within CompressedBlob::bytes.
std::uint64_t srcOffset;
// Number of compressed bytes the stream occupies; 0 for an empty stream.
std::uint64_t compressedSize;
// Size in bytes of the original, uncompressed stream.
std::uint64_t decompressedSize;
};
struct CompressedBlob {
std::vector<std::byte> bytes;
std::vector<RegionMeta> regions;
std::uint64_t TotalDecompressedSize() const noexcept {
std::uint64_t sum = 0;
for (const RegionMeta& r : regions) sum += r.decompressedSize;
return sum;
}
};
// Compresses each input span as its own GDeflate tile-stream; concatenates
// them into one byte buffer with a parallel region table. Streams are
// independent and can be addressed individually by VkDecompressMemoryRegionEXT
// entries on the GPU path.
// An empty input span produces a region with zero compressed and decompressed
// size. Throws std::runtime_error if the underlying compressor fails.
CompressedBlob CompressStreams(std::span<const std::span<const std::byte>> streams);
// CPU fallback decoder. outputs.size() must equal blob.regions.size();
// outputs[i].size() must equal blob.regions[i].decompressedSize.
// Violating either requirement, or a decoder failure, throws
// std::runtime_error. Regions with decompressedSize == 0 are skipped.
void DecompressCPU(const CompressedBlob& blob, std::span<const std::span<std::byte>> outputs);
// Length-prefixed serialization of a CompressedBlob. Used by per-asset
// SaveCompressed/LoadCompressed implementations after they've written
// their own type-specific header.
// Layout: u32 region count, the RegionMeta array, u64 payload size, payload
// bytes, all written as raw in-memory bytes.
void WriteBlob(std::ostream& file, const CompressedBlob& blob);
CompressedBlob ReadBlob(std::istream& file);
}

View file

@ -18,6 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/ */
export module Crafter.Asset:Mesh; export module Crafter.Asset:Mesh;
import :Compression;
import Crafter.Math; import Crafter.Math;
import std; import std;
namespace fs = std::filesystem; namespace fs = std::filesystem;
@ -35,6 +36,44 @@ export namespace Crafter {
Vector<float, 2, 0> uv; Vector<float, 2, 0> uv;
}; };
// Still-compressed form of a mesh, as produced by MeshAsset<T>::SaveCompressed
// and returned by LoadCompressedMesh. The blob holds three regions in the
// order [vertex, index, data]; when dataCount is 0 the data region is empty
// (zero compressedSize and decompressedSize). dataStride stores sizeof(T) as
// it was at compress time so a consumer can check it against its own T.
struct CompressedMeshAsset {
    std::uint32_t vertexCount{};
    std::uint32_t indexCount{};
    std::uint32_t dataCount{};
    std::uint32_t dataStride{};
    Compression::CompressedBlob blob;
};
// File-format constants shared by the compressed-mesh writer and reader.
namespace MeshAssetFormat {
    // Four-byte tag written at the very start of a compressed mesh file.
    inline constexpr char magic[4]{'C', 'G', 'D', 'M'};
    // Format revision written right after the tag; readers reject any other value.
    inline constexpr std::uint32_t version{1};
}
// Reads a compressed mesh file written by MeshAsset::SaveCompressed and
// returns it without decompressing.
//
// File layout: 4-byte magic, u32 version, u32 vertexCount, u32 indexCount,
// u32 dataCount, u32 dataStride, followed by a Compression::WriteBlob payload.
// Throws std::runtime_error when the file cannot be opened, the magic or
// version does not match, or the header is truncated. Errors raised by
// Compression::ReadBlob propagate unchanged.
inline CompressedMeshAsset LoadCompressedMesh(fs::path path) {
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("LoadCompressedMesh: cannot open " + path.string());
    }

    // Zero-initialised so a short read compares as "bad magic" rather than
    // comparing indeterminate bytes.
    char magic[4] = {};
    file.read(magic, sizeof(magic));
    if (!file || std::memcmp(magic, MeshAssetFormat::magic, sizeof(magic)) != 0) {
        throw std::runtime_error("LoadCompressedMesh: bad magic on " + path.string());
    }

    std::uint32_t version = 0;
    file.read(reinterpret_cast<char*>(&version), sizeof(version));
    if (!file || version != MeshAssetFormat::version) {
        throw std::runtime_error("LoadCompressedMesh: unsupported version on " + path.string());
    }

    CompressedMeshAsset out;
    file.read(reinterpret_cast<char*>(&out.vertexCount), sizeof(out.vertexCount));
    file.read(reinterpret_cast<char*>(&out.indexCount), sizeof(out.indexCount));
    file.read(reinterpret_cast<char*>(&out.dataCount), sizeof(out.dataCount));
    file.read(reinterpret_cast<char*>(&out.dataStride), sizeof(out.dataStride));
    // Once one read fails the stream stays failed, so a single check covers
    // all four header fields.
    if (!file) {
        throw std::runtime_error("LoadCompressedMesh: truncated header on " + path.string());
    }

    out.blob = Compression::ReadBlob(file);
    return out;
}
template <typename T> template <typename T>
struct MeshAsset { struct MeshAsset {
std::vector<Vector<float, 3, 3>> vertexes; std::vector<Vector<float, 3, 3>> vertexes;
@ -57,6 +96,29 @@ export namespace Crafter {
file.write(reinterpret_cast<char*>(datas.data()), dataCount * sizeof(T)); file.write(reinterpret_cast<char*>(datas.data()), dataCount * sizeof(T));
} }
// Writes this mesh as a GDeflate-compressed file readable by
// LoadCompressedMesh. The vertex, index and data arrays are compressed as
// three independent regions, in that order.
//
// Layout: magic, u32 version, u32 vertexCount, u32 indexCount, u32 dataCount,
// u32 dataStride (= sizeof(T)), then the Compression::WriteBlob payload.
// Throws std::runtime_error when compression fails, the file cannot be
// opened, or writing fails.
void SaveCompressed(fs::path path) const {
    std::array<std::span<const std::byte>, 3> streams = {
        std::as_bytes(std::span(vertexes)),
        std::as_bytes(std::span(indexes)),
        std::as_bytes(std::span(datas)),
    };
    Compression::CompressedBlob blob = Compression::CompressStreams(streams);

    std::ofstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("SaveCompressed: cannot open " + path.string());
    }

    const auto writeU32 = [&file](std::uint32_t value) {
        file.write(reinterpret_cast<const char*>(&value), sizeof(value));
    };

    file.write(MeshAssetFormat::magic, 4);
    writeU32(MeshAssetFormat::version);
    writeU32(static_cast<std::uint32_t>(vertexes.size()));
    writeU32(static_cast<std::uint32_t>(indexes.size()));
    writeU32(static_cast<std::uint32_t>(datas.size()));
    writeU32(static_cast<std::uint32_t>(sizeof(T)));
    Compression::WriteBlob(file, blob);

    // Flush so buffered-write errors (disk full, etc.) are observed here
    // rather than being swallowed by the destructor.
    file.flush();
    if (!file) {
        throw std::runtime_error("SaveCompressed: write failed on " + path.string());
    }
}
static MeshAsset<T> Load(fs::path path) { static MeshAsset<T> Load(fs::path path) {
MeshAsset<T> mesh; MeshAsset<T> mesh;
@ -196,6 +258,32 @@ export namespace Crafter {
file.write(reinterpret_cast<char*>(vertexes.data()), vertexCount * sizeof(Vector<float, 3, 3>)); file.write(reinterpret_cast<char*>(vertexes.data()), vertexCount * sizeof(Vector<float, 3, 3>));
file.write(reinterpret_cast<char*>(indexes.data()), indexCount * sizeof(std::uint32_t)); file.write(reinterpret_cast<char*>(indexes.data()), indexCount * sizeof(std::uint32_t));
} }
// Writes this data-less mesh as a GDeflate-compressed file readable by
// LoadCompressedMesh.
//
// Three regions are emitted to keep the file format identical to the
// templated variant; the data region is empty (skipped on the GPU path) and
// both dataCount and dataStride are written as 0.
// Throws std::runtime_error when compression fails, the file cannot be
// opened, or writing fails.
void SaveCompressed(fs::path path) const {
    std::array<std::span<const std::byte>, 3> streams = {
        std::as_bytes(std::span(vertexes)),
        std::as_bytes(std::span(indexes)),
        std::span<const std::byte>{},
    };
    Compression::CompressedBlob blob = Compression::CompressStreams(streams);

    std::ofstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("SaveCompressed: cannot open " + path.string());
    }

    const auto writeU32 = [&file](std::uint32_t value) {
        file.write(reinterpret_cast<const char*>(&value), sizeof(value));
    };

    file.write(MeshAssetFormat::magic, 4);
    writeU32(MeshAssetFormat::version);
    writeU32(static_cast<std::uint32_t>(vertexes.size()));
    writeU32(static_cast<std::uint32_t>(indexes.size()));
    writeU32(0); // dataCount: this specialization carries no per-vertex data
    writeU32(0); // dataStride
    Compression::WriteBlob(file, blob);

    // Flush so buffered-write errors are observed here rather than being
    // swallowed by the destructor.
    file.flush();
    if (!file) {
        throw std::runtime_error("SaveCompressed: write failed on " + path.string());
    }
}
static MeshAsset<void> Load(fs::path path) { static MeshAsset<void> Load(fs::path path) {
MeshAsset<void> mesh; MeshAsset<void> mesh;

View file

@ -21,6 +21,7 @@ module;
#define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_IMPLEMENTATION
#include "../lib/stb_image.h" #include "../lib/stb_image.h"
export module Crafter.Asset:Texture; export module Crafter.Asset:Texture;
import :Compression;
import std; import std;
import Crafter.Math; import Crafter.Math;
namespace fs = std::filesystem; namespace fs = std::filesystem;
@ -38,6 +39,42 @@ export namespace Crafter {
OpaqueType opaque; OpaqueType opaque;
}; };
// Still-compressed form of a texture, as produced by
// TextureAsset<T>::SaveCompressed and returned by LoadCompressedTexture.
// The blob holds a single region: the whole pixel array as one stream.
// pixelStride stores sizeof(T) as it was at compress time.
struct CompressedTextureAsset {
    std::uint16_t sizeX{};
    std::uint16_t sizeY{};
    OpaqueType opaque{OpaqueType::FullyOpaque};
    std::uint32_t pixelStride{};
    Compression::CompressedBlob blob;
};
// File-format constants shared by the compressed-texture writer and reader.
namespace TextureAssetFormat {
    // Four-byte tag written at the very start of a compressed texture file.
    inline constexpr char magic[4]{'C', 'G', 'D', 'T'};
    // Format revision written right after the tag; readers reject any other value.
    inline constexpr std::uint32_t version{1};
}
// Reads a compressed texture file written by TextureAsset::SaveCompressed and
// returns it without decompressing.
//
// File layout: 4-byte magic, u32 version, u16 sizeX, u16 sizeY, OpaqueType
// opaque (raw bytes), u32 pixelStride, followed by a Compression::WriteBlob
// payload.
// Throws std::runtime_error when the file cannot be opened, the magic or
// version does not match, or the header is truncated. Errors raised by
// Compression::ReadBlob propagate unchanged.
inline CompressedTextureAsset LoadCompressedTexture(fs::path path) {
    std::ifstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("LoadCompressedTexture: cannot open " + path.string());
    }

    // Zero-initialised so a short read compares as "bad magic" rather than
    // comparing indeterminate bytes.
    char magic[4] = {};
    file.read(magic, sizeof(magic));
    if (!file || std::memcmp(magic, TextureAssetFormat::magic, sizeof(magic)) != 0) {
        throw std::runtime_error("LoadCompressedTexture: bad magic on " + path.string());
    }

    std::uint32_t version = 0;
    file.read(reinterpret_cast<char*>(&version), sizeof(version));
    if (!file || version != TextureAssetFormat::version) {
        throw std::runtime_error("LoadCompressedTexture: unsupported version on " + path.string());
    }

    CompressedTextureAsset out;
    file.read(reinterpret_cast<char*>(&out.sizeX), sizeof(out.sizeX));
    file.read(reinterpret_cast<char*>(&out.sizeY), sizeof(out.sizeY));
    // NOTE(review): the enum is read as raw bytes and not range-checked here.
    file.read(reinterpret_cast<char*>(&out.opaque), sizeof(out.opaque));
    file.read(reinterpret_cast<char*>(&out.pixelStride), sizeof(out.pixelStride));
    // Once one read fails the stream stays failed, so a single check covers
    // all four header fields.
    if (!file) {
        throw std::runtime_error("LoadCompressedTexture: truncated header on " + path.string());
    }

    out.blob = Compression::ReadBlob(file);
    return out;
}
template <typename T> template <typename T>
struct TextureAsset { struct TextureAsset {
std::uint16_t sizeX; std::uint16_t sizeX;
@ -54,6 +91,24 @@ export namespace Crafter {
file.write(reinterpret_cast<char*>(pixels.data()), pixels.size() * sizeof(T)); file.write(reinterpret_cast<char*>(pixels.data()), pixels.size() * sizeof(T));
} }
// Writes this texture as a GDeflate-compressed file readable by
// LoadCompressedTexture. The pixel array is compressed as a single region.
//
// Layout: magic, u32 version, u16 sizeX, u16 sizeY, OpaqueType opaque (raw
// bytes), u32 pixelStride (= sizeof(T)), then the Compression::WriteBlob
// payload.
// Throws std::runtime_error when compression fails, the file cannot be
// opened, or writing fails.
void SaveCompressed(fs::path path) const {
    std::array<std::span<const std::byte>, 1> streams = {
        std::as_bytes(std::span(pixels)),
    };
    Compression::CompressedBlob blob = Compression::CompressStreams(streams);

    std::ofstream file(path, std::ios::binary);
    if (!file) {
        throw std::runtime_error("SaveCompressed: cannot open " + path.string());
    }

    const std::uint32_t version = TextureAssetFormat::version;
    const std::uint32_t stride = static_cast<std::uint32_t>(sizeof(T));
    file.write(TextureAssetFormat::magic, 4);
    file.write(reinterpret_cast<const char*>(&version), sizeof(version));
    file.write(reinterpret_cast<const char*>(&sizeX), sizeof(sizeX));
    file.write(reinterpret_cast<const char*>(&sizeY), sizeof(sizeY));
    file.write(reinterpret_cast<const char*>(&opaque), sizeof(opaque));
    file.write(reinterpret_cast<const char*>(&stride), sizeof(stride));
    Compression::WriteBlob(file, blob);

    // Flush so buffered-write errors are observed here rather than being
    // swallowed by the destructor.
    file.flush();
    if (!file) {
        throw std::runtime_error("SaveCompressed: write failed on " + path.string());
    }
}
static TextureAsset<T> Load(fs::path path) { static TextureAsset<T> Load(fs::path path) {
TextureAsset<T> tex; TextureAsset<T> tex;

View file

@ -19,5 +19,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
export module Crafter.Asset; export module Crafter.Asset;
export import :Compression;
export import :Mesh; export import :Mesh;
export import :Texture; export import :Texture;

46
lib/gdeflate/GDeflate.h Normal file
View file

@ -0,0 +1,46 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corportaion. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "config.h"
// Public entry points of the vendored GDeflate tile-stream codec.
namespace GDeflate
{
// See README.MD in libdeflate_1_8 for details on Compression Levels
static const uint32_t MinimumCompressionLevel = 1;
static const uint32_t MaximumCompressionLevel = 12;
// Bit flags accepted by Compress() in its `flags` argument.
enum Flags
{
COMPRESS_SINGLE_THREAD = 0x200, /*!< Force compression using a single thread. */
};
// Returns an upper bound, in bytes, on the output Compress() can produce for
// an input of `size` bytes. Use it to size the output buffer.
size_t CompressBound(size_t size);
// Compresses `inSize` bytes at `in` into `output`.
// On entry *outputSize is the capacity of `output`; on success it is
// overwritten with the number of bytes written.
// Returns false when a pointer is null, `inSize` is 0, the input exceeds the
// maximum number of tiles, or a tile fails to compress.
bool Compress(
uint8_t* output,
size_t* outputSize,
const uint8_t* in,
size_t inSize,
uint32_t level,
uint32_t flags);
// Decompresses the tile stream at `in` (`inSize` bytes) into `output`.
// `numWorkers` is clamped to the range [1, 31] by the implementation.
// Returns false when a pointer is null, a size is 0, the stream header is
// rejected, or a tile fails to decompress.
bool Decompress(uint8_t* output, size_t outputSize, const uint8_t* in, size_t inSize, uint32_t numWorkers);
} // namespace GDeflate

View file

@ -0,0 +1,270 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "GDeflate.h"
#include "TileStream.h"
#include "Utils.h"
#include "config.h"
#include <assert.h>
#include "libdeflate/libdeflate.h"
#include <string.h>
#include <algorithm>
#include <atomic>
#include <thread>
#include <vector>
// Lets std::unique_ptr<libdeflate_gdeflate_compressor> release the compressor
// through libdeflate's own free function instead of `delete`.
template<>
struct std::default_delete<libdeflate_gdeflate_compressor>
{
void operator()(libdeflate_gdeflate_compressor* p) const
{
libdeflate_free_gdeflate_compressor(p);
}
};
namespace GDeflate
{
// Upper limit on extra compression threads; the calling thread also runs the job.
static constexpr uint32_t kMaxWorkers = 31;
// One extra worker is started per this many tiles (rounded up).
static constexpr uint32_t kMinTilesPerWorker = 64;
// Bounds-checked write cursor over a caller-owned byte buffer.
// `ptr`/`size` describe the buffer, `pos` is the current write offset.
struct OutputStreamWrapper
{
    uint8_t* ptr = nullptr;
    size_t size = 0;
    size_t pos = 0;

    // Copies `n` objects of type T starting at `d` to the current position
    // and advances `pos`. If the data would not fit, a message is printed and
    // neither the buffer nor `pos` is modified.
    template<typename T>
    void StreamOut(T* d, size_t n = 1)
    {
        const size_t byteCount = sizeof(T) * n;
        const bool fits = !(pos + byteCount > size);
        if (fits)
        {
            memcpy(ptr + pos, d, byteCount);
            pos += byteCount;
        }
        else
        {
            printf("Fatal: stream overrun!\n");
        }
    }

    // Moves the write position to `n`. Returns false (after printing a
    // message, leaving `pos` unchanged) when `n` lies beyond the buffer.
    bool SetPos(size_t n)
    {
        const bool inRange = !(n > size);
        if (inRange)
        {
            pos = n;
        }
        else
        {
            printf("Fatal: stream overrun!\n");
        }
        return inRange;
    }
};
// State shared by every thread taking part in one DoCompress call.
struct CompressionContext
{
// Start and length of the uncompressed input.
const uint8_t* inputPtr;
size_t inputSize;
// Result of compressing one tile of the input.
struct Tile
{
// Compressed bytes of the tile.
std::vector<uint8_t> data;
// Number of input bytes this tile covers.
size_t uncompressedSize = 0;
};
// One slot per tile, filled in by whichever thread claims that tile.
std::vector<Tile> tiles;
// Next unclaimed tile index; threads claim tiles with fetch_add.
std::atomic_uint32_t globalIndex;
// Total number of tiles.
uint32_t numItems;
// Set by any thread whose tile failed to compress.
std::atomic_bool failed;
};
// Compresses `in` into a GDeflate tile stream: a TileStream header, a table of
// 32-bit tile offsets, then the compressed tiles back to back.
//
// output/outputSize: destination buffer; *outputSize is its capacity on entry
//                    and the number of bytes written on success.
// in/inSize:         uncompressed input.
// level:             libdeflate compression level.
// flags:             COMPRESS_SINGLE_THREAD keeps all work on the caller's thread.
// Returns false on invalid arguments, oversized input, compressor allocation
// or tile compression failure, or when the result does not fit in `output`.
static bool DoCompress(
    uint8_t* output,
    size_t* outputSize,
    const uint8_t* in,
    size_t inSize,
    uint32_t level,
    uint32_t flags)
{
    if (outputSize == nullptr || output == nullptr || in == nullptr || inSize == 0)
        return false;

    if (inSize > kDefaultTileSize * TileStream::kMaxTiles)
        return false;

    CompressionContext context{};
    context.inputPtr = in;
    context.inputSize = inSize;
    context.numItems = static_cast<uint32_t>((inSize + kDefaultTileSize - 1) / kDefaultTileSize);
    context.tiles.resize(context.numItems);
    context.failed = false;

    // Worker body: repeatedly claim the next tile index and compress it.
    auto TileCompressionJob = [&context, level]()
    {
        size_t pageCount = 0;
        const size_t scratchSize = libdeflate_gdeflate_compress_bound(nullptr, kDefaultTileSize, &pageCount);
        assert(pageCount == 1);

        void* scratch = alloca(scratchSize);

        std::unique_ptr<libdeflate_gdeflate_compressor> compressor(libdeflate_alloc_gdeflate_compressor(level));
        // Allocation can fail; report it instead of passing a null compressor on.
        if (!compressor)
        {
            context.failed = true;
            return;
        }

        while (true)
        {
            const uint32_t tileIndex = context.globalIndex.fetch_add(1, std::memory_order_relaxed);

            if (tileIndex >= context.numItems)
                break;

            const size_t tilePos = tileIndex * kDefaultTileSize;

            size_t remaining = context.inputSize - tilePos;
            size_t uncompressedSize = std::min<size_t>(remaining, kDefaultTileSize);

            auto& tile = context.tiles[tileIndex];
            tile.uncompressedSize = uncompressedSize;

            libdeflate_gdeflate_out_page compressedPage{scratch, scratchSize};

            size_t result = libdeflate_gdeflate_compress(
                compressor.get(),
                context.inputPtr + tilePos,
                uncompressedSize,
                &compressedPage,
                1);

            if (result == 0)
            {
                context.failed = true;
                break;
            }

            tile.data.resize(compressedPage.nbytes);
            memcpy(tile.data.data(), compressedPage.data, compressedPage.nbytes);
        }
    };

    std::thread workers[kMaxWorkers + 1];

    const uint32_t maxWorkers = std::min(kMaxWorkers, std::thread::hardware_concurrency());

    uint32_t numWorkersLeft =
        std::min(maxWorkers, (context.numItems + kMinTilesPerWorker - 1) / kMinTilesPerWorker);

    if (flags & COMPRESS_SINGLE_THREAD)
        numWorkersLeft = 0;

    for (auto& worker : workers)
    {
        if (numWorkersLeft == 0)
            break;

        worker = std::thread([TileCompressionJob]() { TileCompressionJob(); });

        --numWorkersLeft;
    }

    // The calling thread always takes part as well.
    TileCompressionJob();

    for (auto& worker : workers)
    {
        if (worker.joinable())
            worker.join();
    }

    // Compression failed
    if (context.failed)
        return false;

    // Compression is done. Prepare the output stream.
    std::vector<uint32_t> tilePtrs;
    size_t dataPos = 0;

    for (auto const& tile : context.tiles)
    {
        tilePtrs.push_back(static_cast<uint32_t>(dataPos));
        dataPos += tile.data.size();
    }

    // tilePtrs[0] is used to store the size of the last tile; all the other
    // elements are offsets to the tile data.
    tilePtrs[0] = static_cast<uint32_t>(context.tiles.back().data.size());

    assert(tilePtrs.size() <= TileStream::kMaxTiles);
    assert(tilePtrs.size() == context.tiles.size());
    assert(tilePtrs.size() == context.numItems);

    // Refuse to emit a truncated stream: the writes below only print a message
    // on overrun, so without this check a too-small buffer would still be
    // reported as success.
    const size_t requiredSize = sizeof(TileStream) + tilePtrs.size() * sizeof(uint32_t) + dataPos;
    if (requiredSize > *outputSize)
        return false;

    OutputStreamWrapper outputStream;
    outputStream.ptr = output;
    outputStream.size = *outputSize;

    // calculate uncompressed size
    size_t uncompressedSize = tilePtrs.size() * kDefaultTileSize;

    size_t tailSize = context.inputSize - (tilePtrs.size() - 1) * kDefaultTileSize;
    if (tailSize < kDefaultTileSize)
        uncompressedSize -= kDefaultTileSize - tailSize;

    assert(uncompressedSize == inSize);

    TileStream header(uncompressedSize);
    assert(tilePtrs.size() == header.numTiles);

    outputStream.StreamOut(&header);
    outputStream.StreamOut(tilePtrs.data(), tilePtrs.size());

    size_t dataOffset = outputStream.pos;

    for (size_t i = 0; i < tilePtrs.size(); ++i)
    {
        CompressionContext::Tile const& tile = context.tiles[i];
        // Slot 0 holds the last tile's size, so the first tile's offset is implicitly 0.
        uint32_t tileOffset = (i == 0) ? 0 : tilePtrs[i];

        outputStream.SetPos(dataOffset + tileOffset);
        outputStream.StreamOut(tile.data.data(), tile.data.size());
    }

    *outputSize = outputStream.pos;

    return true;
}
size_t CompressBound(size_t size)
{
size_t numTiles = std::min(size_t(TileStream::kMaxTiles), (size + kDefaultTileSize - 1) / kDefaultTileSize);
numTiles = std::max(size_t(1), numTiles);
const size_t tileSize = kDefaultTileSize +
// Tile header. Ideally need to make it a part of compressor API.
(sizeof(uint32_t) + 4 * 208 + 4 * 8);
return numTiles * tileSize + sizeof(TileStream) + sizeof(uint64_t);
}
// Public compression entry point; forwards every argument unchanged to
// DoCompress and returns its result.
bool Compress(uint8_t* output, size_t* outputSize, const uint8_t* in, size_t inSize, uint32_t level, uint32_t flags)
{
return DoCompress(output, outputSize, in, inSize, level, flags);
}
} // namespace GDeflate

View file

@ -0,0 +1,170 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "TileStream.h"
#include "Utils.h"
#include "libdeflate/libdeflate.h"
#include <algorithm>
#include <atomic>
#include <thread>
// Lets std::unique_ptr<libdeflate_gdeflate_decompressor> release the
// decompressor through libdeflate's own free function instead of `delete`.
template<>
struct std::default_delete<libdeflate_gdeflate_decompressor>
{
void operator()(libdeflate_gdeflate_decompressor* p) const
{
libdeflate_free_gdeflate_decompressor(p);
}
};
namespace GDeflate
{
// Upper limit applied to the caller-supplied worker count in Decompress();
// also the size of its thread array.
static constexpr uint32_t kMaxDecompressWorkers = 31;
// Checks that `header` passes its own validity test and carries the GDeflate
// codec id. Prints a diagnostic and returns false otherwise.
static bool ValidateStream(const TileStream* header)
{
    const bool wellFormed = header->IsValid();
    if (wellFormed && header->id == kGDeflateId)
        return true;

    if (!wellFormed)
        printf("Malformed stream encountered.\n");
    else
        printf("Unknown stream format: %d\n", header->id);

    return false;
}
// State shared by every thread taking part in one Decompress call.
struct DecompressionContext
{
// Start and length of the compressed tile stream (header included).
const uint8_t* inputPtr;
size_t inputSize;
// Destination buffer and the uncompressed size reported by the stream header.
uint8_t* outputPtr;
size_t outputSize;
// Next unclaimed tile index; threads claim tiles with fetch_add.
std::atomic_uint32_t globalIndex;
// Total number of tiles, taken from the stream header.
uint32_t numItems;
// Set by any thread whose tile failed to decompress.
std::atomic_bool failed;
};
// Worker body for Decompress(): repeatedly claims the next tile index from
// `context` and inflates that tile into its slot of the output buffer.
// Sets context.failed and stops on the first error, including a tile whose
// offset or size points outside the input buffer.
// `compressorId` is accepted for signature compatibility and is not used.
static void TileDecompressionJob(DecompressionContext& context, uint32_t compressorId)
{
    std::unique_ptr<libdeflate_gdeflate_decompressor> decompressor(libdeflate_alloc_gdeflate_decompressor());
    // Allocation can fail; report it instead of passing a null decompressor on.
    if (!decompressor)
    {
        context.failed = true;
        return;
    }

    // Stream layout: TileStream header | numItems x uint32 offsets | tile data.
    // The offsets come from untrusted input, so establish how many data bytes
    // actually exist before trusting any of them.
    const size_t tableBytes = sizeof(TileStream) + static_cast<size_t>(context.numItems) * sizeof(uint32_t);
    if (context.inputSize < tableBytes)
    {
        context.failed = true;
        return;
    }
    const size_t dataBytes = context.inputSize - tableBytes;

    const uint32_t* tileOffsets = reinterpret_cast<const uint32_t*>(context.inputPtr + sizeof(TileStream));
    const uint8_t* inDataPtr = reinterpret_cast<const uint8_t*>(tileOffsets + context.numItems);

    while (true)
    {
        const uint32_t tileIndex = context.globalIndex.fetch_add(1, std::memory_order_relaxed);

        if (tileIndex >= context.numItems)
            break;

        // Slot 0 of the table stores the last tile's size, so tile 0 starts at 0.
        const size_t tileOffset = tileIndex > 0 ? tileOffsets[tileIndex] : 0;
        const size_t tileBytes =
            tileIndex < context.numItems - 1 ? tileOffsets[tileIndex + 1] - tileOffset : tileOffsets[0];

        // Reject tiles that extend past the input. Non-monotonic offsets wrap
        // tileBytes to a huge value and are caught here too.
        if (tileOffset > dataBytes || tileBytes > dataBytes - tileOffset)
        {
            context.failed = true;
            break;
        }

        libdeflate_gdeflate_in_page compressedPage{};
        compressedPage.data = inDataPtr + tileOffset;
        compressedPage.nbytes = tileBytes;

        auto outputOffset = tileIndex * kDefaultTileSize;

        const libdeflate_result decompressResult = libdeflate_gdeflate_decompress(
            decompressor.get(),
            &compressedPage,
            1,
            context.outputPtr + outputOffset,
            static_cast<size_t>(kDefaultTileSize),
            nullptr);

        if (decompressResult != LIBDEFLATE_SUCCESS)
        {
            context.failed = true;
            break;
        }
    }
}
// Decompresses the GDeflate tile stream at `in` into `output`.
//
// output/outputSize: destination buffer and its capacity in bytes.
// in/inSize:         compressed stream, starting with its TileStream header.
// numWorkers:        requested thread count, clamped to [1, kMaxDecompressWorkers];
//                    extra threads are only used when there are more than
//                    2 * numWorkers tiles.
// Returns false on invalid arguments, a stream too short to hold its header
// and offset table, a rejected header, an output buffer smaller than the size
// recorded in the header, or any tile failing to decompress.
bool Decompress(uint8_t* output, size_t outputSize, const uint8_t* in, size_t inSize, uint32_t numWorkers)
{
    if (nullptr == output || nullptr == in || 0 == outputSize || 0 == inSize)
        return false;

    // The header is read straight out of `in`; make sure it is all there.
    if (inSize < sizeof(TileStream))
        return false;

    numWorkers = std::min(kMaxDecompressWorkers, numWorkers);
    numWorkers = std::max(1u, numWorkers);

    auto header = reinterpret_cast<const TileStream*>(in);

    if (!ValidateStream(header))
        return false;

    // The offset table follows the header; it must fit inside the input too.
    const size_t tableBytes = sizeof(TileStream) + static_cast<size_t>(header->numTiles) * sizeof(uint32_t);
    if (inSize < tableBytes)
        return false;

    // Tiles are written at fixed strides derived from the header, so a stream
    // that claims more data than the caller's buffer holds must be rejected.
    // (DoCompress builds the header from the exact input size.)
    if (header->GetUncompressedSize() > outputSize)
        return false;

    std::thread workers[kMaxDecompressWorkers];

    DecompressionContext context{};
    context.inputPtr = in;
    context.inputSize = inSize;
    context.outputPtr = output;
    context.outputSize = header->GetUncompressedSize();
    context.globalIndex = 0;
    context.numItems = header->numTiles;
    context.failed = false;

    // Only fan out when there is enough work; the calling thread counts as
    // one of the workers.
    uint32_t numWorkersLeft = context.numItems > (2 * numWorkers) ? numWorkers : 1;

    const uint32_t compressorId = header->id;

    for (auto& worker : workers)
    {
        if (numWorkersLeft == 1)
            break;

        worker = std::thread([&context, compressorId]() { TileDecompressionJob(context, compressorId); });

        --numWorkersLeft;
    }

    TileDecompressionJob(context, compressorId);

    for (auto& worker : workers)
    {
        if (worker.joinable())
            worker.join();
    }

    return (!context.failed);
}
} // namespace GDeflate

202
lib/gdeflate/LICENSE Normal file
View file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

83
lib/gdeflate/TileStream.h Normal file
View file

@ -0,0 +1,83 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "Utils.h"
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include <string>
namespace GDeflate
{
#pragma pack(push, 1)
// Packed header placed at the start of a GDeflate tile stream.
// `id` holds the codec id and `magic` its bitwise complement; the uncompressed size is
// encoded as a tile count plus the byte length of a partially filled last tile.
struct TileStream
{
    static constexpr uint32_t kMaxTiles = (1 << 16) - 1;
    uint8_t id;
    uint8_t magic;
    uint16_t numTiles;
    uint32_t tileSizeIdx : 2; // this must be set to 1
    uint32_t lastTileSize : 18;
    uint32_t reserved1 : 12;

    // Builds a zeroed header describing `uncompressedSize` bytes of GDeflate data.
    TileStream(size_t uncompressedSize)
    {
        memset(this, 0, sizeof(*this));
        tileSizeIdx = 1;
        SetCodecId(kGDeflateId);
        SetUncompressedSize(uncompressedSize);
    }

    // True when `magic` is the complement of `id`, as written by SetCodecId.
    bool IsValid() const
    {
        const int expectedId = magic ^ 0xff;
        return expectedId == id;
    }

    // Total uncompressed size in bytes: all tiles are full-sized except that a
    // non-zero lastTileSize replaces the size of the final tile.
    size_t GetUncompressedSize() const
    {
        const size_t allTilesFull = numTiles * kDefaultTileSize;
        if (lastTileSize == 0)
        {
            return allTilesFull;
        }
        return allTilesFull - (kDefaultTileSize - lastTileSize);
    }

private:
    // Stores the codec id together with its complement used by IsValid().
    void SetCodecId(uint8_t inId)
    {
        magic = static_cast<uint8_t>(inId ^ 0xff);
        id = inId;
    }

    // Splits `size` into whole tiles plus a remainder; a non-zero remainder
    // occupies one additional (partial) tile.
    void SetUncompressedSize(size_t size)
    {
        const size_t wholeTiles = size / kDefaultTileSize;
        numTiles = static_cast<uint16_t>(wholeTiles);
        lastTileSize = static_cast<uint32_t>(size - numTiles * kDefaultTileSize);
        if (lastTileSize != 0)
        {
            ++numTiles;
        }
    }
};
#pragma pack(pop)
static_assert(sizeof(TileStream) == 8, "Tile stream header size overrun!");
} // namespace GDeflate

82
lib/gdeflate/Utils.h Normal file
View file

@ -0,0 +1,82 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <stdint.h>
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstring>
#include <limits>
namespace GDeflate
{
// Rounds `a` up to the next multiple of N using a bit mask.
// The mask form only yields a multiple of N when N is a power of two.
template<int N, typename T>
static inline T _align(T a)
{
    const auto lowBits = T(N) - 1;
    return (a + lowBits) & ~lowBits;
}
// Integer division of `a` by `b`, rounded up by biasing the numerator with b - 1.
template<typename T>
static inline T _divRoundup(T a, T b)
{
    const auto biased = a + b - 1;
    return biased / b;
}
// Counts how many times `a` can be shifted right before its lowest bit is set,
// i.e. the number of zero bits below the least significant 1 bit (note: despite
// the name this scans from the low end). Returns the bit width of T when a == 0.
template<typename T>
static inline uint32_t _lzCount(T a)
{
    const uint32_t bitWidth = sizeof(T) * 8;
    uint32_t zeros = 0;
    for (; zeros < bitWidth && (a & 1) == 0; a >>= 1)
    {
        ++zeros;
    }
    return zeros;
}
// Reads `numBitsToRead` bits, least significant first, from a stream of 32-bit words.
// `offset` is the running bit position; it is advanced by the number of bits read,
// and `in` is moved to the next word whenever `offset` lands on a word boundary.
// The bits are returned packed into the low end of a T.
template<typename T>
static inline T GetBits(uint32_t*& in, uint32_t& offset, uint32_t numBitsToRead)
{
    constexpr uint32_t kWordBits = sizeof(*in) * 8;
    T result = 0;
    for (uint32_t produced = 0; produced < numBitsToRead;)
    {
        // Take as many bits as are still wanted, but never cross the current word.
        const uint32_t bitPos = offset % kWordBits;
        const uint32_t stillWanted = numBitsToRead - produced;
        const uint32_t leftInWord = kWordBits - bitPos;
        const uint32_t take = std::min(stillWanted, leftInWord);

        const T mask = std::numeric_limits<T>::max() >> (sizeof(T) * 8 - take);
        const T chunk = T(*in >> bitPos);
        result |= (chunk & mask) << produced;

        offset += take;
        produced += take;
        if (offset % kWordBits == 0)
        {
            ++in;
        }
    }
    return result;
}
} // namespace GDeflate

29
lib/gdeflate/config.h Normal file
View file

@ -0,0 +1,29 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <stddef.h>
#include <stdint.h>
// Constants shared by the GDeflate tile-stream code.
namespace GDeflate
{
// Codec identifier; TileStream's constructor stores it in the header `id` byte
// (and its complement in `magic`).
static constexpr uint8_t kGDeflateId = 4;
// Size in bytes (64 KiB) of one uncompressed tile; tile counts and output offsets
// are computed in units of this value.
static const size_t kDefaultTileSize = 64 * 1024; /*!< Default tile size */
} // namespace GDeflate

View file

@ -0,0 +1,36 @@
Copyright 2016 Eric Biggers
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -0,0 +1,338 @@
/*
* common_defs.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COMMON_COMMON_DEFS_H
#define COMMON_COMMON_DEFS_H
#ifdef __GNUC__
# include "compiler_gcc.h"
#elif defined(_MSC_VER)
# include "compiler_msc.h"
#else
# pragma message("Unrecognized compiler. Please add a header file for your compiler. Compilation will proceed, but performance may suffer!")
#endif
/* ========================================================================== */
/* Type definitions */
/* ========================================================================== */
#include <stddef.h> /* size_t */
#ifndef __bool_true_false_are_defined
# include <stdbool.h> /* bool */
#endif
/* Fixed-width integer types */
#include <stdint.h>
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
/* Concatenation macros */
#define CONCAT2(x,y) x##y
#define CONCAT(x,y) CONCAT2(x,y)
/*
* Word type of the target architecture. Use 'size_t' instead of 'unsigned
* long' to account for platforms such as Windows that use 32-bit 'unsigned
* long' on 64-bit architectures.
*/
typedef size_t machine_word_t;
/* Number of bytes in a word */
#define WORDBYTES ((int)sizeof(machine_word_t))
/* Number of bits in a word */
#define WORDBITS (8 * WORDBYTES)
/* ========================================================================== */
/* Optional compiler features */
/* ========================================================================== */
/* LIBEXPORT - export a function from a shared library */
#ifndef LIBEXPORT
# define LIBEXPORT
#endif
/* inline - suggest that a function be inlined */
#ifndef inline
# define inline
#endif
/* forceinline - force a function to be inlined, if possible */
#ifndef forceinline
# define forceinline inline
#endif
/* restrict - annotate a non-aliased pointer */
#ifndef restrict
# define restrict
#endif
/* likely(expr) - hint that an expression is usually true */
#ifndef likely
# define likely(expr) (expr)
#endif
/* unlikely(expr) - hint that an expression is usually false */
#ifndef unlikely
# define unlikely(expr) (expr)
#endif
/* prefetchr(addr) - prefetch into L1 cache for read */
#ifndef prefetchr
# define prefetchr(addr)
#endif
/* prefetchw(addr) - prefetch into L1 cache for write */
#ifndef prefetchw
# define prefetchw(addr)
#endif
/* Does the compiler support the 'target' function attribute? */
#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
#endif
/* Which targets are supported with the 'target' function attribute? */
#ifndef COMPILER_SUPPORTS_BMI2_TARGET
# define COMPILER_SUPPORTS_BMI2_TARGET 0
#endif
#ifndef COMPILER_SUPPORTS_AVX_TARGET
# define COMPILER_SUPPORTS_AVX_TARGET 0
#endif
#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET
# define COMPILER_SUPPORTS_AVX512BW_TARGET 0
#endif
/*
* Which targets are supported with the 'target' function attribute and have
* intrinsics that work within 'target'-ed functions?
*/
#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 0
#endif
/* _aligned_attribute(n) - declare that the annotated variable, or variables of
* the annotated type, are to be aligned on n-byte boundaries */
#ifndef _aligned_attribute
#endif
/* ========================================================================== */
/* Miscellaneous macros */
/* ========================================================================== */
/* Number of elements in array A (size of the array divided by size of one element). */
#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0]))
/* Smaller / larger of two values. Each argument may be evaluated twice, so avoid
 * arguments with side effects. */
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* n divided by d, rounded up (numerator is biased by d - 1 before dividing). */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
/* Compile-time check: the array size becomes negative, and the build fails, when
 * expr is false. */
#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
/* Round n up to a multiple of a using a mask; the mask form assumes a is a power
 * of two. */
#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
/* ========================================================================== */
/* Endianness handling */
/* ========================================================================== */
/*
* CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little
* endian or 0 if it is big endian. The macro should be defined in a way such
* that the compiler can evaluate it at compilation time. If not defined, a
* fallback is used.
*/
#ifndef CPU_IS_LITTLE_ENDIAN
/* Fallback probe: store 1 in an unsigned int and report its first byte in memory,
 * which is 1 on a little endian CPU and 0 on a big endian one. */
static forceinline int CPU_IS_LITTLE_ENDIAN(void)
{
    const unsigned int probe = 1;
    return *(const unsigned char *)&probe;
}
#endif
/* bswap16(n) - swap the bytes of a 16-bit integer */
#ifndef bswap16
/* Portable fallback: exchange the two bytes of n. */
static forceinline u16 bswap16(u16 n)
{
    const u16 low_to_high = n << 8;
    const u16 high_to_low = n >> 8;
    return low_to_high | high_to_low;
}
#endif
/* bswap32(n) - swap the bytes of a 32-bit integer */
#ifndef bswap32
/* Portable fallback: pull out the four bytes of n and reassemble them in
 * reverse order. */
static forceinline u32 bswap32(u32 n)
{
    const u32 byte0 = n & 0xFF;
    const u32 byte1 = (n >> 8) & 0xFF;
    const u32 byte2 = (n >> 16) & 0xFF;
    const u32 byte3 = n >> 24;

    return (byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3;
}
#endif
/* bswap64(n) - swap the bytes of a 64-bit integer */
#ifndef bswap64
/* Portable fallback: peel bytes off the low end of n and push them onto the
 * low end of the result, so the first byte taken ends up most significant. */
static forceinline u64 bswap64(u64 n)
{
    u64 swapped = 0;
    int i;

    for (i = 0; i < 8; i++) {
        swapped = (swapped << 8) | (n & 0xFF);
        n >>= 8;
    }
    return swapped;
}
#endif
#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n))
#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n))
#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n))
#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n))
#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n))
#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n))
/* ========================================================================== */
/* Unaligned memory accesses */
/* ========================================================================== */
/*
* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
* can be performed efficiently on the target platform.
*/
#ifndef UNALIGNED_ACCESS_IS_FAST
# define UNALIGNED_ACCESS_IS_FAST 0
#endif
/* ========================================================================== */
/* Bit scan functions */
/* ========================================================================== */
/*
* Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
* significant end) of the *most* significant 1 bit in the input value. The
* input value must be nonzero!
*/
#ifndef bsr32
/* Portable fallback: index of the most significant 1 bit, found by counting
 * how many right shifts it takes to reduce n to zero. n must be nonzero. */
static forceinline unsigned
bsr32(u32 n)
{
    unsigned msb = 0;

    for (n >>= 1; n != 0; n >>= 1)
        msb++;
    return msb;
}
#endif
#ifndef bsr64
/* Portable fallback: 64-bit variant of bsr32 - index of the most significant
 * 1 bit. n must be nonzero. */
static forceinline unsigned
bsr64(u64 n)
{
    unsigned msb = 0;

    for (n >>= 1; n != 0; n >>= 1)
        msb++;
    return msb;
}
#endif
/* Bit scan reverse on a machine word: dispatches to the 32- or 64-bit variant
 * according to WORDBITS. */
static forceinline unsigned
bsrw(machine_word_t n)
{
    STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
    return (WORDBITS == 32) ? bsr32(n) : bsr64(n);
}
/*
* Bit Scan Forward (BSF) - find the 0-based index (relative to the least
* significant end) of the *least* significant 1 bit in the input value. The
* input value must be nonzero!
*/
#ifndef bsf32
/* Portable fallback: index of the least significant 1 bit, found by shifting
 * right until bit 0 is set. n must be nonzero or the loop never terminates. */
static forceinline unsigned
bsf32(u32 n)
{
    unsigned lsb = 0;

    for (; (n & 1) == 0; n >>= 1)
        lsb++;
    return lsb;
}
#endif
#ifndef bsf64
/* Portable fallback: 64-bit variant of bsf32 - index of the least significant
 * 1 bit. n must be nonzero or the loop never terminates. */
static forceinline unsigned
bsf64(u64 n)
{
    unsigned lsb = 0;

    for (; (n & 1) == 0; n >>= 1)
        lsb++;
    return lsb;
}
#endif
/* Bit scan forward on a machine word: dispatches to the 32- or 64-bit variant
 * according to WORDBITS. */
static forceinline unsigned
bsfw(machine_word_t n)
{
    STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
    return (WORDBITS == 32) ? bsf32(n) : bsf64(n);
}
#endif /* COMMON_COMMON_DEFS_H */

View file

@ -0,0 +1,217 @@
/*
* compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang
* and the Intel C Compiler (icc).
*
* TODO: icc is not well tested, so some things are currently disabled even
* though they maybe can be enabled on some icc versions.
*/
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
# define GCC_PREREQ(major, minor) \
(__GNUC__ > (major) || \
(__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define GCC_PREREQ(major, minor) 0
#endif
/* Note: only check the clang version when absolutely necessary!
* "Vendors" such as Apple can use different version numbers. */
#ifdef __clang__
# ifdef __apple_build_version__
# define CLANG_PREREQ(major, minor, apple_version) \
(__apple_build_version__ >= (apple_version))
# else
# define CLANG_PREREQ(major, minor, apple_version) \
(__clang_major__ > (major) || \
(__clang_major__ == (major) && __clang_minor__ >= (minor)))
# endif
#else
# define CLANG_PREREQ(major, minor, apple_version) 0
#endif
#ifndef __has_attribute
# define __has_attribute(attribute) 0
#endif
#ifndef __has_feature
# define __has_feature(feature) 0
#endif
#ifndef __has_builtin
# define __has_builtin(builtin) 0
#endif
#ifdef _WIN32
# define LIBEXPORT __declspec(dllexport)
#else
# define LIBEXPORT __attribute__((visibility("default")))
#endif
#define inline inline
#define forceinline inline __attribute__((always_inline))
#define restrict __restrict__
#define likely(expr) __builtin_expect(!!(expr), 1)
#define unlikely(expr) __builtin_expect(!!(expr), 0)
#define prefetchr(addr) __builtin_prefetch((addr), 0)
#define prefetchw(addr) __builtin_prefetch((addr), 1)
#define _aligned_attribute(n) __attribute__((aligned(n)))
#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \
(GCC_PREREQ(4, 4) || __has_attribute(target))
#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# if defined(__i386__) || defined(__x86_64__)
# define COMPILER_SUPPORTS_PCLMUL_TARGET \
(GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
# define COMPILER_SUPPORTS_AVX_TARGET \
(GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
# define COMPILER_SUPPORTS_BMI2_TARGET \
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
# define COMPILER_SUPPORTS_AVX2_TARGET \
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))
# define COMPILER_SUPPORTS_AVX512BW_TARGET \
(GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512))
/*
* Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics
* not available in the main target could not be used in 'target'
* attribute functions. Unfortunately clang has no feature test macro
* for this so we have to check its version.
*/
# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \
COMPILER_SUPPORTS_PCLMUL_TARGET
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \
COMPILER_SUPPORTS_AVX2_TARGET
# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS \
COMPILER_SUPPORTS_AVX512BW_TARGET
# endif
# elif defined(__arm__) || defined(__aarch64__)
/*
* Determine whether NEON and crypto intrinsics are supported.
*
* With gcc prior to 6.1, (r230411 for arm32, r226563 for arm64), neither
* was available unless enabled in the main target.
*
* But even after that, to include <arm_neon.h> (which contains both the
* basic NEON intrinsics and the crypto intrinsics) the main target still
* needs to have:
* - gcc: hardware floating point support
* - clang: NEON support (but not necessarily crypto support)
*/
# if (GCC_PREREQ(6, 1) && defined(__ARM_FP)) || \
(defined(__clang__) && defined(__ARM_NEON))
# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 1
/*
* The crypto intrinsics are broken on arm32 with clang, even when using
* -mfpu=crypto-neon-fp-armv8, because clang's <arm_neon.h> puts them
* behind __aarch64__. Undefine __ARM_FEATURE_CRYPTO in that case...
*/
# if defined(__clang__) && defined(__arm__)
# undef __ARM_FEATURE_CRYPTO
# elif __has_builtin(__builtin_neon_vmull_p64) || !defined(__clang__)
# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 1
# endif
# endif
/*
* Determine whether CRC32 intrinsics are supported.
*
* With gcc r274827 or later (gcc 10.1+, 9.3+, or 8.4+), or with clang,
* they work as expected. (Well, not quite. There's still a bug, but we
* have to work around it later when including arm_acle.h.)
*/
# if GCC_PREREQ(10, 1) || \
(GCC_PREREQ(9, 3) && !GCC_PREREQ(10, 0)) || \
(GCC_PREREQ(8, 4) && !GCC_PREREQ(9, 0)) || \
(defined(__clang__) && __has_builtin(__builtin_arm_crc32b))
# define COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS 1
# endif
# endif /* __arm__ || __aarch64__ */
#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */
/*
* Prior to gcc 5.1 and clang 3.9, emmintrin.h only defined vectors of signed
* integers (e.g. __v4si), not vectors of unsigned integers (e.g. __v4su). But
* we need the unsigned ones in order to avoid signed integer overflow, which is
* undefined behavior. Add the missing definitions for the unsigned ones if
* needed.
*/
#if (GCC_PREREQ(4, 0) && !GCC_PREREQ(5, 1)) || \
(defined(__clang__) && !CLANG_PREREQ(3, 9, 8020000)) || \
defined(__INTEL_COMPILER)
/* 16-byte vectors of unsigned long long, int, short and char elements */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
/* 32-byte vectors of unsigned long long, int, short and char elements */
typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
typedef unsigned int __v8su __attribute__((__vector_size__(32)));
typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
typedef unsigned char __v32qu __attribute__((__vector_size__(32)));
#endif
#ifdef __INTEL_COMPILER
/* 64-byte vectors of int, short and char; declared for the Intel compiler only */
typedef int __v16si __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef char __v64qi __attribute__((__vector_size__(64)));
#endif
/*
 * Newer gcc supports __BYTE_ORDER__. Older gcc doesn't.  When it is missing,
 * CPU_IS_LITTLE_ENDIAN() is simply left undefined by this block.
 */
#ifdef __BYTE_ORDER__
# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#endif
/*
 * Map bswap16/32/64 to the compiler builtins when the compiler version or
 * __has_builtin() says they exist.  Otherwise the name is left undefined here
 * (presumably a generic fallback is supplied elsewhere -- confirm).
 */
#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
# define bswap16 __builtin_bswap16
#endif
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
# define bswap32 __builtin_bswap32
#endif
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
# define bswap64 __builtin_bswap64
#endif
/*
 * Declare unaligned memory access to be fast on x86 (32- and 64-bit), on ARM
 * targets that define __ARM_FEATURE_UNALIGNED, on 64-bit PowerPC and on
 * WebAssembly.  On every other target the macro is left undefined here.
 */
#if defined(__x86_64__) || defined(__i386__) || \
defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
/*
* For all compilation purposes, WebAssembly behaves like any other CPU
* instruction set. Even though WebAssembly engine might be running on top
* of different actual CPU architectures, the WebAssembly spec itself
* permits unaligned access and it will be fast on most of those platforms,
* and simulated at the engine level on others, so it's worth treating it
* as a CPU architecture with fast unaligned access.
*/ defined(__wasm__)
# define UNALIGNED_ACCESS_IS_FAST 1
#endif
/*
 * Bit scan helpers built on the count-leading/trailing-zeros builtins.
 * bsrN(n) yields the index of the most significant set bit and bsfN(n) the
 * index of the least significant set bit.  The builtins are undefined for a
 * zero argument, so 'n' must be nonzero.  The 32-bit forms assume that
 * 'unsigned int' is 32 bits wide and the 64-bit forms that
 * 'unsigned long long' is 64 bits wide.
 */
#define bsr32(n) (31 - __builtin_clz(n))
#define bsr64(n) (63 - __builtin_clzll(n))
#define bsf32(n) __builtin_ctz(n)
#define bsf64(n) __builtin_ctzll(n)
/*
 * Set up rotate-right macros named after the MSVC intrinsics.
 * These should be recognized by compilers and turned into rotate instructions.
 *
 * Every use of an argument is parenthesized so that expression arguments
 * (e.g. "a | b" or "k + 1") expand with the intended precedence.
 *
 * 'x' must be an unsigned value of the stated width (16, 32 or 64 bits) and
 * 'n' must satisfy 0 < n < width.  Both arguments are evaluated twice, so
 * they must be free of side effects.  _rotr16() is computed in the promoted
 * type; callers that need a 16-bit result must truncate it themselves.
 */
#ifndef _rotr16
#define _rotr16(x,n) (((x)>>(n)) + ((x)<<(16-(n))))
#endif
#ifndef _rotr
#define _rotr(x,n) (((x)>>(n)) + ((x)<<(32-(n))))
#endif
#ifndef _rotr64
#define _rotr64(x,n) (((x)>>(n)) + ((x)<<(64-(n))))
#endif

View file

@ -0,0 +1,80 @@
/*
* compiler_msc.h - definitions for the Microsoft C Compiler
*/
#include <stdint.h>
#include <stdlib.h> /* for _byteswap_*() */
/* Mark exported library symbols with the MSVC DLL-export attribute */
#define LIBEXPORT __declspec(dllexport)
/*
* Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
* Beware: the below replacement isn't fully standard, since normally any value
* != 0 should be implicitly cast to a bool with value 1... but that doesn't
* happen if bool is really just an 'int'.
*/
typedef int bool;
#define true 1
#define false 0
#define __bool_true_false_are_defined 1
/* Define ssize_t */
#ifdef _WIN64
typedef long long ssize_t;
#else
typedef int ssize_t;
#endif
/* Assume a little endian architecture with fast unaligned access */
#define CPU_IS_LITTLE_ENDIAN() 1
#define UNALIGNED_ACCESS_IS_FAST 1
/* __restrict has nonstandard behavior; don't use it */
/* ('restrict' therefore expands to nothing under this compiler) */
#define restrict
/* ... but we can use __inline and __forceinline */
#define inline __inline
#define forceinline __forceinline
/* Byte swap functions */
/* (mapped to the _byteswap_*() routines declared in <stdlib.h>) */
#define bswap16 _byteswap_ushort
#define bswap32 _byteswap_ulong
#define bswap64 _byteswap_uint64
/* Bit scan functions (32-bit) */
/*
 * Bit Scan Reverse: return the index (0..31) of the most significant set bit
 * of 'n'.  'n' must be nonzero; the intrinsic does not produce a meaningful
 * index for a zero input.
 */
static forceinline unsigned
bsr32(uint32_t n)
{
	/*
	 * _BitScanReverse() stores the index through an 'unsigned long *'.
	 * Use a variable of exactly that type instead of aliasing the
	 * uint32_t input, which is a distinct type.
	 */
	unsigned long index;

	_BitScanReverse(&index, n);
	return (unsigned)index;
}
#define bsr32 bsr32
/*
 * Bit Scan Forward: return the index (0..31) of the least significant set bit
 * of 'n'.  'n' must be nonzero; the intrinsic does not produce a meaningful
 * index for a zero input.
 */
static forceinline unsigned
bsf32(uint32_t n)
{
	/*
	 * _BitScanForward() stores the index through an 'unsigned long *'.
	 * Use a variable of exactly that type instead of aliasing the
	 * uint32_t input, which is a distinct type.
	 */
	unsigned long index;

	_BitScanForward(&index, n);
	return (unsigned)index;
}
#define bsf32 bsf32
#ifdef _M_X64 /* Bit scan functions (64-bit) */
/*
 * 64-bit Bit Scan Reverse: return the index (0..63) of the most significant
 * set bit of 'n'.  'n' must be nonzero.
 */
static forceinline unsigned
bsr64(uint64_t n)
{
	/*
	 * _BitScanReverse64() stores the index through an 'unsigned long *'.
	 * Passing the address of the 64-bit input would let the intrinsic
	 * overwrite only part of it and relies on truncation in the return
	 * statement, so keep the index in its own correctly typed variable.
	 */
	unsigned long index;

	_BitScanReverse64(&index, n);
	return (unsigned)index;
}
#define bsr64 bsr64
/*
 * 64-bit Bit Scan Forward: return the index (0..63) of the least significant
 * set bit of 'n'.  'n' must be nonzero.
 */
static forceinline unsigned
bsf64(uint64_t n)
{
	/*
	 * _BitScanForward64() stores the index through an 'unsigned long *'.
	 * Passing the address of the 64-bit input would let the intrinsic
	 * overwrite only part of it and relies on truncation in the return
	 * statement, so keep the index in its own correctly typed variable.
	 */
	unsigned long index;

	_BitScanForward64(&index, n);
	return (unsigned)index;
}
#define bsf64 bsf64
#endif /* _M_X64 */

View file

@ -0,0 +1,130 @@
/*
* adler32.c - Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "lib_common.h"
#include "libdeflate.h"
/* The Adler-32 divisor, or "base", value. */
#define DIVISOR 65521
/*
* MAX_CHUNK_SIZE is the most bytes that can be processed without the
* possibility of s2 overflowing when it is represented as an unsigned 32-bit
* integer. This value was computed using the following Python script:
*
*	divisor = 65521
*	count = 0
*	s1 = divisor - 1
*	s2 = divisor - 1
*	while True:
*		s1 += 0xFF
*		s2 += s1
*		if s2 > 0xFFFFFFFF:
*			break
*		count += 1
*	print(count)
*
* Note that to get the correct worst-case value, we must assume that every byte
* has value 0xFF and that s1 and s2 started with the highest possible values
* modulo the divisor.
*/
#define MAX_CHUNK_SIZE 5552
/*
 * Common signature of every Adler-32 implementation:
 * (running checksum, input buffer, input size in bytes) -> updated checksum.
 */
typedef u32 (*adler32_func_t)(u32, const u8 *, size_t);
/* Include architecture-specific implementations if available */
/*
 * The architecture headers may define DEFAULT_IMPL (an implementation that is
 * always usable) and/or DISPATCH (runtime selection is needed); start from a
 * clean state so that the checks further down only see what they defined.
 */
#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__arm__) || defined(__aarch64__)
# include "arm/adler32_impl.h"
#elif defined(__i386__) || defined(__x86_64__)
# include "x86/adler32_impl.h"
#endif
/* Define a generic implementation if needed */
#ifndef DEFAULT_IMPL
#define DEFAULT_IMPL adler32_generic
/*
 * Portable Adler-32 update.
 *
 * Continues the checksum 'adler' over the 'size' bytes at 'p' and returns the
 * updated checksum.  The low 16 bits hold the byte sum and the high 16 bits
 * hold the sum of the running byte sums, both reduced modulo DIVISOR.  Input
 * is consumed in chunks of at most MAX_CHUNK_SIZE bytes so that the 32-bit
 * accumulators cannot overflow between reductions.
 */
static u32 adler32_generic(u32 adler, const u8 *p, size_t size)
{
	const u8 * const stop = p + size;
	u32 lo = adler & 0xFFFF;
	u32 hi = adler >> 16;

	while (p != stop) {
		const size_t chunk_len = MIN(stop - p, MAX_CHUNK_SIZE);
		const u8 * const chunk_stop = p + chunk_len;
		/* End of the part of the chunk handled four bytes at a time */
		const u8 * const unrolled_stop = p + (chunk_len / 4) * 4;

		for (; p != unrolled_stop; p += 4) {
			lo += p[0];
			hi += lo;
			lo += p[1];
			hi += lo;
			lo += p[2];
			hi += lo;
			lo += p[3];
			hi += lo;
		}
		/* Up to three leftover bytes of this chunk */
		for (; p != chunk_stop; p++) {
			lo += *p;
			hi += lo;
		}
		lo %= DIVISOR;
		hi %= DIVISOR;
	}
	return (hi << 16) | lo;
}
#endif /* !DEFAULT_IMPL */
#ifdef DISPATCH
static u32 dispatch(u32, const u8 *, size_t);

/*
 * Implementation currently in use.  It initially points at dispatch(), which
 * replaces it with the selected implementation the first time it is called.
 */
static volatile adler32_func_t adler32_impl = dispatch;

/* Choose the fastest implementation at runtime */
static u32 dispatch(u32 adler, const u8 *buffer, size_t size)
{
	adler32_func_t chosen = arch_select_adler32_func();

	/* NULL means that no architecture-specific variant was selected. */
	adler32_impl = (chosen != NULL) ? chosen : DEFAULT_IMPL;
	return adler32_impl(adler, buffer, size);
}
#else
/* only one implementation, use it */
# define adler32_impl DEFAULT_IMPL
#endif
/*
 * Public entry point: update the Adler-32 checksum 'adler' with 'size' bytes
 * from 'buffer' and return the new checksum.  Passing a NULL buffer returns
 * the initial checksum value, 1.
 */
LIBDEFLATEEXPORT u32 LIBDEFLATEAPI
libdeflate_adler32(u32 adler, const void *buffer, size_t size)
{
	if (buffer != NULL)
		return adler32_impl(adler, buffer, size);

	/* return initial value */
	return 1;
}

View file

@ -0,0 +1,124 @@
/*
* adler32_vec_template.h - template for vectorized Adler-32 implementations
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This file contains a template for vectorized Adler-32 implementations.
*
* The inner loop between reductions modulo 65521 of an unvectorized Adler-32
* implementation looks something like this:
*
* do {
* s1 += *p;
* s2 += s1;
* } while (++p != chunk_end);
*
* For vectorized calculation of s1, we only need to sum the input bytes. They
* can be accumulated into multiple counters which are eventually summed
* together.
*
* For vectorized calculation of s2, the basic idea is that for each iteration
* that processes N bytes, we can perform the following vectorizable
* calculation:
*
* s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
*
* Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
* separate counters, then do the multiplications by N...1 just once at the end
* rather than once per iteration.
*
* Also, we must account for how previous bytes will affect s2 by doing the
* following at beginning of each iteration:
*
* s2 += s1 * N
*
* Furthermore, like s1, "s2" can actually be multiple counters which are
* eventually summed together.
*/
/*
 * Vectorized Adler-32 update, instantiated by the including file through
 * FUNCNAME, FUNCNAME_CHUNK, ATTRIBUTES, IMPL_ALIGNMENT, IMPL_SEGMENT_SIZE and
 * IMPL_MAX_CHUNK_SIZE.
 *
 * Continues the checksum 'adler' over the 'size' bytes at 'p' and returns the
 * updated checksum.  The input is handled in three phases: a scalar head up to
 * the required alignment, whole segments through FUNCNAME_CHUNK(), and a
 * scalar tail.
 */
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const u8 *p, size_t size)
{
	/* Chunk limit honouring both bounds, rounded down to whole segments */
	const size_t chunk_limit = MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE);
	const size_t max_chunk_size =
		chunk_limit - (chunk_limit % IMPL_SEGMENT_SIZE);
	const u8 * const end = p + size;
	const u8 * const head_begin = p;
	const u8 *vec_end;
	u32 lo = adler & 0xFFFF;
	u32 hi = adler >> 16;

	/* Process a byte at a time until the needed alignment is reached */
	while (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
		lo += *p++;
		hi += lo;
	}
	/* Reduce only if the head phase actually consumed something */
	if (p != head_begin) {
		lo %= DIVISOR;
		hi %= DIVISOR;
	}

	/*
	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
	 * limited to MAX_CHUNK_SIZE, which guarantees that the sums never
	 * overflow before being reduced modulo DIVISOR.  For vector processing,
	 * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
	 * may be further limited to IMPL_MAX_CHUNK_SIZE.
	 */
	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
	vec_end = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
	while (p != vec_end) {
		const size_t chunk_size =
			MIN((size_t)(vec_end - p), max_chunk_size);
		const u8 * const chunk_end = p + chunk_size;

		/* Account for how the bytes seen so far affect the second sum */
		hi += lo * chunk_size;
		FUNCNAME_CHUNK((const void *)p, (const void *)chunk_end,
			       &lo, &hi);
		p = chunk_end;
		lo %= DIVISOR;
		hi %= DIVISOR;
	}

	/* Process any remaining bytes */
	if (p != end) {
		while (p != end) {
			lo += *p++;
			hi += lo;
		}
		lo %= DIVISOR;
		hi %= DIVISOR;
	}
	return (hi << 16) | lo;
}
#undef FUNCNAME
#undef FUNCNAME_CHUNK
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE
#undef IMPL_MAX_CHUNK_SIZE

View file

@ -0,0 +1,125 @@
/*
* arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_ARM_ADLER32_IMPL_H
#define LIB_ARM_ADLER32_IMPL_H
#include "cpu_features.h"
/* NEON implementation */
#undef DISPATCH_NEON
/*
 * Build the NEON implementation when no default implementation has been chosen
 * yet and either NEON is enabled for the whole build (__ARM_NEON) or runtime
 * CPU feature detection is available together with compiler support for
 * per-function NEON code generation.
 */
#if !defined(DEFAULT_IMPL) && \
(defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS))
/* Parameters consumed by adler32_vec_template.h */
# define FUNCNAME adler32_neon
# define FUNCNAME_CHUNK adler32_neon_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_SIZE 32
/* Prevent unsigned overflow of the 16-bit precision byte counters */
# define IMPL_MAX_CHUNK_SIZE (32 * (0xFFFF / 0xFF))
# ifdef __ARM_NEON
/* NEON is always available: no attribute needed, use it unconditionally */
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_neon
# else
/* NEON is enabled for this function only and selected at runtime */
# ifdef __arm__
# define ATTRIBUTES __attribute__((target("fpu=neon")))
# else
# define ATTRIBUTES __attribute__((target("+simd")))
# endif
# define DISPATCH 1
# define DISPATCH_NEON 1
# endif
# include <arm_neon.h>
/*
 * Accumulate the Adler-32 sums over the vectors in [p, end) and add the
 * results to '*s1' and '*s2'.
 *
 * Each loop iteration consumes two 16-byte vectors, i.e. one 32-byte segment.
 * The loop is a do-while, so the range must contain at least one segment and
 * an even number of vectors; otherwise 'p' would step past 'end'.
 * The caller is expected to have already added s1 * (number of bytes) to
 * '*s2' (presumably as the including template does -- confirm), since only
 * the contribution of the bytes in this range is computed here.
 */
static forceinline ATTRIBUTES void
adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
u32 *s1, u32 *s2)
{
/* v_s1: partial byte sums; v_s2: sum of v_s1 as it was before each segment */
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
/* Per-position byte sums for the 32 byte positions within a segment */
uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
do {
const uint8x16_t bytes1 = *p++;
const uint8x16_t bytes2 = *p++;
uint16x8_t tmp;
/* Record the byte sum as it stood before this segment */
v_s2 += v_s1;
/* Vector Pairwise Add Long (u8 => u16) */
tmp = vpaddlq_u8(bytes1);
/* Vector Pairwise Add and Accumulate Long (u8 => u16) */
tmp = vpadalq_u8(tmp, bytes2);
/* Vector Pairwise Add and Accumulate Long (u16 => u32) */
v_s1 = vpadalq_u16(v_s1, tmp);
/* Vector Add Wide (u8 => u16) */
v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
} while (p != end);
/* Vector Shift Left (u32) */
/* Shifting by 5 multiplies by 32, the number of bytes per segment */
v_s2 = vqshlq_n_u32(v_s2, 5);
/* Vector Multiply Accumulate Long (u16 => u32) */
/* Weight each byte position by its distance from the segment end: 32 down to 1 */
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
/* Horizontal add of the four lanes into the scalar accumulators */
*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
# include "../adler32_vec_template.h"
#endif /* NEON implementation */
#ifdef DISPATCH
/*
 * Return the Adler-32 implementation that matches the features of the CPU the
 * program is running on, or NULL when none of the runtime-dispatched variants
 * applies.
 */
static inline adler32_func_t
arch_select_adler32_func(void)
{
	const u32 cpu_caps = get_cpu_features();

#ifdef DISPATCH_NEON
	if ((cpu_caps & ARM_CPU_FEATURE_NEON) != 0)
		return adler32_neon;
#endif
	return NULL;
}
#endif /* DISPATCH */
#endif /* LIB_ARM_ADLER32_IMPL_H */

View file

@ -0,0 +1,133 @@
/*
* arm/cpu_features.c - feature detection for ARM processors
*
* Copyright 2018 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* ARM processors don't have a standard way for unprivileged programs to detect
* processor features. But, on Linux we can read the AT_HWCAP and AT_HWCAP2
* values from /proc/self/auxv.
*
* Ideally we'd use the C library function getauxval(), but it's not guaranteed
* to be available: it was only added to glibc in 2.16, and in Android it was
* added to API level 18 for ARM and level 21 for AArch64.
*/
#include "../cpu_features_common.h" /* must be included first */
#include "cpu_features.h"
#if ARM_CPU_FEATURES_ENABLED
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
/* auxv entry types whose values are the hardware capability bitmasks */
#define AT_HWCAP 16
#define AT_HWCAP2 26
/*
 * Cached feature mask.  Zero means that detection has not run yet;
 * setup_cpu_features() always stores a nonzero value.
 */
volatile u32 _cpu_features = 0;
/*
 * Read /proc/self/auxv and report the AT_HWCAP and AT_HWCAP2 values.
 *
 * The file is consumed as a sequence of (type, value) pairs of unsigned longs.
 * '*hwcap' and '*hwcap2' are written only when the matching entry is seen, so
 * the caller must initialize them beforehand.  If the file cannot be opened,
 * nothing is written.
 */
static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
{
int fd;
unsigned long auxbuf[32];
int filled = 0; /* number of valid bytes currently held in auxbuf */
int i;
fd = open("/proc/self/auxv", O_RDONLY);
if (fd < 0)
return;
for (;;) {
/* Read until at least one complete (type, value) pair is buffered */
do {
int ret = read(fd, &((char *)auxbuf)[filled],
sizeof(auxbuf) - filled);
if (ret <= 0) {
/* Retry a read interrupted by a signal; stop on EOF or any other error */
if (ret < 0 && errno == EINTR)
continue;
goto out;
}
filled += ret;
} while (filled < 2 * sizeof(long));
/* Consume every complete pair that is currently buffered */
i = 0;
do {
unsigned long type = auxbuf[i];
unsigned long value = auxbuf[i + 1];
if (type == AT_HWCAP)
*hwcap = value;
else if (type == AT_HWCAP2)
*hwcap2 = value;
i += 2;
filled -= 2 * sizeof(long);
} while (filled >= 2 * sizeof(long));
/* Move the bytes of a partially read pair to the front of the buffer */
memmove(auxbuf, &auxbuf[i], filled);
}
out:
close(fd);
}
/*
 * Feature bits paired with a name for each.  The table is handed to
 * disable_cpu_features_for_testing() by setup_cpu_features(); presumably the
 * names are how a feature is referred to when it is disabled for testing
 * (confirm against cpu_features_common.h).
 */
static const struct cpu_feature arm_cpu_feature_table[] = {
{ARM_CPU_FEATURE_NEON, "neon"},
{ARM_CPU_FEATURE_PMULL, "pmull"},
{ARM_CPU_FEATURE_CRC32, "crc32"},
};
void setup_cpu_features(void)
{
u32 features = 0;
unsigned long hwcap = 0;
unsigned long hwcap2 = 0;
scan_auxv(&hwcap, &hwcap2);
#ifdef __arm__
STATIC_ASSERT(sizeof(long) == 4);
if (hwcap & (1 << 12)) /* HWCAP_NEON */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */
features |= ARM_CPU_FEATURE_CRC32;
#else
STATIC_ASSERT(sizeof(long) == 8);
if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap & (1 << 4)) /* HWCAP_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
if (hwcap & (1 << 7)) /* HWCAP_CRC32 */
features |= ARM_CPU_FEATURE_CRC32;
#endif
disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
ARRAY_LEN(arm_cpu_feature_table));
_cpu_features = features | ARM_CPU_FEATURES_KNOWN;
}
#endif /* ARM_CPU_FEATURES_ENABLED */

View file

@ -0,0 +1,40 @@
/*
* arm/cpu_features.h - feature detection for ARM processors
*/
#ifndef LIB_ARM_CPU_FEATURES_H
#define LIB_ARM_CPU_FEATURES_H
#include "../lib_common.h"
/*
 * Runtime CPU feature detection is compiled in only for ARM/AArch64 Linux
 * builds whose compiler supports the 'target' function attribute and that are
 * not built as FREESTANDING.  ARM_CPU_FEATURES_ENABLED is always defined, as
 * either 1 or 0, so it can be tested with plain #if.
 */
#if (defined(__arm__) || defined(__aarch64__)) && \
defined(__linux__) && \
COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \
!defined(FREESTANDING)
# define ARM_CPU_FEATURES_ENABLED 1
#else
# define ARM_CPU_FEATURES_ENABLED 0
#endif
#if ARM_CPU_FEATURES_ENABLED
/* Bits of the feature mask returned by get_cpu_features() */
#define ARM_CPU_FEATURE_NEON 0x00000001
#define ARM_CPU_FEATURE_PMULL 0x00000002
#define ARM_CPU_FEATURE_CRC32 0x00000004
/* Set once detection has run, so a completed detection is never stored as 0 */
#define ARM_CPU_FEATURES_KNOWN 0x80000000
/* Cached feature mask; 0 until setup_cpu_features() has filled it in */
extern volatile u32 _cpu_features;
/* Detect the CPU features and store them in _cpu_features */
void setup_cpu_features(void);
/*
 * Return the cached CPU feature mask, running the detection first if the
 * cache is still zero.
 */
static inline u32 get_cpu_features(void)
{
	if (_cpu_features != 0)
		return _cpu_features;

	setup_cpu_features();
	return _cpu_features;
}
#endif /* ARM_CPU_FEATURES_ENABLED */
#endif /* LIB_ARM_CPU_FEATURES_H */

View file

@ -0,0 +1,247 @@
/*
* arm/crc32_impl.h
*
* Copyright 2017 Jun He <jun.he@linaro.org>
* Copyright 2018 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_ARM_CRC32_IMPL_H
#define LIB_ARM_CRC32_IMPL_H
#include "cpu_features.h"
/* Implementation using ARM CRC32 instructions */
#undef DISPATCH_ARM
/*
 * Build crc32_arm() when no default implementation has been chosen yet and
 * either the CRC32 extension is enabled for the whole build or runtime CPU
 * feature detection is available together with compiler support for
 * per-function CRC32 code generation.
 */
#if !defined(DEFAULT_IMPL) && \
(defined(__ARM_FEATURE_CRC32) || \
(ARM_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_CRC32_TARGET_INTRINSICS))
# ifdef __ARM_FEATURE_CRC32
/* CRC32 is always available: no attribute needed, use it unconditionally */
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_arm
# else
/* CRC32 is enabled for this function only; the spelling differs per compiler and target */
# ifdef __arm__
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("armv8-a,crc")))
# else
# define ATTRIBUTES __attribute__((target("arch=armv8-a+crc")))
# endif
# else
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("crc")))
# else
# define ATTRIBUTES __attribute__((target("+crc")))
# endif
# endif
# define DISPATCH 1
# define DISPATCH_ARM 1
# endif
/*
* gcc's (as of 10.1) version of arm_acle.h for arm32, and clang's (as of
* 10.0.1) version of arm_acle.h for both arm32 and arm64, have a bug where they
* only define the CRC32 functions like __crc32b() when __ARM_FEATURE_CRC32 is
* defined. That prevents them from being used via __attribute__((target)) when
* the main target doesn't have CRC32 support enabled. The actual built-ins
* like __builtin_arm_crc32b() are available and work, however; it's just the
* wrappers in arm_acle.h like __crc32b() that erroneously don't get defined.
* Work around this by manually defining __ARM_FEATURE_CRC32.
*/
#ifndef __ARM_FEATURE_CRC32
# define __ARM_FEATURE_CRC32 1
#endif
#include <arm_acle.h>
/*
 * CRC-32 update using the ARM CRC32 instructions.
 *
 * Continues 'remainder' over the 'size' bytes at 'p' and returns the new
 * remainder.  Bytes are processed one at a time until 'p' is 8-byte aligned,
 * then as 64-bit words, then one at a time again for the tail.
 */
static u32 ATTRIBUTES
crc32_arm(u32 remainder, const u8 *p, size_t size)
{
	/* Head: single bytes until 'p' is 8-byte aligned or the input ends */
	for (; size != 0 && ((uintptr_t)p & 7) != 0; size--)
		remainder = __crc32b(remainder, *p++);

	/* Main loop: four 64-bit words (32 bytes) per iteration */
	for (; size >= 32; p += 32, size -= 32) {
		const u64 * const words = (const u64 *)p;

		remainder = __crc32d(remainder, le64_bswap(words[0]));
		remainder = __crc32d(remainder, le64_bswap(words[1]));
		remainder = __crc32d(remainder, le64_bswap(words[2]));
		remainder = __crc32d(remainder, le64_bswap(words[3]));
	}

	/* Remaining whole 64-bit words */
	for (; size >= 8; p += 8, size -= 8)
		remainder = __crc32d(remainder, le64_bswap(*(const u64 *)p));

	/* Tail: up to seven single bytes */
	for (; size != 0; size--)
		remainder = __crc32b(remainder, *p++);

	return remainder;
}
#undef ATTRIBUTES
#endif /* Implementation using ARM CRC32 instructions */
/*
* CRC-32 folding with ARM Crypto extension-PMULL
*
* This works the same way as the x86 PCLMUL version.
* See x86/crc32_pclmul_template.h for an explanation.
*/
#undef DISPATCH_PMULL
/*
 * Build the PMULL implementation when no default implementation has been
 * chosen yet, the target is little endian, and either the crypto extension is
 * enabled for the whole build or runtime CPU feature detection is available
 * together with compiler support for per-function PMULL code generation.
 */
#if !defined(DEFAULT_IMPL) && \
(defined(__ARM_FEATURE_CRYPTO) || \
(ARM_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \
/* not yet tested on big endian, probably needs changes to work there */ \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
/* Parameters consumed by crc32_vec_template.h */
# define FUNCNAME crc32_pmull
# define FUNCNAME_ALIGNED crc32_pmull_aligned
# ifdef __ARM_FEATURE_CRYPTO
/* PMULL is always available: no attribute needed, use it unconditionally */
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_pmull
# else
/* PMULL is enabled for these functions only and selected at runtime */
# ifdef __arm__
# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8")))
# else
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("crypto")))
# else
# define ATTRIBUTES __attribute__((target("+crypto")))
# endif
# endif
# define DISPATCH 1
# define DISPATCH_PMULL 1
# endif
#include <arm_neon.h>
/* Polynomial (carryless) multiply of the low 64 bits of 'a' and of 'b' */
static forceinline ATTRIBUTES uint8x16_t
clmul_00(uint8x16_t a, uint8x16_t b)
{
	const poly64_t lhs = (poly64_t)vget_low_u8(a);
	const poly64_t rhs = (poly64_t)vget_low_u8(b);

	return (uint8x16_t)vmull_p64(lhs, rhs);
}
/* Polynomial (carryless) multiply of the low 64 bits of 'a' by the high 64 bits of 'b' */
static forceinline ATTRIBUTES uint8x16_t
clmul_10(uint8x16_t a, uint8x16_t b)
{
	const poly64_t lhs = (poly64_t)vget_low_u8(a);
	const poly64_t rhs = (poly64_t)vget_high_u8(b);

	return (uint8x16_t)vmull_p64(lhs, rhs);
}
/* Polynomial (carryless) multiply of the high 64 bits of 'a' and of 'b' */
static forceinline ATTRIBUTES uint8x16_t
clmul_11(uint8x16_t a, uint8x16_t b)
{
	const poly64x2_t lhs = (poly64x2_t)a;
	const poly64x2_t rhs = (poly64x2_t)b;

	return (uint8x16_t)vmull_high_p64(lhs, rhs);
}
/*
 * Fold the 128-bit value 'src' into 'dst': each 64-bit half of 'src' is
 * carryless-multiplied by the matching half of 'multipliers', and both
 * products are XORed into 'dst'.
 */
static forceinline ATTRIBUTES uint8x16_t
fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers)
{
	const uint8x16_t low_product = clmul_00(src, multipliers);
	const uint8x16_t high_product = clmul_11(src, multipliers);

	return dst ^ low_product ^ high_product;
}
/*
 * CRC-32 update over 'nr_segs' 16-byte segments starting at 'p', using
 * carryless multiplication to fold the input down to 128 bits and then
 * reducing that to the 32-bit remainder, which is returned.
 *
 * The first segment is read unconditionally, so 'nr_segs' must be at least 1.
 * NOTE(review): the name suggests 'p' must be 16-byte aligned -- confirm
 * against crc32_vec_template.h.
 */
static forceinline ATTRIBUTES u32
crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs)
{
/* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
const uint8x16_t multipliers_4 =
(uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 };
const uint8x16_t multipliers_1 =
(uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E };
const uint8x16_t final_multiplier =
(uint8x16_t)(uint64x2_t){ 0xB8BC6765 };
/* Only the first 32-bit lane is all-ones; the other lanes are zero */
const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF };
const uint8x16_t barrett_reduction_constants =
(uint8x16_t)(uint64x2_t){ 0x00000001F7011641,
0x00000001DB710641 };
const uint8x16_t zeroes = (uint8x16_t){ 0 };
const uint8x16_t * const end = p + nr_segs;
/* End of the part of the input made up of whole groups of four segments */
const uint8x16_t * const end512 = p + (nr_segs & ~3);
uint8x16_t x0, x1, x2, x3;
/* XOR the incoming remainder into the first 32 bits of the first segment */
x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder };
if (nr_segs >= 4) {
x1 = *p++;
x2 = *p++;
x3 = *p++;
/* Fold 512 bits at a time */
while (p != end512) {
x0 = fold_128b(*p++, x0, multipliers_4);
x1 = fold_128b(*p++, x1, multipliers_4);
x2 = fold_128b(*p++, x2, multipliers_4);
x3 = fold_128b(*p++, x3, multipliers_4);
}
/* Fold 512 bits => 128 bits */
x1 = fold_128b(x1, x0, multipliers_1);
x2 = fold_128b(x2, x1, multipliers_1);
x0 = fold_128b(x3, x2, multipliers_1);
}
/* Fold 128 bits at a time */
/* (handles the segments left over after the groups of four, if any) */
while (p != end)
x0 = fold_128b(*p++, x0, multipliers_1);
/* Fold 128 => 96 bits, implicitly appending 32 zeroes */
x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1);
/* Fold 96 => 64 bits */
x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier);
/* Reduce 64 => 32 bits using Barrett reduction */
x1 = x0;
x0 = clmul_00(x0 & mask32, barrett_reduction_constants);
x0 = clmul_10(x0 & mask32, barrett_reduction_constants);
/* The remainder is the second 32-bit lane of the XORed result */
return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1);
}
/*
 * Remaining parameters for crc32_vec_template.h: both the alignment and the
 * segment size are 16 bytes, the size of one uint8x16_t.
 */
#define IMPL_ALIGNMENT 16
#define IMPL_SEGMENT_SIZE 16
#include "../crc32_vec_template.h"
#endif /* PMULL implementation */
#ifdef DISPATCH
/*
 * Return the CRC-32 implementation that matches the features of the CPU the
 * program is running on, or NULL when none of the runtime-dispatched variants
 * applies.  The CRC32-instruction variant is checked before the PMULL one.
 */
static inline crc32_func_t
arch_select_crc32_func(void)
{
	const u32 cpu_caps = get_cpu_features();

#ifdef DISPATCH_ARM
	if ((cpu_caps & ARM_CPU_FEATURE_CRC32) != 0)
		return crc32_arm;
#endif
#ifdef DISPATCH_PMULL
	if ((cpu_caps & ARM_CPU_FEATURE_PMULL) != 0)
		return crc32_pmull;
#endif
	return NULL;
}
#endif /* DISPATCH */
#endif /* LIB_ARM_CRC32_IMPL_H */

View file

@ -0,0 +1,86 @@
/*
* arm/matchfinder_impl.h - ARM implementations of matchfinder functions
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_ARM_MATCHFINDER_IMPL_H
#define LIB_ARM_MATCHFINDER_IMPL_H
#ifdef __ARM_NEON
# include <arm_neon.h>
/*
 * Set every position in the 'size'-byte table at 'data' to
 * MATCHFINDER_INITVAL, storing four 16-byte vectors (64 bytes) per iteration.
 * The loop tests its condition only after the first stores, so 'size' must be
 * a nonzero multiple of 64 bytes.
 */
static forceinline void
matchfinder_init_neon(mf_pos_t *data, size_t size)
{
	int16x8_t *dst = (int16x8_t *)data;
	const int16x8_t fill = (int16x8_t) {
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
	};
	size_t bytes_left = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*dst) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*dst)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		bytes_left -= 4 * sizeof(*dst);
	} while (bytes_left != 0);
}
#define matchfinder_init matchfinder_init_neon
/*
 * Subtract MATCHFINDER_WINDOW_SIZE from every position in the 'size'-byte
 * table at 'data' using a saturating signed 16-bit add, four 16-byte vectors
 * (64 bytes) per iteration.  The loop tests its condition only after the
 * first update, so 'size' must be a nonzero multiple of 64 bytes.
 */
static forceinline void
matchfinder_rebase_neon(mf_pos_t *data, size_t size)
{
	int16x8_t *cur = (int16x8_t *)data;
	const int16x8_t delta = (int16x8_t) {
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
	};
	size_t bytes_left = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*cur) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*cur)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		cur[0] = vqaddq_s16(cur[0], delta);
		cur[1] = vqaddq_s16(cur[1], delta);
		cur[2] = vqaddq_s16(cur[2], delta);
		cur[3] = vqaddq_s16(cur[3], delta);
		cur += 4;
		bytes_left -= 4 * sizeof(*cur);
	} while (bytes_left != 0);
}
#define matchfinder_rebase matchfinder_rebase_neon
#endif /* __ARM_NEON */
#endif /* LIB_ARM_MATCHFINDER_IMPL_H */

View file

@ -0,0 +1,363 @@
/*
* bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* ----------------------------------------------------------------------------
*
* This is a Binary Trees (bt) based matchfinder.
*
* The main data structure is a hash table where each hash bucket contains a
* binary tree of sequences whose first 4 bytes share the same hash code. Each
* sequence is identified by its starting position in the input buffer. Each
* binary tree is always sorted such that each left child represents a sequence
* lexicographically lesser than its parent and each right child represents a
* sequence lexicographically greater than its parent.
*
* The algorithm processes the input buffer sequentially. At each byte
* position, the hash code of the first 4 bytes of the sequence beginning at
* that position (the sequence being matched against) is computed. This
* identifies the hash bucket to use for that position. Then, a new binary tree
* node is created to represent the current sequence. Then, in a single tree
* traversal, the hash bucket's binary tree is searched for matches and is
* re-rooted at the new node.
*
* Compared to the simpler algorithm that uses linked lists instead of binary
* trees (see hc_matchfinder.h), the binary tree version gains more information
* at each node visitation. Ideally, the binary tree version will examine only
* 'log(n)' nodes to find the same matches that the linked list version will
* find by examining 'n' nodes. In addition, the binary tree version can
* examine fewer bytes at each node by taking advantage of the common prefixes
* that result from the sort order, whereas the linked list version may have to
* examine up to the full length of the match at each node.
*
* However, it is not always best to use the binary tree version. It requires
* nearly twice as much memory as the linked list version, and it takes time to
* keep the binary trees sorted, even at positions where the compressor does not
* need matches. Generally, when doing fast compression on small buffers,
* binary trees are the wrong approach. They are best suited for thorough
* compression and/or large buffers.
*
* ----------------------------------------------------------------------------
*/
#ifndef LIB_BT_MATCHFINDER_H
#define LIB_BT_MATCHFINDER_H
#include "matchfinder_common.h"
/* log2 of the number of buckets in the length-3 hash table (hash3_tab). */
#define BT_MATCHFINDER_HASH3_ORDER 16
/* Number of positions kept per hash3_tab bucket. Only 1 or 2 is accepted
* (enforced by a STATIC_ASSERT in bt_matchfinder_advance_one_byte()). */
#define BT_MATCHFINDER_HASH3_WAYS 2
/* log2 of the number of buckets in hash4_tab, the table of binary tree roots. */
#define BT_MATCHFINDER_HASH4_ORDER 16
/* Combined size, in bytes, of hash3_tab and hash4_tab. This is the amount of
* memory that bt_matchfinder_init() hands to matchfinder_init(). */
#define BT_MATCHFINDER_TOTAL_HASH_SIZE \
(((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
(1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
/* Representation of a match found by the bt_matchfinder. Entries are written
* by bt_matchfinder_advance_one_byte() when it is asked to record matches. */
struct lz_match {
/* The number of bytes matched. */
u16 length;
/* The offset back from the current position that was matched, i.e. the
* distance in bytes between the current position and the earlier copy. */
u16 offset;
};
/*
* State of the binary tree matchfinder.
*
* NOTE: bt_matchfinder_init() casts a pointer to this struct to 'mf_pos_t *'
* and passes its first BT_MATCHFINDER_TOTAL_HASH_SIZE bytes to
* matchfinder_init(); that size is exactly hash3_tab plus hash4_tab, so those
* two tables are expected to stay the leading members, in this order.
* bt_matchfinder_slide_window() passes the whole struct to
* matchfinder_rebase().
*/
struct bt_matchfinder {
/* The hash table for finding length 3 matches. Each bucket holds
* BT_MATCHFINDER_HASH3_WAYS positions. */
mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
/* The hash table which contains the roots of the binary trees for
* finding length 4+ matches */
mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
/* The child node references for the binary trees. The left and right
* children of the node for the sequence with position 'pos' are
* 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively.
* (bt_left_child()/bt_right_child() first reduce 'pos' modulo
* MATCHFINDER_WINDOW_SIZE.) */
mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
}
#ifdef _aligned_attribute
_aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
#endif
;
/* Prepare the matchfinder for a new input buffer. */
static forceinline void
bt_matchfinder_init(struct bt_matchfinder *mf)
{
STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE %
MATCHFINDER_SIZE_ALIGNMENT == 0);
matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE);
}
static forceinline void
bt_matchfinder_slide_window(struct bt_matchfinder *mf)
{
STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
}
/*
 * Return the address of the left-child slot of the tree node for position
 * @node. The position is reduced modulo MATCHFINDER_WINDOW_SIZE, and each
 * node owns two consecutive child_tab entries: [left, right].
 */
static forceinline mf_pos_t *
bt_left_child(struct bt_matchfinder *mf, s32 node)
{
	mf_pos_t * const children =
		&mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1))];

	return children;
}
/*
 * Return the address of the right-child slot of the tree node for position
 * @node. The position is reduced modulo MATCHFINDER_WINDOW_SIZE, and each
 * node owns two consecutive child_tab entries: [left, right].
 */
static forceinline mf_pos_t *
bt_right_child(struct bt_matchfinder *mf, s32 node)
{
	mf_pos_t * const children =
		&mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1))];

	return children + 1;
}
/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
* and bt_matchfinder_skip_position(). There must be sufficiently many bytes
* remaining to load a 32-bit integer from the *next* position: the advance
* routine loads 4 bytes starting at 'in_next + 1', i.e. bytes 1..4 relative
* to the current position, which makes 5 bytes in total. */
#define BT_MATCHFINDER_REQUIRED_NBYTES 5
/*
* Advance the binary tree matchfinder by one byte, optionally recording
* matches. @record_matches should be a compile-time constant.
*
* A single call does three things:
*   1. Computes and prefetches the hash buckets for the *next* position.
*   2. Inserts the current position into the length-3 hash table and, when
*      recording, checks the previous occupant(s) for a 3-byte match.
*   3. Makes the current position the new root of its hash4 bucket's binary
*      tree, distributing the nodes of the old tree into the new root's left
*      (lesser) and right (greater) subtrees while looking for longer matches.
*
* @mf: the matchfinder state.
* @in_base, @cur_pos: the sequence being matched starts at in_base + cur_pos.
* @max_len: upper bound passed to lz_extend() when extending a match.
* @nice_len: the tree search stops as soon as a match of at least this
*      length is seen.
* @max_search_depth: maximum number of tree nodes to visit.
* @next_hashes: on entry, [0] and [1] are the hash3 and hash4 codes of the
*      current position; on return they hold the codes for cur_pos + 1.
* @best_len_ret: receives the length of the longest match recorded by the
*      tree search, or 3 if none was recorded (always 3 when not recording).
* @lz_matchptr: output cursor for recorded matches; only dereferenced when
*      @record_matches is true.
* @record_matches: whether to write matches to @lz_matchptr.
*
* Returns the advanced @lz_matchptr (one past the last match written).
*/
static forceinline struct lz_match *
bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
const u8 * const restrict in_base,
const ptrdiff_t cur_pos,
const u32 max_len,
const u32 nice_len,
const u32 max_search_depth,
u32 * const restrict next_hashes,
u32 * const restrict best_len_ret,
struct lz_match * restrict lz_matchptr,
const bool record_matches)
{
const u8 *in_next = in_base + cur_pos;
u32 depth_remaining = max_search_depth;
/* Any stored position <= cutoff is treated as unusable (outside the
* window, or an empty slot) and ends the search. */
const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
u32 next_hashseq;
u32 hash3;
u32 hash4;
s32 cur_node;
#if BT_MATCHFINDER_HASH3_WAYS >= 2
s32 cur_node_2;
#endif
const u8 *matchptr;
/* Slots into which the next "lesser" / "greater" node will be linked. */
mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
/* Match lengths at the most recent lesser / greater node visited. */
u32 best_lt_len, best_gt_len;
u32 len;
/* Starts at 3 so that the tree search only records matches of length >= 4. */
u32 best_len = 3;
STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
BT_MATCHFINDER_HASH3_WAYS <= 2);
/* Take this position's precomputed hashes, then compute and prefetch the
* hashes for the next position from the 4 bytes at in_next + 1. */
next_hashseq = get_unaligned_le32(in_next + 1);
hash3 = next_hashes[0];
hash4 = next_hashes[1];
next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
prefetchw(&mf->hash3_tab[next_hashes[0]]);
prefetchw(&mf->hash4_tab[next_hashes[1]]);
/* Length-3 table: the current position goes into way 0 and the previous
* way-0 entry (if 2 ways are configured) is demoted to way 1. */
cur_node = mf->hash3_tab[hash3][0];
mf->hash3_tab[hash3][0] = cur_pos;
#if BT_MATCHFINDER_HASH3_WAYS >= 2
cur_node_2 = mf->hash3_tab[hash3][1];
mf->hash3_tab[hash3][1] = cur_node;
#endif
/* Report at most one length-3 match: the way-0 candidate is tried first,
* and the way-1 candidate only if way 0 did not match. */
if (record_matches && cur_node > cutoff) {
u32 seq3 = load_u24_unaligned(in_next);
if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
lz_matchptr->length = 3;
lz_matchptr->offset = in_next - &in_base[cur_node];
lz_matchptr++;
}
#if BT_MATCHFINDER_HASH3_WAYS >= 2
else if (cur_node_2 > cutoff &&
seq3 == load_u24_unaligned(&in_base[cur_node_2]))
{
lz_matchptr->length = 3;
lz_matchptr->offset = in_next - &in_base[cur_node_2];
lz_matchptr++;
}
#endif
}
/* Length-4+ search: fetch the old tree root and install the current
* position as the new root of this hash4 bucket. */
cur_node = mf->hash4_tab[hash4];
mf->hash4_tab[hash4] = cur_pos;
pending_lt_ptr = bt_left_child(mf, cur_pos);
pending_gt_ptr = bt_right_child(mf, cur_pos);
/* No usable old root: the new root gets two empty children. */
if (cur_node <= cutoff) {
*pending_lt_ptr = MATCHFINDER_INITVAL;
*pending_gt_ptr = MATCHFINDER_INITVAL;
*best_len_ret = best_len;
return lz_matchptr;
}
best_lt_len = 0;
best_gt_len = 0;
len = 0;
for (;;) {
matchptr = &in_base[cur_node];
/* 'len' bytes are already taken as matching at this node, so only the
* byte at index 'len' needs checking before extending further. */
if (matchptr[len] == in_next[len]) {
len = lz_extend(in_next, matchptr, len + 1, max_len);
/* When not recording, the nice_len check below runs on every
* extension; when recording, only for a new longest match. */
if (!record_matches || len > best_len) {
if (record_matches) {
best_len = len;
lz_matchptr->length = len;
lz_matchptr->offset = in_next - matchptr;
lz_matchptr++;
}
if (len >= nice_len) {
/* Good enough: the new root takes over the matched node's
* children and the search ends here. */
*pending_lt_ptr = *bt_left_child(mf, cur_node);
*pending_gt_ptr = *bt_right_child(mf, cur_node);
*best_len_ret = best_len;
return lz_matchptr;
}
}
}
if (matchptr[len] < in_next[len]) {
/* cur_node sorts before the current sequence: link it into the pending
* "lesser" slot and continue with its right child. */
*pending_lt_ptr = cur_node;
pending_lt_ptr = bt_right_child(mf, cur_node);
cur_node = *pending_lt_ptr;
best_lt_len = len;
/* Carry forward min(best_lt_len, best_gt_len) as the known prefix. */
if (best_gt_len < len)
len = best_gt_len;
} else {
/* cur_node sorts after the current sequence: link it into the pending
* "greater" slot and continue with its left child. */
*pending_gt_ptr = cur_node;
pending_gt_ptr = bt_left_child(mf, cur_node);
cur_node = *pending_gt_ptr;
best_gt_len = len;
if (best_lt_len < len)
len = best_lt_len;
}
/* Stop at an unusable node or when the depth budget is spent; the two
* still-pending slots are terminated with MATCHFINDER_INITVAL. */
if (cur_node <= cutoff || !--depth_remaining) {
*pending_lt_ptr = MATCHFINDER_INITVAL;
*pending_gt_ptr = MATCHFINDER_INITVAL;
*best_len_ret = best_len;
return lz_matchptr;
}
}
}
/*
 * Collect the matches available at the current position and insert that
 * position into the matchfinder.
 *
 * This is bt_matchfinder_advance_one_byte() with match recording enabled.
 *
 * @mf
 *	The matchfinder structure.
 * @in_base
 *	Pointer to the next byte in the input buffer to process _at the last
 *	time bt_matchfinder_init() or bt_matchfinder_slide_window() was
 *	called_.
 * @cur_pos
 *	Position, relative to @in_base, of the sequence being matched against.
 * @max_len
 *	Longest match length allowed at this position. Must be >=
 *	BT_MATCHFINDER_REQUIRED_NBYTES.
 * @nice_len
 *	The search ends once a match at least this long is found. Must be
 *	<= @max_len.
 * @max_search_depth
 *	Upper bound on the number of candidate matches examined. Must be >= 1.
 * @next_hashes
 *	In: the precomputed hash codes for the current sequence. Out: the
 *	precomputed hash codes for the sequence one byte further on.
 * @best_len_ret
 *	Receives the length of the longest match of length >= 4 that was
 *	found, or 3 if there was none. (This duplicates information in the
 *	'struct lz_match' array, but is easier for the compiler to optimize
 *	when the caller immediately tests 'best_len' after inlining.)
 * @lz_matchptr
 *	Output array for the matches. They are written in order of strictly
 *	increasing length and (non-strictly) increasing offset. At most
 *	'nice_len - 2' matches can be produced.
 *
 * Returns a pointer to the next free slot in the @lz_matchptr array; this
 * equals @lz_matchptr when no match was found.
 */
static forceinline struct lz_match *
bt_matchfinder_get_matches(struct bt_matchfinder *mf,
			   const u8 *in_base,
			   ptrdiff_t cur_pos,
			   u32 max_len,
			   u32 nice_len,
			   u32 max_search_depth,
			   u32 next_hashes[2],
			   u32 *best_len_ret,
			   struct lz_match *lz_matchptr)
{
	struct lz_match * const matches_end =
		bt_matchfinder_advance_one_byte(mf, in_base, cur_pos,
						max_len, nice_len,
						max_search_depth, next_hashes,
						best_len_ret, lz_matchptr,
						true /* record_matches */);

	return matches_end;
}
/*
 * Insert the current position into the matchfinder without reporting any
 * matches.
 *
 * The hashing and tree re-rooting still have to happen, so this goes through
 * the same bt_matchfinder_advance_one_byte() path as
 * bt_matchfinder_get_matches(), with recording switched off. @nice_len is
 * used as the maximum match length too, and the best-length output is
 * discarded.
 */
static forceinline void
bt_matchfinder_skip_position(struct bt_matchfinder *mf,
			     const u8 *in_base,
			     ptrdiff_t cur_pos,
			     u32 nice_len,
			     u32 max_search_depth,
			     u32 next_hashes[2])
{
	u32 unused_best_len;

	(void)bt_matchfinder_advance_one_byte(mf, in_base, cur_pos,
					      nice_len /* max_len */,
					      nice_len,
					      max_search_depth, next_hashes,
					      &unused_best_len,
					      NULL /* no match array */,
					      false /* record_matches */);
}
#endif /* LIB_BT_MATCHFINDER_H */

View file

@ -0,0 +1,88 @@
/*
* cpu_features_common.h - code shared by all lib/$arch/cpu_features.c
*
* Copyright 2020 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_CPU_FEATURES_COMMON_H
#define LIB_CPU_FEATURES_COMMON_H
#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
# define _GNU_SOURCE 1 /* for strdup() and strtok_r() */
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
#endif
#include "lib_common.h"
/* One entry of a CPU feature table: associates a feature's bit in the
* feature mask with the textual name of that feature. */
struct cpu_feature {
/* Bit(s) cleared from the feature mask when this feature is disabled. */
u32 bit;
/* Name compared (with strcmp) against the entries listed in
* $LIBDEFLATE_DISABLE_CPU_FEATURES. */
const char *name;
};
#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
/*
 * Clear from *features every feature named in the comma-separated list found
 * in the LIBDEFLATE_DISABLE_CPU_FEATURES environment variable.
 *
 * Each name is looked up in @feature_table (@feature_table_length entries).
 * If the variable is unset nothing happens. An unknown name is reported on
 * stderr and the process aborts; failure to duplicate the string also aborts.
 */
static inline void
disable_cpu_features_for_testing(u32 *features,
				 const struct cpu_feature *feature_table,
				 size_t feature_table_length)
{
	const char *list = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES");
	char *copy, *token, *state = NULL;

	if (list == NULL)
		return;

	/* strtok_r() modifies its input, so tokenize a private copy. */
	copy = strdup(list);
	if (copy == NULL)
		abort();

	for (token = strtok_r(copy, ",", &state);
	     token != NULL;
	     token = strtok_r(NULL, ",", &state)) {
		size_t idx = 0;

		/* Linear search for the entry whose name equals the token. */
		while (idx < feature_table_length &&
		       strcmp(token, feature_table[idx].name) != 0)
			idx++;

		if (idx == feature_table_length) {
			fprintf(stderr,
				"unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n",
				token);
			abort();
		}
		*features &= ~feature_table[idx].bit;
	}
	free(copy);
}
#else /* TEST_SUPPORT__DO_NOT_USE */
/* Stub used when TEST_SUPPORT__DO_NOT_USE is not defined (or FREESTANDING
* is defined): all arguments are ignored and the feature mask is left
* unchanged. */
static inline void
disable_cpu_features_for_testing(u32 *features,
const struct cpu_feature *feature_table,
size_t feature_table_length)
{
}
#endif /* !TEST_SUPPORT__DO_NOT_USE */
#endif /* LIB_CPU_FEATURES_COMMON_H */

View file

@ -0,0 +1,313 @@
/*
* crc32.c - CRC-32 checksum algorithm for the gzip format
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* High-level description of CRC
* =============================
*
* Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
* polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
* where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
*
* R(x) = M(x)*x^n mod G(x)
*
* where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
* R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
* interpreted as a bitstring of length 'n'.
*
* CRC used in gzip
* ================
*
* In the gzip format (RFC 1952):
*
* - The bitstring to checksum is formed from the bytes of the uncompressed
* data by concatenating the bits from the bytes in order, proceeding
* from the low-order bit to the high-order bit within each byte.
*
* - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
* x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
* Consequently, the CRC length is 32 bits ("CRC-32").
*
* - The highest order 32 coefficients of M(x)*x^n are inverted.
*
* - All 32 coefficients of R(x) are inverted.
*
* The two inversions cause added leading and trailing zero bits to affect the
* resulting CRC, whereas with a regular CRC such bits would have no effect on
* the CRC.
*
* Computation and optimizations
* =============================
*
* We can compute R(x) through "long division", maintaining only 32 bits of
* state at any given time. Multiplication by 'x' can be implemented as
* right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
* highest order bit represents the coefficient of x^0), and both addition and
* subtraction can be implemented as bitwise exclusive OR (since we are working
* in GF(2)). Here is an unoptimized implementation:
*
* static u32 crc32_gzip(const u8 *buffer, size_t size)
* {
* u32 remainder = 0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < size * 8 + 32; i++) {
* int bit;
* u32 multiple;
*
* if (i < size * 8)
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* else
* bit = 0; // one of the 32 appended 0 bits
*
* if (i < 32) // the first 32 bits are inverted
* bit ^= 1;
*
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
*
* remainder >>= 1;
* remainder |= (u32)bit << 31;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* In this implementation, the 32-bit integer 'remainder' maintains the
* remainder of the currently processed portion of the message (with 32 zero
* bits appended) when divided by the generator polynomial. 'remainder' is the
* representation of R(x), and 'divisor' is the representation of G(x) excluding
* the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
* then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
* x^32 term, then we subtract G(x) from R(x).
*
* We can speed this up by taking advantage of the fact that XOR is commutative
* and associative, so the order in which we combine the inputs into 'remainder'
* is unimportant. And since each message bit we add doesn't affect the choice
* of 'multiple' until 32 bits later, we need not actually add each message bit
* until that point:
*
* static u32 crc32_gzip(const u8 *buffer, size_t size)
* {
* u32 remainder = ~0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < size * 8; i++) {
* int bit;
* u32 multiple;
*
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* remainder ^= bit;
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
* remainder >>= 1;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* With the above implementation we get the effect of 32 appended 0 bits for
* free; they never affect the choice of a divisor, nor would they change the
* value of 'remainder' if they were to be actually XOR'ed in. And by starting
* with a remainder of all 1 bits, we get the effect of complementing the first
* 32 message bits.
*
* The next optimization is to process the input in multi-bit units. Suppose
* that we insert the next 'n' message bits into the remainder. Then we get an
* intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
* bits is the amount by which the low 32 bits of the remainder will change as a
* result of cancelling out those 'n' bits. Taking n=8 (one byte) and
* precomputing a table containing the CRC of each possible byte, we get
* crc32_slice1() defined below.
*
* As a further optimization, we could increase the multi-bit unit size to 16.
* However, that is inefficient because the table size explodes from 256 entries
* (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
* fit in L1 cache on typical processors.
*
* However, we can actually process 4 bytes at a time using 4 different tables
* with 256 entries each. Logically, we form a 64-bit intermediate remainder
* and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
* out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
* CRC of those bits with 8 zero bits appended, and so on. This method is
* implemented in crc32_slice4(), defined below.
*
* In crc32_slice8(), this method is extended to 8 bytes at a time. The
* intermediate remainder (which we never actually store explicitly) is 96 bits.
*
* On CPUs that support fast carryless multiplication, CRCs can be computed even
* more quickly via "folding". See e.g. the x86 PCLMUL implementation.
*/
#include "lib_common.h"
#include "libdeflate.h"
/* Common signature of the CRC-32 implementations in this file: takes the
* current remainder, a byte buffer and its length in bytes, and returns the
* updated remainder. */
typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
/* Include architecture-specific implementations if available */
#undef CRC32_SLICE1
#undef CRC32_SLICE4
#undef CRC32_SLICE8
#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__arm__) || defined(__aarch64__)
# include "arm/crc32_impl.h"
#elif defined(__i386__) || defined(__x86_64__)
# include "x86/crc32_impl.h"
#endif
/*
* Define a generic implementation (crc32_slice8()) if needed. crc32_slice1()
* may also be needed as a fallback for architecture-specific implementations.
*/
#ifndef DEFAULT_IMPL
# define CRC32_SLICE8 1
# define DEFAULT_IMPL crc32_slice8
#endif
#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
#include "crc32_table.h"
/*
 * Fold one more input byte into the CRC remainder using the first 256
 * entries of crc32_table: the low byte of the remainder XOR the new byte
 * selects the table entry, which is XORed with the remainder shifted right
 * by 8.
 */
static forceinline u32
crc32_update_byte(u32 remainder, u8 next_byte)
{
	const u8 index = (u8)remainder ^ next_byte;

	return crc32_table[index] ^ (remainder >> 8);
}
#endif
#ifdef CRC32_SLICE1
/*
 * Byte-at-a-time CRC-32 update: folds @size bytes from @buffer into
 * @remainder and returns the new remainder. Needs only the first 0x100
 * entries of crc32_table.
 */
static u32
crc32_slice1(u32 remainder, const u8 *buffer, size_t size)
{
	const u8 *cur = buffer;
	size_t remaining = size;

	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);

	while (remaining != 0) {
		remainder = crc32_update_byte(remainder, *cur);
		cur++;
		remaining--;
	}
	return remainder;
}
#endif /* CRC32_SLICE1 */
#ifdef CRC32_SLICE4
/*
 * CRC-32 update that consumes 4 bytes per step using four 256-entry tables
 * (crc32_table[0x000..0x3FF]).
 *
 * Leading bytes are handled one at a time until the pointer is 4-byte
 * aligned, then whole 32-bit words are processed, and any trailing bytes are
 * again handled one at a time.
 */
static u32
crc32_slice4(u32 remainder, const u8 *buffer, size_t size)
{
	const u8 *cur = buffer;
	const u8 * const stop = buffer + size;
	const u8 *words_stop;

	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);

	/* Head: bytes up to the first 4-byte boundary (or the end). */
	while (((uintptr_t)cur & 3) != 0 && cur != stop) {
		remainder = crc32_update_byte(remainder, *cur);
		cur++;
	}

	/* Body: as many whole aligned words as remain. */
	words_stop = cur + ((stop - cur) & ~3);
	while (cur != words_stop) {
		u32 x = le32_bswap(*(const u32 *)cur);

		x ^= remainder;
		remainder = crc32_table[0x300 + (u8)(x >> 0)] ^
			    crc32_table[0x200 + (u8)(x >> 8)] ^
			    crc32_table[0x100 + (u8)(x >> 16)] ^
			    crc32_table[0x000 + (u8)(x >> 24)];
		cur += 4;
	}

	/* Tail: the 0-3 bytes left over. */
	while (cur != stop) {
		remainder = crc32_update_byte(remainder, *cur);
		cur++;
	}
	return remainder;
}
#endif /* CRC32_SLICE4 */
#ifdef CRC32_SLICE8
/*
 * CRC-32 update that consumes 8 bytes per step using eight 256-entry tables
 * (crc32_table[0x000..0x7FF]).
 *
 * Leading bytes are handled one at a time until the pointer is 8-byte
 * aligned, then 8-byte blocks are processed as two 32-bit words (only the
 * first word is combined with the running remainder), and any trailing bytes
 * are again handled one at a time.
 */
static u32
crc32_slice8(u32 remainder, const u8 *buffer, size_t size)
{
	const u8 *cur = buffer;
	const u8 * const stop = buffer + size;
	const u8 *blocks_stop;

	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);

	/* Head: bytes up to the first 8-byte boundary (or the end). */
	while (((uintptr_t)cur & 7) != 0 && cur != stop) {
		remainder = crc32_update_byte(remainder, *cur);
		cur++;
	}

	/* Body: as many whole aligned 8-byte blocks as remain. */
	blocks_stop = cur + ((stop - cur) & ~7);
	while (cur != blocks_stop) {
		u32 lo = le32_bswap(*(const u32 *)(cur + 0));
		u32 hi = le32_bswap(*(const u32 *)(cur + 4));

		lo ^= remainder;
		remainder = crc32_table[0x700 + (u8)(lo >> 0)] ^
			    crc32_table[0x600 + (u8)(lo >> 8)] ^
			    crc32_table[0x500 + (u8)(lo >> 16)] ^
			    crc32_table[0x400 + (u8)(lo >> 24)] ^
			    crc32_table[0x300 + (u8)(hi >> 0)] ^
			    crc32_table[0x200 + (u8)(hi >> 8)] ^
			    crc32_table[0x100 + (u8)(hi >> 16)] ^
			    crc32_table[0x000 + (u8)(hi >> 24)];
		cur += 8;
	}

	/* Tail: the 0-7 bytes left over. */
	while (cur != stop) {
		remainder = crc32_update_byte(remainder, *cur);
		cur++;
	}
	return remainder;
}
#endif /* CRC32_SLICE8 */
#ifdef DISPATCH
static u32 dispatch(u32, const u8 *, size_t);

/*
 * Pointer to the CRC-32 implementation in use. It initially points at
 * dispatch(), so the first call selects an implementation and replaces it.
 */
static volatile crc32_func_t crc32_impl = dispatch;

/*
 * First-call trampoline: ask the architecture-specific code for an
 * implementation, fall back to DEFAULT_IMPL if it offers none, publish the
 * choice in crc32_impl, and then perform the requested computation with it.
 */
static u32 dispatch(u32 remainder, const u8 *buffer, size_t size)
{
	crc32_func_t chosen = arch_select_crc32_func();

	if (!chosen)
		chosen = DEFAULT_IMPL;

	crc32_impl = chosen;
	return crc32_impl(remainder, buffer, size);
}
#else
# define crc32_impl DEFAULT_IMPL /* only one implementation, use it */
#endif
/*
 * Public entry point: update a gzip-style CRC-32 with @size bytes from
 * @buffer.
 *
 * @remainder is the CRC computed so far. A NULL @buffer returns 0, the
 * initial CRC value. The remainder is complemented on the way into and out
 * of the selected implementation.
 */
LIBDEFLATEEXPORT u32 LIBDEFLATEAPI
libdeflate_crc32(u32 remainder, const void *buffer, size_t size)
{
	u32 state;

	if (!buffer)
		return 0; /* initial value */

	state = crc32_impl(~remainder, buffer, size);
	return ~state;
}

View file

@ -0,0 +1,526 @@
/*
* crc32_table.h - data table to accelerate CRC-32 computation
*
* THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT.
*/
#include <stdint.h>
static const uint32_t crc32_table[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1,
#endif /* CRC32_SLICE4 || CRC32_SLICE8 */
#if defined(CRC32_SLICE8)
0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0,
0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10,
0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111,
0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1,
0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92,
0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693,
0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053,
0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4,
0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15,
0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5,
0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256,
0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496,
0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57,
0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299,
0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459,
0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958,
0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b,
0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db,
0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda,
0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a,
0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d,
0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c,
0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c,
0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f,
0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de,
0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e,
0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42,
0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82,
0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743,
0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0,
0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00,
0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601,
0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546,
0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386,
0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87,
0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847,
0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404,
0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905,
0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5,
0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b,
0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca,
0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a,
0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589,
0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349,
0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888,
0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f,
0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf,
0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce,
0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d,
0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d,
0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c,
0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c,
0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae,
0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8,
0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3,
0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5,
0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223,
0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258,
0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e,
0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798,
0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5,
0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3,
0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503,
0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715,
0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578,
0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2,
0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4,
0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf,
0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59,
0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f,
0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834,
0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22,
0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2,
0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99,
0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f,
0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f,
0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02,
0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14,
0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676,
0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460,
0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d,
0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed,
0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb,
0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680,
0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340,
0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156,
0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d,
0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b,
0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd,
0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6,
0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0,
0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a,
0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77,
0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61,
0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81,
0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97,
0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa,
0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c,
0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a,
0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41,
0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7,
0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1,
0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da,
0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc,
0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d,
0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e,
0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa,
0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9,
0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240,
0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834,
0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27,
0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301,
0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66,
0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975,
0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf,
0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc,
0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb,
0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4,
0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7,
0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183,
0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a,
0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739,
0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d,
0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e,
0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b,
0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f,
0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c,
0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6,
0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1,
0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2,
0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f,
0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c,
0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b,
0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1,
0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2,
0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6,
0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3,
0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0,
0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794,
0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387,
0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e,
0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a,
0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49,
0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516,
0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71,
0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62,
0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8,
0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb,
0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac,
0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a,
0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899,
0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed,
0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044,
0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457,
0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23,
0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30,
0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3,
0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919,
0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56,
0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac,
0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832,
0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d,
0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387,
0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5,
0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00,
0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa,
0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e,
0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64,
0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1,
0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e,
0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4,
0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb,
0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425,
0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf,
0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90,
0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a,
0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2,
0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced,
0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217,
0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673,
0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6,
0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c,
0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239,
0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3,
0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776,
0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312,
0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8,
0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7,
0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f,
0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95,
0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda,
0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520,
0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe,
0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1,
0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b,
0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4,
0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61,
0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b,
0x061d761c, 0xcab77682, 0x44387161, 0x889271ff,
0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05,
0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0,
0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282,
0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78,
0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937,
0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9,
0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53,
0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c,
0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6,
#endif /* CRC32_SLICE8 */
};

View file

@ -0,0 +1,61 @@
/*
* crc32_vec_template.h - template for vectorized CRC-32 implementations
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#define CRC32_SLICE1 1
static u32 crc32_slice1(u32, const u8 *, size_t);
/*
* Template for vectorized CRC-32 implementations.
*
* The including file supplies FUNCNAME, FUNCNAME_ALIGNED, ATTRIBUTES,
* IMPL_ALIGNMENT and IMPL_SEGMENT_SIZE. The buffer is processed in three
* stages:
*
*   1. a head of bytes preceding the first IMPL_ALIGNMENT-aligned address,
*   2. as many whole IMPL_SEGMENT_SIZE-byte segments as fit, handed to
*      FUNCNAME_ALIGNED(),
*   3. whatever tail bytes are left over.
*
* Stages 1 and 3 go through crc32_slice1() rather than crc32_slice8(): only a
* few bytes are involved there, so the smaller table is preferable.
*/
static u32 ATTRIBUTES
FUNCNAME(u32 remainder, const u8 *p, size_t size)
{
	const uintptr_t misalignment = (uintptr_t)p % IMPL_ALIGNMENT;

	if (misalignment != 0) {
		/* Bytes up to the next aligned address, capped at the buffer size. */
		const size_t to_boundary = -(uintptr_t)p % IMPL_ALIGNMENT;
		const size_t head_len = MIN(size, to_boundary);

		remainder = crc32_slice1(remainder, p, head_len);
		p += head_len;
		size -= head_len;
	}

	if (size >= IMPL_SEGMENT_SIZE) {
		/* Bytes that do not fill a whole segment stay behind for the tail. */
		const size_t tail_len = size % IMPL_SEGMENT_SIZE;

		remainder = FUNCNAME_ALIGNED(remainder, (const void *)p,
					     size / IMPL_SEGMENT_SIZE);
		p += size - tail_len;
		size = tail_len;
	}

	return crc32_slice1(remainder, p, size);
}
#undef FUNCNAME
#undef FUNCNAME_ALIGNED
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE

View file

@ -0,0 +1,421 @@
/*
* decompress_template.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This is the actual DEFLATE decompression routine, lifted out of
* deflate_decompress.c so that it can be compiled multiple times with different
* target instruction sets.
*
* The including file must define FUNCNAME and ATTRIBUTES (both are #undef'd
* after the function) as well as the bit-reader and checking macros used below
* (ENSURE_BITS, BITS, POP_BITS, REMOVE_BITS, ALIGN_INPUT, READ_U16,
* SAFETY_CHECK, ...).
*
* Parameters:
*   d                     - decompressor state; holds the codeword length
*                           arrays, the decode tables and the
*                           'static_codes_loaded' flag
*   in, in_nbytes         - compressed input buffer and its size in bytes
*   out, out_nbytes_avail - output buffer and its capacity in bytes
*   actual_in_nbytes_ret  - optional (may be NULL); on success receives the
*                           number of input bytes consumed
*   actual_out_nbytes_ret - optional (may be NULL); on success receives the
*                           number of bytes written. When it is NULL the
*                           output buffer must be filled exactly.
*
* Returns:
*   LIBDEFLATE_SUCCESS            - the final block was decoded
*   LIBDEFLATE_INSUFFICIENT_SPACE - the data does not fit in the output buffer
*   LIBDEFLATE_SHORT_OUTPUT       - actual_out_nbytes_ret is NULL and fewer than
*                                   out_nbytes_avail bytes were produced
*   A failed SAFETY_CHECK() also ends decoding; that macro is defined by the
*   including file (presumably it returns a bad-data error -- confirm there).
*/
static enum libdeflate_result ATTRIBUTES
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
/* Output cursor and one-past-the-end of the output buffer. */
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes_avail;
/* Input cursor and one-past-the-end of the input buffer. */
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
/* Bit-reader state. Several of the locals below (bitbuf, bitsleft,
* overrun_count, tmp16, tmp32) are never named directly in this function;
* presumably they are read and written by the bit-reader macros supplied by
* the including file -- confirm there. */
bitbuf_t bitbuf = 0;
unsigned bitsleft = 0;
size_t overrun_count = 0;
unsigned i;
unsigned is_final_block;
unsigned block_type;
u16 len;
u16 nlen;
unsigned num_litlen_syms;
unsigned num_offset_syms;
u16 tmp16;
u32 tmp32;
next_block:
/* Starting to read the next block. */
;
/* Make the whole block header available at once: 1 + 2 bits for
* BFINAL/BTYPE plus the 5 + 5 + 4 bits of count fields that a dynamic
* Huffman block pops right afterwards. */
STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
ENSURE_BITS(1 + 2 + 5 + 5 + 4);
/* BFINAL: 1 bit */
is_final_block = POP_BITS(1);
/* BTYPE: 2 bits */
block_type = POP_BITS(2);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
/* Dynamic Huffman block. */
/* The order in which precode lengths are stored. */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
/* Read the codeword length counts. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
num_litlen_syms = POP_BITS(5) + 257;
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
num_offset_syms = POP_BITS(5) + 1;
STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
num_explicit_precode_lens = POP_BITS(4) + 4;
/* This block rebuilds the decode tables, so a later static block can
* no longer take the "tables already set up" shortcut. */
d->static_codes_loaded = false;
/* Read the precode codeword lengths. */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
for (i = 0; i < num_explicit_precode_lens; i++) {
ENSURE_BITS(3);
d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
}
/* Precode symbols without an explicit length get length 0. */
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
/* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
/* Expand the literal/length and offset codeword lengths. */
for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
u32 entry;
unsigned presym;
u8 rep_val;
unsigned rep_count;
/* One precode codeword plus the largest repeat-count field (7 bits,
* used by the "repeat zero 11 - 138 times" case below). */
ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
/* (The code below assumes that the precode decode table
* does not have any subtables.) */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
/* Read the next precode symbol. */
entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
presym = entry >> HUFFDEC_RESULT_SHIFT;
if (presym < 16) {
/* Explicit codeword length */
d->u.l.lens[i++] = presym;
continue;
}
/* Run-length encoded codeword lengths */
/* Note: we don't need to verify that the repeat count
* doesn't overflow the number of elements, since we
* have enough extra spaces to allow for the worst-case
* overflow (138 zeroes when only 1 length was
* remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
* number of entries. That gets rid of branches that
* would otherwise be required.
*
* It is not just because of the numerical order that
* our checks go in the order 'presym < 16', 'presym ==
* 16', and 'presym == 17'. For typical data this is
* ordered from most frequent to least frequent case.
*/
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
/* Repeat the previous length 3 - 6 times */
SAFETY_CHECK(i != 0);
rep_val = d->u.l.lens[i - 1];
STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
rep_count = 3 + POP_BITS(2);
d->u.l.lens[i + 0] = rep_val;
d->u.l.lens[i + 1] = rep_val;
d->u.l.lens[i + 2] = rep_val;
d->u.l.lens[i + 3] = rep_val;
d->u.l.lens[i + 4] = rep_val;
d->u.l.lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
/* Repeat zero 3 - 10 times */
STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
rep_count = 3 + POP_BITS(3);
d->u.l.lens[i + 0] = 0;
d->u.l.lens[i + 1] = 0;
d->u.l.lens[i + 2] = 0;
d->u.l.lens[i + 3] = 0;
d->u.l.lens[i + 4] = 0;
d->u.l.lens[i + 5] = 0;
d->u.l.lens[i + 6] = 0;
d->u.l.lens[i + 7] = 0;
d->u.l.lens[i + 8] = 0;
d->u.l.lens[i + 9] = 0;
i += rep_count;
} else {
/* Repeat zero 11 - 138 times */
STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
rep_count = 11 + POP_BITS(7);
memset(&d->u.l.lens[i], 0,
rep_count * sizeof(d->u.l.lens[i]));
i += rep_count;
}
}
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
/* Uncompressed block: copy 'len' bytes literally from the input
* buffer to the output buffer. */
ALIGN_INPUT();
/* The two 16-bit fields LEN and NLEN must both be present. */
SAFETY_CHECK(in_end - in_next >= 4);
len = READ_U16();
nlen = READ_U16();
/* NLEN must be the bitwise complement of LEN. */
SAFETY_CHECK(len == (u16)~nlen);
/* Output capacity is checked before input availability, so a too-small
* output buffer is reported as INSUFFICIENT_SPACE here. */
if (unlikely(len > out_end - out_next))
return LIBDEFLATE_INSUFFICIENT_SPACE;
SAFETY_CHECK(len <= in_end - in_next);
memcpy(out_next, in_next, len);
in_next += len;
out_next += len;
goto block_done;
} else {
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/*
* Static Huffman block: build the decode tables for the static
* codes. Skip doing so if the tables are already set up from
* an earlier static block; this speeds up decompression of
* degenerate input of many empty or very short static blocks.
*
* Afterwards, the remainder is the same as decompressing a
* dynamic Huffman block.
*/
if (d->static_codes_loaded)
goto have_decode_tables;
d->static_codes_loaded = true;
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
/* Fixed codeword lengths: litlen symbols 0-143 -> 8 bits, 144-255 -> 9,
* 256-279 -> 7, 280-287 -> 8; the 32 offset symbols that follow -> 5. */
for (i = 0; i < 144; i++)
d->u.l.lens[i] = 8;
for (; i < 256; i++)
d->u.l.lens[i] = 9;
for (; i < 280; i++)
d->u.l.lens[i] = 7;
for (; i < 288; i++)
d->u.l.lens[i] = 8;
for (; i < 288 + 32; i++)
d->u.l.lens[i] = 5;
num_litlen_syms = 288;
num_offset_syms = 32;
}
/* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
have_decode_tables:
/* The main DEFLATE decode loop */
for (;;) {
u32 entry;
u32 length;
u32 offset;
const u8 *src;
u8 *dst;
/* Decode a litlen symbol. */
ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Litlen subtable required (uncommon case) */
REMOVE_BITS(LITLEN_TABLEBITS);
entry = d->u.litlen_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
if (entry & HUFFDEC_LITERAL) {
/* Literal */
if (unlikely(out_next == out_end))
return LIBDEFLATE_INSUFFICIENT_SPACE;
*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
continue;
}
/* Match or end-of-block */
entry >>= HUFFDEC_RESULT_SHIFT;
ENSURE_BITS(MAX_ENSURE);
/* Pop the extra length bits and add them to the length base to
* produce the full length. */
length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
/* The match destination must not end after the end of the
* output buffer. For efficiency, combine this check with the
* end-of-block check. We're using 0 for the special
* end-of-block length, so subtract 1 to turn it into
* SIZE_MAX. */
STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
if (unlikely((size_t)length - 1 >= out_end - out_next)) {
if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
return LIBDEFLATE_INSUFFICIENT_SPACE;
goto block_done;
}
/* Decode the match offset. */
entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Offset subtable required (uncommon case) */
REMOVE_BITS(OFFSET_TABLEBITS);
entry = d->offset_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
entry >>= HUFFDEC_RESULT_SHIFT;
/* The bit buffer is refilled for the extra offset bits only when the
* earlier ENSURE_BITS(MAX_ENSURE) could not have covered the extra
* length bits, the offset codeword and the extra offset bits
* together. */
STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN +
DEFLATE_MAX_EXTRA_OFFSET_BITS))
ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
/* Pop the extra offset bits and add them to the offset base to
* produce the full offset. */
offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
/* The match source must not begin before the beginning of the
* output buffer. */
SAFETY_CHECK(offset <= out_next - (const u8 *)out);
/*
* Copy the match: 'length' bytes at 'out_next - offset' to
* 'out_next', possibly overlapping. If the match doesn't end
* too close to the end of the buffer and offset >= WORDBYTES ||
* offset == 1, take a fast path which copies a word at a time
* -- potentially more than the length of the match, but that's
* fine as long as we check for enough extra space.
*
* The remaining cases are not performance-critical so are
* handled by a simple byte-by-byte copy.
*/
src = out_next - offset;
dst = out_next;
/* Advance the output cursor first; the copy loops below run until 'dst'
* reaches this new position. */
out_next += length;
if (UNALIGNED_ACCESS_IS_FAST &&
/* max overrun is writing 3 words for a min length match */
likely(out_end - out_next >=
3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
if (offset >= WORDBYTES) { /* words don't overlap? */
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
do {
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
} while (dst < out_next);
} else if (offset == 1) {
/* RLE encoding of previous byte, common if the
* data contains many repeated bytes */
machine_word_t v = repeat_byte(*src);
store_word_unaligned(v, dst);
dst += WORDBYTES;
store_word_unaligned(v, dst);
dst += WORDBYTES;
do {
store_word_unaligned(v, dst);
dst += WORDBYTES;
} while (dst < out_next);
} else {
/* 1 < offset < WORDBYTES: copy one byte at a time. */
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
} else {
/* Two unconditional byte copies followed by a do/while copy at least
* three bytes, which is the minimum match length asserted here. */
STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
}
block_done:
/* Finished decoding a block. */
if (!is_final_block)
goto next_block;
/* That was the last block. */
/* Discard any readahead bits and check for excessive overread */
ALIGN_INPUT();
/* Optionally return the actual number of bytes read */
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
/* Optionally return the actual number of bytes written */
if (actual_out_nbytes_ret) {
*actual_out_nbytes_ret = out_next - (u8 *)out;
} else {
/* The caller did not ask for the output size, so the output buffer
* must have been filled exactly. */
if (out_next != out_end)
return LIBDEFLATE_SHORT_OUTPUT;
}
return LIBDEFLATE_SUCCESS;
}
#undef FUNCNAME
#undef ATTRIBUTES

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,13 @@
#ifndef LIB_DEFLATE_COMPRESS_H
#define LIB_DEFLATE_COMPRESS_H
#include "lib_common.h"
/* DEFLATE compression is private to deflate_compress.c, but we do need to be
* able to query the compression level for zlib and gzip header generation. */
/* Forward declaration only: this header does not expose the struct's
* members. */
struct libdeflate_compressor;
/* Query the compression level of the compressor 'c'. Only the prototype is
* visible here; presumably the value is the level the compressor was
* allocated with -- confirm against the definition. */
unsigned int deflate_get_compression_level(struct libdeflate_compressor *c);
#endif /* LIB_DEFLATE_COMPRESS_H */

View file

@ -0,0 +1,102 @@
/*
* deflate_constants.h - constants for the DEFLATE compression format
*/
#ifndef LIB_DEFLATE_CONSTANTS_H
#define LIB_DEFLATE_CONSTANTS_H
/* GDeflate configuration. These macros exist only when the including
* translation unit defines WITH_GDEFLATE before this header is processed. */
#ifdef WITH_GDEFLATE
/* Enable GDeflate mode. */
#define GDEFLATE
/* GDeflate is deflate64-based. */
#define DEFLATE64
/* Number of GDeflate streams. */
#define NUM_STREAMS 32
/* The number of bits to keep in input buffer. */
#define LOW_WATERMARK_BITS 32
/* Number of bits per GDeflate bit-packet. */
#define BITS_PER_PACKET 32
/* GDeflate page size. */
#define GDEFLATE_PAGE_SIZE 65536
#endif
/* Valid block types */
#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0
#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1
#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2
/* Minimum and maximum supported match lengths (in bytes) */
#define DEFLATE_MIN_MATCH_LEN 3
#ifndef DEFLATE64
#define DEFLATE_MAX_MATCH_LEN 258
#else
/* The maximum length of a deflate64 match is 65538. However, the bt_matcher
* stores match lengths in 16 bits, so for it to work without overflow either
* the maximum length must be reduced to fit into 16 bits (as is done here),
* or the length storage type must be changed to a wider type. */
#define DEFLATE_MAX_MATCH_LEN 65535
#endif
/* Minimum and maximum supported match offsets (in bytes) */
#define DEFLATE_MIN_MATCH_OFFSET 1
/* With DEFLATE64 the maximum offset and the window size are both doubled. */
#ifdef DEFLATE64
#define DEFLATE_MAX_MATCH_OFFSET (32768*2)
#define DEFLATE_MAX_WINDOW_SIZE (32768*2)
#else
#define DEFLATE_MAX_MATCH_OFFSET 32768
#define DEFLATE_MAX_WINDOW_SIZE 32768
#endif
/* Number of symbols in each Huffman code. Note: for the literal/length
* and offset codes, these are actually the maximum values; a given block
* might use fewer symbols. */
#define DEFLATE_NUM_PRECODE_SYMS 19
#define DEFLATE_NUM_LITLEN_SYMS 288
#define DEFLATE_NUM_OFFSET_SYMS 32
/* The maximum number of symbols across all codes (the largest of the three
* counts above) */
#define DEFLATE_MAX_NUM_SYMS 288
/* Division of symbols in the literal/length code */
#define DEFLATE_NUM_LITERALS 256
#define DEFLATE_END_OF_BLOCK 256
#define DEFLATE_NUM_LEN_SYMS 31
/* Maximum codeword length, in bits, within each Huffman code */
#define DEFLATE_MAX_PRE_CODEWORD_LEN 7
#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15
#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15
/* The maximum codeword length across all codes */
#define DEFLATE_MAX_CODEWORD_LEN 15
/* Maximum possible overrun when decoding codeword lengths: 138 - 1, i.e. the
* longest run (138 zeroes) written when only 1 length was remaining. */
#define DEFLATE_MAX_LENS_OVERRUN 137
/*
* Maximum number of extra bits that may be required to represent a match
* length or offset. Only the length value differs between the two
* configurations; the offset value is 14 in both.
*/
#ifdef DEFLATE64
#define DEFLATE_MAX_EXTRA_LENGTH_BITS 16
#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14
#else
#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5
#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14
#endif
/* The maximum number of bits in which a match can be represented. This
* is the absolute worst case, which assumes the longest possible Huffman
* codewords and the maximum numbers of extra bits. */
#define DEFLATE_MAX_MATCH_BITS \
(DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \
DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS)
#endif /* LIB_DEFLATE_CONSTANTS_H */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,246 @@
/*
* gdeflate_compress.c - a compressor for GDEFLATE
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#define WITH_GDEFLATE
#define HIDE_INTERFACE
#define libdeflate_compressor libdeflate_gdeflate_compressor
#include "deflate_compress.c"
/*
 * Allocate and configure a GDEFLATE compressor.
 *
 * compression_level: 0..12 inclusive.  Level 0 selects
 *	deflate_compress_none, levels 1-4 the greedy parser and levels 5-7
 *	the lazy parser.  Levels 8-12 use the near-optimal parser when
 *	SUPPORT_NEAR_OPTIMAL_PARSING is enabled, and the lazy parser with
 *	larger search limits otherwise.
 *
 * Only the member of the parser union 'p' that the chosen level needs is
 * included in the allocation (nothing at all for level 0).
 *
 * Returns the new compressor, or NULL if the level is out of range or the
 * allocation fails.
 */
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_gdeflate_compressor(int compression_level)
{
	struct libdeflate_compressor *c;
	size_t alloc_size;

	if (compression_level < 0 || compression_level > 12)
		return NULL;

	/* Fixed part of the struct, plus the parser state this level uses. */
	alloc_size = offsetof(struct libdeflate_compressor, p);
	if (compression_level >= 1) {
#if SUPPORT_NEAR_OPTIMAL_PARSING
		alloc_size += (compression_level >= 8) ? sizeof(c->p.n)
						       : sizeof(c->p.g);
#else
		alloc_size += sizeof(c->p.g);
#endif
	}

	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, alloc_size);
	if (!c)
		return NULL;

	c->compression_level = compression_level;

	/*
	 * Higher levels are willing to spend effort on smaller inputs, so the
	 * threshold shrinks as the level grows.
	 */
	c->min_size_to_compress = 56 - (compression_level * 4);

	if (compression_level == 0) {
		/* Level 0: no match finding, so no search parameters. */
		c->impl = deflate_compress_none;
	} else if (compression_level <= 4) {
		/* Levels 1-4: greedy parsing. */
		c->impl = deflate_compress_greedy;
		switch (compression_level) {
		case 1:
			c->max_search_depth = 2;
			c->nice_match_length = 8;
			break;
		case 2:
			c->max_search_depth = 6;
			c->nice_match_length = 10;
			break;
		case 3:
			c->max_search_depth = 12;
			c->nice_match_length = 14;
			break;
		default: /* 4 */
			c->max_search_depth = 24;
			c->nice_match_length = 24;
			break;
		}
	} else if (compression_level <= 7) {
		/* Levels 5-7: lazy parsing. */
		c->impl = deflate_compress_lazy;
		switch (compression_level) {
		case 5:
			c->max_search_depth = 20;
			c->nice_match_length = 30;
			break;
		case 6:
			c->max_search_depth = 40;
			c->nice_match_length = 65;
			break;
		default: /* 7 */
			c->max_search_depth = 100;
			c->nice_match_length = 130;
			break;
		}
	} else {
#if SUPPORT_NEAR_OPTIMAL_PARSING
		/* Levels 8-12: near-optimal parsing. */
		c->impl = deflate_compress_near_optimal;
		switch (compression_level) {
		case 8:
			c->max_search_depth = 12;
			c->nice_match_length = 20;
			c->p.n.num_optim_passes = 1;
			break;
		case 9:
			c->max_search_depth = 16;
			c->nice_match_length = 26;
			c->p.n.num_optim_passes = 2;
			break;
		case 10:
			c->max_search_depth = 30;
			c->nice_match_length = 50;
			c->p.n.num_optim_passes = 2;
			break;
		case 11:
			c->max_search_depth = 60;
			c->nice_match_length = 80;
			c->p.n.num_optim_passes = 3;
			break;
		default: /* 12 */
			c->max_search_depth = 100;
			c->nice_match_length = 133;
			c->p.n.num_optim_passes = 4;
			break;
		}
#else
		/* Levels 8-12 without near-optimal support: deeper lazy parsing. */
		c->impl = deflate_compress_lazy;
		if (compression_level == 8) {
			c->max_search_depth = 150;
			c->nice_match_length = 200;
		} else {
			c->max_search_depth = 200;
			c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
		}
#endif
	}

	deflate_init_offset_slot_fast(c);
	deflate_init_static_codes(c);
	deflate_init_length_slot();

	return c;
}
/*
 * Compress 'in_nbytes' bytes from 'in' as a sequence of GDEFLATE pages.
 *
 * The input is cut into chunks of at most GDEFLATE_PAGE_SIZE bytes and each
 * chunk is compressed into the matching entry of 'out_pages'.
 *
 * c:		compressor to use
 * in:		input data
 * in_nbytes:	number of input bytes
 * out_pages:	output page descriptors; on entry each 'nbytes' is the
 *		capacity of 'data', on return it is the compressed size of
 *		that page
 * out_npages:	number of entries in 'out_pages'; must equal the page count
 *		reported by libdeflate_gdeflate_compress_bound()
 *
 * Returns the total number of compressed bytes, or 0 on failure: NULL
 * 'out_pages', wrong page count, empty input, a page buffer smaller than the
 * per-page bound, or a page that did not fit.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress(struct libdeflate_compressor *c,
			     const void *in, size_t in_nbytes,
			     struct libdeflate_gdeflate_out_page* out_pages,
			     size_t out_npages)
{
	const u8 * in_bytes = in;
	size_t out_nbytes = 0;
	size_t npages;
	size_t upper_bound = libdeflate_gdeflate_compress_bound(c,
		in_nbytes, &npages);
	size_t page_upper_bound;

	/*
	 * Validate before dividing: an empty input yields npages == 0, and
	 * dividing the bound by it would be a division by zero.
	 */
	if (unlikely(npages == 0 || out_pages == NULL || out_npages != npages))
		return 0;

	page_upper_bound = upper_bound / npages;

	for (size_t page = 0; page < npages; page++) {
		size_t comp_page_nbytes;
		/* All pages are full-sized except possibly the last one. */
		const size_t page_nbytes = in_nbytes > GDEFLATE_PAGE_SIZE ?
			GDEFLATE_PAGE_SIZE : in_nbytes;

		/* Each output page must be able to hold a worst-case page. */
		if (unlikely(out_pages[page].nbytes < page_upper_bound))
			return 0;

		comp_page_nbytes = (*c->impl)(c, in_bytes, page_nbytes,
			out_pages[page].data, page_upper_bound);

		out_pages[page].nbytes = comp_page_nbytes;

		/* Page did not fit - bail out. */
		if (unlikely(comp_page_nbytes == 0))
			return 0;

		in_bytes += page_nbytes;
		in_nbytes -= page_nbytes;
		out_nbytes += comp_page_nbytes;
	}

	return out_nbytes;
}
/*
 * Compress caller-defined pages one at a time.
 *
 * Page i of 'in_pages' is compressed into page i of 'out_pages'.  On entry
 * out_pages[i].nbytes is the capacity of out_pages[i].data; it is overwritten
 * with the compressed size of that page (0 for a page that did not fit).
 *
 * Returns the number of pages compressed successfully: 'npages' when all of
 * them fit, otherwise the index of the first page that failed.  Returns 0 if
 * either page array is NULL.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress_ex(struct libdeflate_compressor *c,
				const struct libdeflate_gdeflate_in_page* in_pages,
				struct libdeflate_gdeflate_out_page* out_pages, size_t npages)
{
	size_t done = 0;

	if (unlikely(in_pages == NULL || out_pages == NULL))
		return 0;

	while (done < npages) {
		const size_t written = (*c->impl)(c,
						  in_pages[done].data,
						  in_pages[done].nbytes,
						  out_pages[done].data,
						  out_pages[done].nbytes);

		out_pages[done].nbytes = written;

		/* A zero result means the page did not fit: stop here. */
		if (unlikely(written == 0))
			break;

		done++;
	}

	return done;
}
/*
 * Release a compressor obtained from libdeflate_alloc_gdeflate_compressor().
 * The pointer is handed straight to libdeflate_aligned_free().
 */
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_gdeflate_compressor(struct libdeflate_compressor *c)
{
	libdeflate_aligned_free(c);
}
/*
 * Worst-case compressed size for 'in_nbytes' bytes of input.
 *
 * The input is counted in pages of GDEFLATE_PAGE_SIZE bytes (rounded up).
 * Each page is budgeted the DEFLATE bound for a full page plus
 * (NUM_STREAMS * BITS_PER_PACKET) / 8 extra bytes.
 *
 * out_npages:	if non-NULL, receives the number of pages.
 *
 * Returns the per-page bound multiplied by the page count (0 for empty
 * input).
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress_bound(struct libdeflate_compressor *c,
				   size_t in_nbytes, size_t *out_npages)
{
	const size_t deflate_bound =
		libdeflate_deflate_compress_bound(c, GDEFLATE_PAGE_SIZE);
	const size_t page_count = DIV_ROUND_UP(in_nbytes, GDEFLATE_PAGE_SIZE);
	const size_t per_page =
		deflate_bound + (NUM_STREAMS * BITS_PER_PACKET) / 8;

	if (out_npages != NULL)
		*out_npages = page_count;

	return per_page * page_count;
}

View file

@ -0,0 +1,134 @@
/*
* gdeflate_decompress.c - a decompressor for GDEFLATE
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
#define WITH_GDEFLATE
#define HIDE_INTERFACE
#define libdeflate_decompressor libdeflate_gdeflate_decompressor
#include "deflate_decompress.c"
/*
 * This is the main GDEFLATE decompression routine.  See libdeflate.h for the
 * documentation.
 *
 * The pages in 'in_pages' are decompressed in order into one contiguous
 * output buffer; each page's output starts where the previous one ended.
 * The per-page work is done by decompress_impl() (defined elsewhere; the
 * real code is in gdeflate_decompress_template.h).
 *
 * d:			decompressor state
 * in_pages:		compressed pages (data pointer + byte count each)
 * in_npages:		number of pages; must be non-zero
 * out:			output buffer
 * out_nbytes_avail:	capacity of 'out'
 * actual_out_nbytes_ret: if non-NULL, set to the total number of bytes
 *			written.  It is reset to 0 once the arguments have
 *			been validated, so callers need not pre-initialise it;
 *			if a later page fails it holds the bytes produced by
 *			the pages that succeeded.
 *
 * Returns LIBDEFLATE_BAD_DATA for a NULL or empty page list, the first
 * non-success result from decompress_impl(), or LIBDEFLATE_SUCCESS.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gdeflate_decompress(struct libdeflate_decompressor * restrict d,
			       struct libdeflate_gdeflate_in_page *in_pages,
			       size_t in_npages, void * restrict out,
			       size_t out_nbytes_avail,
			       size_t *actual_out_nbytes_ret)
{
	u8 * restrict out_bytes = out;

	if (unlikely(in_pages == NULL || in_npages == 0))
		return LIBDEFLATE_BAD_DATA;

	/*
	 * The loop below accumulates into the out-parameter, so start it from
	 * a known value instead of whatever the caller's variable contained.
	 */
	if (actual_out_nbytes_ret)
		*actual_out_nbytes_ret = 0;

	for (size_t npage = 0; npage < in_npages; npage++) {
		size_t page_out_nbytes_ret, page_in_nbytes_ret;
		enum libdeflate_result res;

		res = decompress_impl(d, in_pages[npage].data,
				      in_pages[npage].nbytes, out_bytes,
				      out_nbytes_avail, &page_in_nbytes_ret,
				      &page_out_nbytes_ret);

		if (unlikely(res != LIBDEFLATE_SUCCESS))
			return res;

		/* The next page continues right after this page's output. */
		out_bytes += page_out_nbytes_ret;
		out_nbytes_avail -= page_out_nbytes_ret;

		if (actual_out_nbytes_ret)
			*actual_out_nbytes_ret += page_out_nbytes_ret;
	}

	return LIBDEFLATE_SUCCESS;
}
/*
 * Decompress 'npages' GDEFLATE pages, each into its own output page.
 *
 * Page i of 'in_pages' is decompressed into out_pages[i].data, whose capacity
 * is out_pages[i].nbytes.  The per-page byte counts reported by
 * decompress_impl() are not passed back to the caller.
 *
 * Returns LIBDEFLATE_BAD_DATA if either array is NULL or 'npages' is 0, the
 * first non-success result from decompress_impl(), or LIBDEFLATE_SUCCESS.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gdeflate_decompress_ex(struct libdeflate_gdeflate_decompressor * restrict d,
				  struct libdeflate_gdeflate_in_page * restrict in_pages,
				  struct libdeflate_gdeflate_out_page * restrict out_pages, size_t npages)
{
	size_t idx;

	if (unlikely(npages == 0 || in_pages == NULL || out_pages == NULL))
		return LIBDEFLATE_BAD_DATA;

	for (idx = 0; idx < npages; idx++) {
		size_t consumed, produced;
		const enum libdeflate_result status =
			decompress_impl(d,
					in_pages[idx].data, in_pages[idx].nbytes,
					out_pages[idx].data, out_pages[idx].nbytes,
					&consumed, &produced);

		if (unlikely(status != LIBDEFLATE_SUCCESS))
			return status;
	}

	return LIBDEFLATE_SUCCESS;
}
/*
 * Allocate a GDEFLATE decompressor.
 *
 * Strictly speaking only part of the state needs initialising:
 *
 * - 'static_codes_loaded' must start out false.
 *
 * - The first half of the main portion of each decode table must hold some
 *   value, so that table expansion in build_decode_table() never reads
 *   uninitialised memory.  (This mostly matters for tools such as valgrind,
 *   because build_decode_table() fills in every entry eventually.)
 *
 * For simplicity the whole structure is zeroed instead.
 *
 * Returns the new decompressor, or NULL if allocation fails.
 */
LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
libdeflate_alloc_gdeflate_decompressor(void)
{
	struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d));

	if (d != NULL)
		memset(d, 0, sizeof(*d));

	return d;
}
/*
 * Release a decompressor obtained from
 * libdeflate_alloc_gdeflate_decompressor().  The pointer is handed straight
 * to libdeflate_free().
 */
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_gdeflate_decompressor(struct libdeflate_decompressor *d)
{
	libdeflate_free(d);
}

View file

@ -0,0 +1,611 @@
/*
* gdeflate_decompress_template.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is the actual GDEFLATE decompression routine, lifted out of
* gdeflate_decompress.c so that it can be compiled multiple times with
* different target instruction sets.
*/
/*
 * NOTE: the macros below expand inside the function that uses them and refer
 * to that function's local variables by name: 's' (pointer to the
 * struct gdeflate_state), 'in_next' (input cursor) and 'is_copy' (deferred-copy
 * bitmask).  Renaming any of those locals breaks the macros.
 */
/*
 * Does the bit buffer of the current stream (s->idx) hold at least 'n' bits?
 */
#undef HAVE_BITS
#define HAVE_BITS(n) (s->bitsleft[s->idx] >= (n))
/*
 * If the current stream holds fewer than 'n' bits, append one packet to its
 * bit buffer: a little-endian 32-bit word is read from 'in_next', 'in_next'
 * is advanced by BITS_PER_PACKET/8 bytes and the stream's bit count grows by
 * BITS_PER_PACKET.  At most one packet is loaded per invocation, and the
 * macro itself does not compare 'in_next' against the end of the input.
 * 'n' cannot be too large; see MAX_ENSURE and CAN_ENSURE().
 */
#undef ENSURE_BITS
#define ENSURE_BITS(n) \
if (!HAVE_BITS(n)) { \
s->bitbuf[s->idx] |= (bitbuf_t)(get_unaligned_le32(in_next)) << \
s->bitsleft[s->idx]; \
in_next += BITS_PER_PACKET/8; \
s->bitsleft[s->idx] += BITS_PER_PACKET; \
}
/*
 * Return the next 'n' bits of the current stream's bit buffer without
 * removing them.
 */
#undef BITS
#define BITS(n) ((u32)s->bitbuf[s->idx] & (((u32)1 << (n)) - 1))
/*
 * Remove the next 'n' bits from the current stream's bit buffer.
 */
#undef REMOVE_BITS
#define REMOVE_BITS(n) (s->bitbuf[s->idx] >>= (n), s->bitsleft[s->idx] -= (n))
/*
 * Select how the deferred-copy mask is advanced, depending on the number of
 * streams.  Only 32 streams are supported: the mask is rotated right by one
 * so that bit 0 always describes the current stream.  For any other value a
 * build message is printed and ADVANCE_COPIES() is left undefined.
 */
#if (NUM_STREAMS == 32)
#define ADVANCE_COPIES() is_copy = _rotr(is_copy, 1)
#else
# pragma message("Invalid number of GDeflate streams used!")
#endif
/*
 * Reset the GDeflate stream index to the first stream.
 */
#define RESET() s->idx = 0
/*
 * Advance to the next GDeflate stream (wrapping around after the last one).
 * The stream being left is refilled first if it is below LOW_WATERMARK_BITS,
 * and the deferred-copy mask is advanced in step with the index.
 */
#define ADVANCE() do { \
ENSURE_BITS(LOW_WATERMARK_BITS); \
s->idx = (s->idx + 1)%NUM_STREAMS; \
ADVANCE_COPIES(); \
} while(0)
/*
 * Tells whether the current GDeflate stream has a deferred copy pending
 * (bit 0 of the mask).
 */
#define IS_COPY() (is_copy&1)
/*
 * Stores a deferred copy (match length and output position) in the slot of
 * the current GDeflate stream and marks it as pending.
 */
#define STORE_COPY(len, out) do { \
s->copies[s->idx].length = len; \
s->copies[s->idx].out_next = out; \
is_copy |= 1; \
} while(0)
/*
 * Marks the deferred copy of the current GDeflate stream as complete.
 */
#define COPY_COMPLETE() (is_copy &= ~1)
/*
* Prevent multiple type declarations.
*/
/*
 * This header can be included several times; the types are declared only on
 * the first inclusion.
 */
#ifndef GDEFLATE_TYPES_DECLARED
/*
 * Type of the deferred-copy mask ('is_copy'), chosen according to the number
 * of streams.  Only 32 streams are supported; for any other value a build
 * message is printed and is_copy_t is left undeclared.
 */
#if (NUM_STREAMS == 32)
typedef u32 is_copy_t;
#else
# pragma message("Invalid number of GDeflate streams used!")
#endif
/*
 * GDeflate deferred copy state structure: a match whose length has been
 * decoded but whose offset has not been read yet.
 */
struct gdeflate_deferred_copy {
/* Match length in bytes. */
unsigned length;
/* Position in the output buffer where the match is to be written. */
u8 * out_next;
};
/*
 * GDeflate state structure: one bit buffer and one deferred-copy slot per
 * stream, plus the index of the stream currently being read.
 */
struct gdeflate_state {
/* Buffered, not yet consumed bits of each stream. */
bitbuf_t bitbuf[NUM_STREAMS];
/* Number of valid bits in the corresponding bitbuf[] entry. */
unsigned bitsleft[NUM_STREAMS];
/* Deferred copy of each stream; meaningful only while marked in is_copy. */
struct gdeflate_deferred_copy copies[NUM_STREAMS];
/* Index of the current stream, in [0, NUM_STREAMS). */
unsigned idx;
};
/*
 * Prevent multiple type declarations.
 */
#define GDEFLATE_TYPES_DECLARED
#endif /* GDEFLATE_TYPES_DECLARED */
/*
 * Specialize the gdeflate_do_copy function name, so that every inclusion of
 * this template (one per FUNCNAME) gets its own copy of the helper.
 */
#define DO_COPY CONCAT(FUNCNAME, _gdeflate_do_copy)
/*
 * Perform the deferred GDeflate copy stored for the current stream.
 *
 * The match length and destination were recorded earlier with STORE_COPY();
 * this routine decodes the match offset from the current stream's bits and
 * then copies the match bytes.
 *
 * d:		decompressor; supplies offset_decode_table
 * s:		stream state; s->idx selects the stream and the copy slot
 * out:		start of the output buffer (lower bound for the match source)
 * out_end:	end of the output buffer (used to decide whether the word-wise
 *		fast path may overrun the match)
 *
 * Returns LIBDEFLATE_SUCCESS.  SAFETY_CHECK() is defined elsewhere;
 * presumably it returns an error code from this function when the offset
 * points before 'out' - confirm against its definition.
 *
 * NOTE(review): 'tmp32' is not referenced directly here; presumably it is
 * the scratch variable used by POP_BITS() - confirm before removing it.
 */
static forceinline enum libdeflate_result ATTRIBUTES
DO_COPY(struct libdeflate_decompressor * restrict d,
struct gdeflate_state * s, u8 * out, u8 * const out_end)
{
u32 entry;
u32 offset;
const u8 *src;
u8 *dst;
u32 tmp32;
/* Pop match params. */
u32 length = s->copies[s->idx].length;
u8 * out_next = s->copies[s->idx].out_next;
/* Decode the match offset. */
entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Offset subtable required (uncommon case) */
REMOVE_BITS(OFFSET_TABLEBITS);
entry = d->offset_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
entry >>= HUFFDEC_RESULT_SHIFT;
/* Pop the extra offset bits and add them to the offset base to
* produce the full offset. */
offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
/* The match source must not begin before the beginning of the
* output buffer. */
SAFETY_CHECK(offset <= out_next - (const u8 *)out);
/*
* Copy the match: 'length' bytes at 'out_next - offset' to
* 'out_next', possibly overlapping. If the match doesn't end
* too close to the end of the buffer and offset >= WORDBYTES ||
* offset == 1, take a fast path which copies a word at a time
* -- potentially more than the length of the match, but that's
* fine as long as we check for enough extra space.
*
* The remaining cases are not performance-critical so are
* handled by a simple byte-by-byte copy.
*/
src = out_next - offset;
dst = out_next;
/* From here on 'out_next' is the end of the match, i.e. the copy limit. */
out_next += length;
if (UNALIGNED_ACCESS_IS_FAST &&
likely(out_end - out_next >= WORDBYTES && length >= WORDBYTES)) {
if (offset >= WORDBYTES) { /* words don't overlap? */
while (dst < out_next - WORDBYTES) {
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
}
/* Tail. */
while (dst < out_next) *dst++ = *src++;
} else if (offset == 1) {
/* RLE encoding of previous byte, common if the
* data contains many repeated bytes */
machine_word_t v = repeat_byte(*src);
while (dst < out_next - WORDBYTES) {
store_word_unaligned(v, dst);
dst += WORDBYTES;
}
/* Tail. */
while (dst < out_next) *dst++ = (u8)v;
} else {
/* Overlapping words: copy byte by byte (at least 3 bytes). */
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
} else {
/* Slow path; the unrolled copy relies on matches being >= 3 bytes. */
STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
return LIBDEFLATE_SUCCESS;
}
/*
 * Decompress one GDEFLATE page.
 *
 * FUNCNAME and ATTRIBUTES are supplied by the file that includes this
 * template (both are #undef'd again at the end of the header).
 *
 * d:			decompressor; its code-length arrays and decode tables
 *			are rebuilt for each block
 * in, in_nbytes:	compressed input
 * out, out_nbytes_avail: output buffer and its capacity
 * actual_in_nbytes_ret: if non-NULL, receives the number of input bytes
 *			consumed (in_next - in) on success
 * actual_out_nbytes_ret: if non-NULL, receives the number of bytes written on
 *			success; if NULL, the output buffer must be filled
 *			exactly, otherwise LIBDEFLATE_SHORT_OUTPUT is returned
 *
 * Returns LIBDEFLATE_SUCCESS, LIBDEFLATE_INSUFFICIENT_SPACE,
 * LIBDEFLATE_SHORT_OUTPUT, a non-success result from DO_COPY(), or whatever
 * SAFETY_CHECK() returns on failure (defined elsewhere; presumably
 * LIBDEFLATE_BAD_DATA - confirm).
 *
 * The macros used below (BITS, ENSURE_BITS, ADVANCE, IS_COPY, ...) refer to
 * the locals 's', 'in_next' and 'is_copy' by name.  'tmp32' is not referenced
 * directly; presumably it is the scratch variable of POP_BITS() - confirm
 * before removing it.
 *
 * NOTE(review): 'in_end' is consulted only by the uncompressed-block checks;
 * the packet refills done through ENSURE_BITS() do not test 'in_next' against
 * it.  Verify that callers guarantee well-formed or padded input.
 */
static enum libdeflate_result ATTRIBUTES
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes_avail;
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
struct gdeflate_state state;
struct gdeflate_state * s = &state;
unsigned i;
unsigned is_final_block;
unsigned block_type;
u16 len;
unsigned num_litlen_syms;
unsigned num_offset_syms;
u32 tmp32;
/* One bit per stream; bit 0 belongs to the current stream. */
is_copy_t is_copy = 0;
/* Starting to read GDeflate stream. */
RESET();
/* Clear every stream's state; ADVANCE() refills the stream it leaves
* when it is below the low watermark, and after NUM_STREAMS steps the
* index is back at 0. */
for (unsigned n = 0; n < NUM_STREAMS; n++) {
s->bitbuf[n] = 0;
s->bitsleft[n] = 0;
s->copies[n].length = 0;
ADVANCE();
}
next_block:
/* Starting to read the next block. */
RESET();
/* BFINAL: 1 bit */
is_final_block = POP_BITS(1);
/* BTYPE: 2 bits */
block_type = POP_BITS(2);
ENSURE_BITS(LOW_WATERMARK_BITS);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
/* Dynamic Huffman block. */
/* The order in which precode lengths are stored. */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
/* Read the codeword length counts. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
num_litlen_syms = POP_BITS(5) + 257;
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
num_offset_syms = POP_BITS(5) + 1;
STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
num_explicit_precode_lens = POP_BITS(4) + 4;
/* The length arrays are about to be overwritten, so any static
* code tables built earlier are no longer valid. */
d->static_codes_loaded = false;
ENSURE_BITS(LOW_WATERMARK_BITS);
/* Read the precode codeword lengths. */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
/* Each length is read from the next stream in turn. */
for (i = 0; i < num_explicit_precode_lens; i++) {
d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
ADVANCE();
}
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
/* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
RESET();
/* Expand the literal/length and offset codeword lengths. */
for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
u32 entry;
unsigned presym;
u8 rep_val;
unsigned rep_count;
/* (The code below assumes that the precode decode table
* does not have any subtables.) */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
/* Read the next precode symbol. */
entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
presym = entry >> HUFFDEC_RESULT_SHIFT;
if (presym < 16) {
/* Explicit codeword length */
d->u.l.lens[i++] = presym;
ADVANCE();
continue;
}
/* Run-length encoded codeword lengths */
/* Note: we don't need to verify that the repeat count
* doesn't overflow the number of elements, since we
* have enough extra spaces to allow for the worst-case
* overflow (138 zeroes when only 1 length was
* remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
* number of entries. That gets rid of branches that
* would otherwise be required.
*
* It is not just because of the numerical order that
* our checks go in the order 'presym < 16', 'presym ==
* 16', and 'presym == 17'. For typical data this is
* ordered from most frequent to least frequent case.
*/
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
/* Repeat the previous length 3 - 6 times */
SAFETY_CHECK(i != 0);
rep_val = d->u.l.lens[i - 1];
STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
rep_count = 3 + POP_BITS(2);
d->u.l.lens[i + 0] = rep_val;
d->u.l.lens[i + 1] = rep_val;
d->u.l.lens[i + 2] = rep_val;
d->u.l.lens[i + 3] = rep_val;
d->u.l.lens[i + 4] = rep_val;
d->u.l.lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
/* Repeat zero 3 - 10 times */
STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
rep_count = 3 + POP_BITS(3);
d->u.l.lens[i + 0] = 0;
d->u.l.lens[i + 1] = 0;
d->u.l.lens[i + 2] = 0;
d->u.l.lens[i + 3] = 0;
d->u.l.lens[i + 4] = 0;
d->u.l.lens[i + 5] = 0;
d->u.l.lens[i + 6] = 0;
d->u.l.lens[i + 7] = 0;
d->u.l.lens[i + 8] = 0;
d->u.l.lens[i + 9] = 0;
i += rep_count;
} else {
/* Repeat zero 11 - 138 times */
STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
rep_count = 11 + POP_BITS(7);
memset(&d->u.l.lens[i], 0,
rep_count * sizeof(d->u.l.lens[i]));
i += rep_count;
}
ADVANCE();
}
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
/* Uncompressed block: copy 'len' bytes literally from the input
* buffer to the output buffer. */
/* Count bits in the bit buffers. */
u32 num_buffered_bits = 0;
for (u32 n = 0; n < NUM_STREAMS; n++)
num_buffered_bits += s->bitsleft[n];
/* The 16-bit length must be available in the unread input plus
* the bits already buffered. */
SAFETY_CHECK(in_end - in_next + (num_buffered_bits + 7)/8 >= 2);
len = POP_BITS(16);
if (unlikely(len > out_end - out_next))
return LIBDEFLATE_INSUFFICIENT_SPACE;
SAFETY_CHECK(len <= in_end - in_next + (num_buffered_bits + 7)/8);
/* The literal bytes are spread over the streams: take one byte
* from the current stream, then move to the next one. */
while (len) {
*out_next++ = POP_BITS(8);
len--;
ADVANCE();
}
goto block_done;
} else {
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/*
* Static Huffman block: build the decode tables for the static
* codes. Skip doing so if the tables are already set up from
* an earlier static block; this speeds up decompression of
* degenerate input of many empty or very short static blocks.
*
* Afterwards, the remainder is the same as decompressing a
* dynamic Huffman block.
*/
if (d->static_codes_loaded)
goto have_decode_tables;
d->static_codes_loaded = true;
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
for (i = 0; i < 144; i++)
d->u.l.lens[i] = 8;
for (; i < 256; i++)
d->u.l.lens[i] = 9;
for (; i < 280; i++)
d->u.l.lens[i] = 7;
for (; i < 288; i++)
d->u.l.lens[i] = 8;
for (; i < 288 + 32; i++)
d->u.l.lens[i] = 5;
num_litlen_syms = 288;
num_offset_syms = 32;
}
/* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
have_decode_tables:
RESET();
/* The main GDEFLATE decode loop */
/* Each iteration serves the current stream: either it decodes a litlen
* symbol, or - if that stream has a deferred copy pending - it decodes
* the offset and performs the copy. */
for (;;) {
u32 entry;
u32 length;
if (likely(!IS_COPY())) {
/* Decode a litlen symbol. */
entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Litlen subtable required (uncommon case) */
REMOVE_BITS(LITLEN_TABLEBITS);
entry = d->u.litlen_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
if (entry & HUFFDEC_LITERAL) {
/* Literal */
if (unlikely(out_next == out_end))
return LIBDEFLATE_INSUFFICIENT_SPACE;
*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
ADVANCE();
continue;
}
/* Match or end-of-block */
entry >>= HUFFDEC_RESULT_SHIFT;
/* Pop the extra length bits and add them to the length base to
* produce the full length. */
length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
/* The match destination must not end after the end of the
* output buffer. For efficiency, combine this check with the
* end-of-block check. We're using 0 for the special
* end-of-block length, so subtracting 1 turns it into
* SIZE_MAX. */
STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
if (unlikely((size_t)length - 1 >= out_end - out_next)) {
if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
return LIBDEFLATE_INSUFFICIENT_SPACE;
goto block_done;
}
/* Store copy for use later. */
STORE_COPY(length, out_next);
/* Advance output stream: the space for the match is reserved
* now, the bytes are written when the copy is performed. */
out_next += length;
} else {
enum libdeflate_result res =
DO_COPY(d, s, out, out_end);
if (unlikely(res))
return res;
COPY_COMPLETE();
}
ADVANCE();
}
block_done:
/* Run the outstanding deferred copies. */
/* One full round over the streams, starting at the current one. */
for (unsigned n = 0; n < NUM_STREAMS; n++) {
if (IS_COPY()) {
enum libdeflate_result res =
DO_COPY(d, s, out, out_end);
if (unlikely(res))
return res;
COPY_COMPLETE();
}
ADVANCE();
}
/* Finished decoding a block. */
if (!is_final_block)
goto next_block;
/* That was the last block. */
/* Optionally return the actual number of bytes read */
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
/* Optionally return the actual number of bytes written */
if (actual_out_nbytes_ret) {
*actual_out_nbytes_ret = out_next - (u8 *)out;
} else {
/* Without a size out-parameter the buffer must be filled exactly. */
if (out_next != out_end)
return LIBDEFLATE_SHORT_OUTPUT;
}
return LIBDEFLATE_SUCCESS;
}
#undef FUNCNAME
#undef ATTRIBUTES
#undef IS_COPY
#undef DO_COPY
#undef ADVANCE_COPIES
#undef STORE_COPY
#undef COPY_COMPLETE
#undef RESET
#undef ADVANCE

View file

@ -0,0 +1,95 @@
/*
* gzip_compress.c - compress with a gzip wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "deflate_compress.h"
#include "gzip_constants.h"
#include "unaligned.h"
#include "libdeflate.h"
/*
 * Compress 'in' into 'out' using the gzip container format.
 *
 * The output is a 10-byte header with no optional fields, followed by the
 * DEFLATE stream and an 8-byte footer (CRC-32 of the input, then the input
 * size truncated to 32 bits).
 *
 * Returns the total number of bytes written, or 0 if 'out_nbytes_avail' does
 * not exceed GZIP_MIN_OVERHEAD or the DEFLATE stream does not fit.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gzip_compress(struct libdeflate_compressor *c,
			 const void *in, size_t in_nbytes,
			 void *out, size_t out_nbytes_avail)
{
	u8 * const out_start = out;
	u8 *cursor = out_start;
	unsigned level;
	size_t deflate_nbytes;

	if (out_nbytes_avail <= GZIP_MIN_OVERHEAD)
		return 0;

	/* Header: ID1, ID2, CM, FLG (no optional fields), MTIME. */
	*cursor++ = GZIP_ID1;
	*cursor++ = GZIP_ID2;
	*cursor++ = GZIP_CM_DEFLATE;
	*cursor++ = 0;
	put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, cursor);
	cursor += 4;

	/* XFL: hint derived from the compression level. */
	level = deflate_get_compression_level(c);
	*cursor++ = (level < 2) ? GZIP_XFL_FASTEST_COMPRESSION :
		    (level >= 8) ? GZIP_XFL_SLOWEST_COMPRESSION : 0;

	/* OS */
	*cursor++ = GZIP_OS_UNKNOWN;

	/* Compressed data; the space for header and footer stays reserved. */
	deflate_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, cursor,
					out_nbytes_avail - GZIP_MIN_OVERHEAD);
	if (deflate_nbytes == 0)
		return 0;
	cursor += deflate_nbytes;

	/* Footer: CRC32 followed by ISIZE. */
	put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), cursor);
	put_unaligned_le32((u32)in_nbytes, cursor + 4);

	return (cursor + 4 + 4) - out_start;
}
/*
 * Worst-case size of the gzip output for 'in_nbytes' bytes of input: the
 * DEFLATE bound plus the fixed header and footer overhead.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gzip_compress_bound(struct libdeflate_compressor *c,
			       size_t in_nbytes)
{
	const size_t deflate_bound =
		libdeflate_deflate_compress_bound(c, in_nbytes);

	return GZIP_MIN_OVERHEAD + deflate_bound;
}

View file

@ -0,0 +1,45 @@
/*
* gzip_constants.h - constants for the gzip wrapper format
*/
#ifndef LIB_GZIP_CONSTANTS_H
#define LIB_GZIP_CONSTANTS_H
/* Sizes in bytes: the header without optional fields (ID1, ID2, CM, FLG,
 * 4-byte MTIME, XFL, OS), the footer (4-byte CRC32 + 4-byte ISIZE), and
 * their sum, the smallest possible wrapper overhead. */
#define GZIP_MIN_HEADER_SIZE 10
#define GZIP_FOOTER_SIZE 8
#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
/* The two identification bytes at the start of the header. */
#define GZIP_ID1 0x1F
#define GZIP_ID2 0x8B
/* Value of the CM (compression method) byte for DEFLATE. */
#define GZIP_CM_DEFLATE 8
/* Bits of the FLG byte.  FEXTRA, FNAME, FCOMMENT and FHCRC announce optional
 * header fields; input with any FRESERVED bit set is rejected by the
 * decompressor. */
#define GZIP_FTEXT 0x01
#define GZIP_FHCRC 0x02
#define GZIP_FEXTRA 0x04
#define GZIP_FNAME 0x08
#define GZIP_FCOMMENT 0x10
#define GZIP_FRESERVED 0xE0
/* MTIME value written when no modification time is recorded. */
#define GZIP_MTIME_UNAVAILABLE 0
/* Values of the XFL byte. */
#define GZIP_XFL_SLOWEST_COMPRESSION 0x02
#define GZIP_XFL_FASTEST_COMPRESSION 0x04
/* Values of the OS byte. */
#define GZIP_OS_FAT 0
#define GZIP_OS_AMIGA 1
#define GZIP_OS_VMS 2
#define GZIP_OS_UNIX 3
#define GZIP_OS_VM_CMS 4
#define GZIP_OS_ATARI_TOS 5
#define GZIP_OS_HPFS 6
#define GZIP_OS_MACINTOSH 7
#define GZIP_OS_Z_SYSTEM 8
#define GZIP_OS_CP_M 9
#define GZIP_OS_TOPS_20 10
#define GZIP_OS_NTFS 11
#define GZIP_OS_QDOS 12
#define GZIP_OS_RISCOS 13
#define GZIP_OS_UNKNOWN 255
#endif /* LIB_GZIP_CONSTANTS_H */

View file

@ -0,0 +1,148 @@
/*
* gzip_decompress.c - decompress with a gzip wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "gzip_constants.h"
#include "unaligned.h"
#include "libdeflate.h"
/*
 * Decompress a gzip member.
 *
 * The header is validated (magic bytes, compression method, reserved flag
 * bits) and any optional fields announced by FLG are skipped.  The DEFLATE
 * stream is then decompressed, and the footer's CRC-32 and ISIZE are checked
 * against the produced data.
 *
 * actual_in_nbytes_ret:  if non-NULL, receives the number of input bytes
 *			  consumed, footer included.
 * actual_out_nbytes_ret: passed through to
 *			  libdeflate_deflate_decompress_ex(); if NULL, the
 *			  output size is taken to be 'out_nbytes_avail'.
 *
 * Returns LIBDEFLATE_BAD_DATA for malformed or truncated input or a footer
 * mismatch, any non-success result of the DEFLATE decompressor, or
 * LIBDEFLATE_SUCCESS.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
			      const void *in, size_t in_nbytes,
			      void *out, size_t out_nbytes_avail,
			      size_t *actual_in_nbytes_ret,
			      size_t *actual_out_nbytes_ret)
{
	const u8 * const in_start = in;
	const u8 * const in_end = in_start + in_nbytes;
	const u8 *cursor = in_start;
	enum libdeflate_result status;
	size_t deflate_in_nbytes;
	size_t produced;
	u8 flags;

	if (in_nbytes < GZIP_MIN_OVERHEAD)
		return LIBDEFLATE_BAD_DATA;

	/* ID1, ID2, CM */
	if (cursor[0] != GZIP_ID1 ||
	    cursor[1] != GZIP_ID2 ||
	    cursor[2] != GZIP_CM_DEFLATE)
		return LIBDEFLATE_BAD_DATA;

	/* FLG */
	flags = cursor[3];

	/* Skip ID1/ID2/CM/FLG, then MTIME, XFL and OS. */
	cursor += 4 + 4 + 1 + 1;

	if (flags & GZIP_FRESERVED)
		return LIBDEFLATE_BAD_DATA;

	/* Extra field: 16-bit length followed by that many bytes */
	if (flags & GZIP_FEXTRA) {
		const u16 xlen = get_unaligned_le16(cursor);

		cursor += 2;
		if (in_end - cursor < (u32)xlen + GZIP_FOOTER_SIZE)
			return LIBDEFLATE_BAD_DATA;
		cursor += xlen;
	}

	/* Original file name (zero terminated) */
	if (flags & GZIP_FNAME) {
		for (;;) {
			const u8 ch = *cursor++;

			if (ch == 0 || cursor == in_end)
				break;
		}
		if (in_end - cursor < GZIP_FOOTER_SIZE)
			return LIBDEFLATE_BAD_DATA;
	}

	/* File comment (zero terminated) */
	if (flags & GZIP_FCOMMENT) {
		for (;;) {
			const u8 ch = *cursor++;

			if (ch == 0 || cursor == in_end)
				break;
		}
		if (in_end - cursor < GZIP_FOOTER_SIZE)
			return LIBDEFLATE_BAD_DATA;
	}

	/* CRC16 for gzip header (skipped, not verified) */
	if (flags & GZIP_FHCRC) {
		cursor += 2;
		if (in_end - cursor < GZIP_FOOTER_SIZE)
			return LIBDEFLATE_BAD_DATA;
	}

	/* Compressed data: everything up to the footer */
	status = libdeflate_deflate_decompress_ex(d, cursor,
					in_end - GZIP_FOOTER_SIZE - cursor,
					out, out_nbytes_avail,
					&deflate_in_nbytes,
					actual_out_nbytes_ret);
	if (status != LIBDEFLATE_SUCCESS)
		return status;

	produced = actual_out_nbytes_ret ? *actual_out_nbytes_ret
					 : out_nbytes_avail;

	cursor += deflate_in_nbytes;

	/* CRC32 */
	if (libdeflate_crc32(0, out, produced) != get_unaligned_le32(cursor))
		return LIBDEFLATE_BAD_DATA;

	/* ISIZE */
	if ((u32)produced != get_unaligned_le32(cursor + 4))
		return LIBDEFLATE_BAD_DATA;

	cursor += 4 + 4;

	if (actual_in_nbytes_ret)
		*actual_in_nbytes_ret = cursor - in_start;

	return LIBDEFLATE_SUCCESS;
}
/*
 * Convenience wrapper around libdeflate_gzip_decompress_ex() for callers
 * that do not need the number of input bytes consumed.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
			   const void *in, size_t in_nbytes,
			   void *out, size_t out_nbytes_avail,
			   size_t *actual_out_nbytes_ret)
{
	size_t * const no_in_nbytes_ret = NULL;

	return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
					     out, out_nbytes_avail,
					     no_in_nbytes_ret,
					     actual_out_nbytes_ret);
}

View file

@ -0,0 +1,412 @@
/*
* hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* ---------------------------------------------------------------------------
*
* Algorithm
*
* This is a Hash Chains (hc) based matchfinder.
*
* The main data structure is a hash table where each hash bucket contains a
* linked list (or "chain") of sequences whose first 4 bytes share the same hash
* code. Each sequence is identified by its starting position in the input
* buffer.
*
* The algorithm processes the input buffer sequentially. At each byte
* position, the hash code of the first 4 bytes of the sequence beginning at
* that position (the sequence being matched against) is computed. This
* identifies the hash bucket to use for that position. Then, this hash
* bucket's linked list is searched for matches. Then, a new linked list node
* is created to represent the current sequence and is prepended to the list.
*
* This algorithm has several useful properties:
*
* - It only finds true Lempel-Ziv matches; i.e., those where the matching
* sequence occurs prior to the sequence being matched against.
*
* - The sequences in each linked list are always sorted by decreasing starting
* position. Therefore, the closest (smallest offset) matches are found
* first, which in many compression formats tend to be the cheapest to encode.
*
* - Although fast running time is not guaranteed due to the possibility of the
* lists getting very long, the worst degenerate behavior can be easily
* prevented by capping the number of nodes searched at each position.
*
* - If the compressor decides not to search for matches at a certain position,
* then that position can be quickly inserted without searching the list.
*
* - The algorithm is adaptable to sliding windows: just store the positions
* relative to a "base" value that is updated from time to time, and stop
* searching each list when the sequences get too far away.
*
* ----------------------------------------------------------------------------
*
* Optimizations
*
* The main hash table and chains handle length 4+ matches. Length 3 matches
* are handled by a separate hash table with no chains. This works well for
* typical "greedy" or "lazy"-style compressors, where length 3 matches are
* often only helpful if they have small offsets. Instead of searching a full
* chain for length 3+ matches, the algorithm just checks for one close length 3
* match, then focuses on finding length 4+ matches.
*
* The longest_match() and skip_positions() functions are inlined into the
* compressors that use them. This isn't just about saving the overhead of a
* function call. These functions are intended to be called from the inner
* loops of compressors, where giving the compiler more control over register
* allocation is very helpful. There is also significant benefit to be gained
* from allowing the CPU to predict branches independently at each call site.
* For example, "lazy"-style compressors can be written with two calls to
* longest_match(), each of which starts with a different 'best_len' and
* therefore has significantly different performance characteristics.
*
* Although any hash function can be used, a multiplicative hash is fast and
* works well.
*
* On some processors, it is significantly faster to extend matches by whole
* words (32 or 64 bits) instead of by individual bytes. For this to be the
* case, the processor must implement unaligned memory accesses efficiently and
* must have either a fast "find first set bit" instruction or a fast "find last
* set bit" instruction, depending on the processor's endianness.
*
* The code uses one loop for finding the first match and one loop for finding a
* longer match. Each of these loops is tuned for its respective task, and in
* combination they are faster than a single generalized loop that handles both
* tasks.
*
* The code also uses a tight inner loop that only compares the last and first
* bytes of a potential match. It is only when these bytes match that a full
* match extension is attempted.
*
* ----------------------------------------------------------------------------
*/
#ifndef LIB_HC_MATCHFINDER_H
#define LIB_HC_MATCHFINDER_H
#include "matchfinder_common.h"
/*
 * log2 of the number of buckets in the length-3 and length-4 hash tables.
 * These are the 'num_bits' values passed to lz_hash() when the 3-byte and
 * 4-byte sequence hashes are computed.
 */
#define HC_MATCHFINDER_HASH3_ORDER 15
#define HC_MATCHFINDER_HASH4_ORDER 16
/* Combined size, in bytes, of hash3_tab and hash4_tab (next_tab is excluded). */
#define HC_MATCHFINDER_TOTAL_HASH_SIZE \
(((1UL << HC_MATCHFINDER_HASH3_ORDER) + \
(1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
/*
 * State of the hash-chains matchfinder.
 *
 * The member order matters: hc_matchfinder_init() treats the first
 * HC_MATCHFINDER_TOTAL_HASH_SIZE bytes of this structure as one flat array of
 * mf_pos_t (i.e. hash3_tab followed directly by hash4_tab), and
 * hc_matchfinder_slide_window() treats the whole structure as one flat array
 * of mf_pos_t.  Do not reorder the members or add members of another type.
 */
struct hc_matchfinder {
/* The hash table for finding length 3 matches */
mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
/* The hash table which contains the first nodes of the linked lists for
* finding length 4+ matches */
mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
/* The "next node" references for the linked lists. The "next node" of
* the node for the sequence with position 'pos' is 'next_tab[pos]'. */
mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];
}
/* Apply the matchfinder memory alignment when the compiler supports it. */
#ifdef _aligned_attribute
_aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
#endif
;
/* Prepare the matchfinder for a new input buffer. */
static forceinline void
hc_matchfinder_init(struct hc_matchfinder *mf)
{
STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE %
MATCHFINDER_SIZE_ALIGNMENT == 0);
matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE);
}
static forceinline void
hc_matchfinder_slide_window(struct hc_matchfinder *mf)
{
STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
}
/*
 * Find the longest match longer than 'best_len' bytes.
 *
 * @mf
 *	The matchfinder structure.
 * @in_base_p
 *	Location of a pointer which points to the place in the input data the
 *	matchfinder currently stores positions relative to.  This may be updated
 *	by this function (it is advanced by MATCHFINDER_WINDOW_SIZE when the
 *	window is slid).
 * @in_next
 *	Pointer to the sequence being matched against.  The current position is
 *	computed internally as @in_next - *@in_base_p.
 * @best_len
 *	Require a match longer than this length.
 * @max_len
 *	The maximum permissible match length at this position.
 * @nice_len
 *	Stop searching if a match of at least this length is found.
 *	Must be <= @max_len.
 * @max_search_depth
 *	Limit on the number of potential matches to consider.  Must be >= 1.
 * @next_hashes
 *	The precomputed hash codes for the sequence beginning at @in_next:
 *	[0] is the length-3 hash and [1] is the length-4 hash.  These will be
 *	used and then updated with the precomputed hash codes for the sequence
 *	beginning at @in_next + 1.
 * @offset_ret
 *	The offset of the best match (@in_next minus the match pointer) is
 *	stored here.  It is always written; if no match was found it is 0.
 *
 * Return the length of the match found, or 'best_len' if no match longer than
 * 'best_len' was found.
 *
 * Note: when @max_len < 5 the function returns immediately; in that case the
 * current position is not inserted into the tables and @next_hashes is left
 * unchanged.
 */
static forceinline u32
hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
const u8 ** const restrict in_base_p,
const u8 * const restrict in_next,
u32 best_len,
const u32 max_len,
const u32 nice_len,
const u32 max_search_depth,
u32 * const restrict next_hashes,
u32 * const restrict offset_ret)
{
u32 depth_remaining = max_search_depth;
/* Starts equal to in_next so that "no match" yields an offset of 0. */
const u8 *best_matchptr = in_next;
mf_pos_t cur_node3, cur_node4;
u32 hash3, hash4;
u32 next_hashseq;
u32 seq4;
const u8 *matchptr;
u32 len;
u32 cur_pos = in_next - *in_base_p;
const u8 *in_base;
mf_pos_t cutoff;
/* A full window has been processed: rebase all stored positions. */
if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
hc_matchfinder_slide_window(mf);
*in_base_p += MATCHFINDER_WINDOW_SIZE;
cur_pos = 0;
}
in_base = *in_base_p;
/* Nodes at or below 'cutoff' are treated as the end of a chain. */
cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
goto out;
/* Get the precomputed hash codes. */
hash3 = next_hashes[0];
hash4 = next_hashes[1];
/* From the hash buckets, get the first node of each linked list. */
cur_node3 = mf->hash3_tab[hash3];
cur_node4 = mf->hash4_tab[hash4];
/* Update for length 3 matches. This replaces the singleton node in the
* 'hash3' bucket with the node for the current sequence. */
mf->hash3_tab[hash3] = cur_pos;
/* Update for length 4 matches. This prepends the node for the current
* sequence to the linked list in the 'hash4' bucket. */
mf->hash4_tab[hash4] = cur_pos;
mf->next_tab[cur_pos] = cur_node4;
/* Compute the next hash codes. */
next_hashseq = get_unaligned_le32(in_next + 1);
next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
prefetchw(&mf->hash3_tab[next_hashes[0]]);
prefetchw(&mf->hash4_tab[next_hashes[1]]);
if (best_len < 4) { /* No match of length >= 4 found yet? */
/* Check for a length 3 match if needed. */
if (cur_node3 <= cutoff)
goto out;
seq4 = load_u32_unaligned(in_next);
if (best_len < 3) {
matchptr = &in_base[cur_node3];
if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
best_len = 3;
best_matchptr = matchptr;
}
}
/* Check for a length 4 match. */
if (cur_node4 <= cutoff)
goto out;
for (;;) {
/* No length 4 match found yet. Check the first 4 bytes. */
matchptr = &in_base[cur_node4];
if (load_u32_unaligned(matchptr) == seq4)
break;
/* The first 4 bytes did not match. Keep trying. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
/* Found a match of length >= 4. Extend it to its full length. */
best_matchptr = matchptr;
best_len = lz_extend(in_next, best_matchptr, 4, max_len);
if (best_len >= nice_len)
goto out;
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
} else {
/* The caller already has a match of length >= 4; only look for a
* longer one, and only if the chain is non-empty and the existing
* match is not already "nice" enough. */
if (cur_node4 <= cutoff || best_len >= nice_len)
goto out;
}
/* Check for matches of length >= 5. */
for (;;) {
for (;;) {
matchptr = &in_base[cur_node4];
/* Already found a length 4 match. Try for a longer
* match; start by checking either the last 4 bytes and
* the first 4 bytes, or the last byte. (The last byte,
* the one which would extend the match length by 1, is
* the most important.) */
#if UNALIGNED_ACCESS_IS_FAST
if ((load_u32_unaligned(matchptr + best_len - 3) ==
load_u32_unaligned(in_next + best_len - 3)) &&
(load_u32_unaligned(matchptr) ==
load_u32_unaligned(in_next)))
#else
if (matchptr[best_len] == in_next[best_len])
#endif
break;
/* Continue to the next node in the list. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
/* In the fast-unaligned case the first 4 bytes were already verified
* above, so extension starts at 4; otherwise nothing but the byte at
* 'best_len' was checked, so extension starts from 0. */
#if UNALIGNED_ACCESS_IS_FAST
len = 4;
#else
len = 0;
#endif
len = lz_extend(in_next, matchptr, len, max_len);
if (len > best_len) {
/* This is the new longest match. */
best_len = len;
best_matchptr = matchptr;
if (best_len >= nice_len)
goto out;
}
/* Continue to the next node in the list. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
out:
/* Always report an offset; it is 0 when best_matchptr was never updated. */
*offset_ret = in_next - best_matchptr;
return best_len;
}
/*
 * Advance the matchfinder, but don't search for matches.
 *
 * @mf
 *	The matchfinder structure.
 * @in_base_p
 *	Location of a pointer which points to the place in the input data the
 *	matchfinder currently stores positions relative to.  This may be updated
 *	by this function (it is advanced by MATCHFINDER_WINDOW_SIZE each time
 *	the window is slid).
 * @in_next
 *	Pointer to the first position to skip.
 * @in_end
 *	Pointer to the end of the input buffer.
 * @count
 *	The number of bytes to advance.  Must be > 0.
 * @next_hashes
 *	The precomputed hash codes for the sequence beginning at @in_next:
 *	[0] is the length-3 hash and [1] is the length-4 hash.  These will be
 *	used and then updated with the precomputed hash codes for the sequence
 *	beginning at @in_next + @count.
 *
 * Returns @in_next + @count.
 *
 * Note: if fewer than @count + 5 bytes remain before @in_end, the function
 * returns immediately without inserting any position and without updating
 * @next_hashes.
 */
static forceinline const u8 *
hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
const u8 ** const restrict in_base_p,
const u8 *in_next,
const u8 * const in_end,
const u32 count,
u32 * const restrict next_hashes)
{
u32 cur_pos;
u32 hash3, hash4;
u32 next_hashseq;
u32 remaining = count;
/* Too close to the end of the buffer to hash every skipped position
* (each iteration reads 4 bytes starting one byte ahead): just advance. */
if (unlikely(count + 5 > in_end - in_next))
return &in_next[count];
cur_pos = in_next - *in_base_p;
hash3 = next_hashes[0];
hash4 = next_hashes[1];
do {
/* A full window has been processed: rebase all stored positions. */
if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
hc_matchfinder_slide_window(mf);
*in_base_p += MATCHFINDER_WINDOW_SIZE;
cur_pos = 0;
}
/* Insert the current position: replace the length-3 bucket entry and
* prepend to the length-4 chain. */
mf->hash3_tab[hash3] = cur_pos;
mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
mf->hash4_tab[hash4] = cur_pos;
/* Hash the sequence at the next position. */
next_hashseq = get_unaligned_le32(++in_next);
hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
cur_pos++;
} while (--remaining);
prefetchw(&mf->hash3_tab[hash3]);
prefetchw(&mf->hash4_tab[hash4]);
/* Hand the hashes for the new current position back to the caller. */
next_hashes[0] = hash3;
next_hashes[1] = hash4;
return in_next;
}
#endif /* LIB_HC_MATCHFINDER_H */

View file

@ -0,0 +1,67 @@
/*
* lib_common.h - internal header included by all library code
*/
#ifndef LIB_LIB_COMMON_H
#define LIB_LIB_COMMON_H
#ifdef LIBDEFLATE_H
# error "lib_common.h must always be included before libdeflate.h"
/* because BUILDING_LIBDEFLATE must be set first */
#endif
#define BUILDING_LIBDEFLATE
#include "../common/common_defs.h"
/*
* Prefix with "_libdeflate_" all global symbols which are not part of the API
* and don't already have a "libdeflate" prefix. This avoids exposing overly
* generic names when libdeflate is built as a static library.
*
* Note that the chosen prefix is not really important and can be changed
* without breaking library users. It was just chosen so that the resulting
* symbol names are unlikely to conflict with those from any other software.
* Also note that this fixup has no useful effect when libdeflate is built as a
* shared library, since these symbols are not exported.
*/
/* Build the prefixed name for an internal (non-API) global symbol. */
#define SYM_FIXUP(sym) _libdeflate_##sym
/* Internal globals that are renamed with the "_libdeflate_" prefix. */
#define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level)
#define _cpu_features SYM_FIXUP(_cpu_features)
#define setup_cpu_features SYM_FIXUP(setup_cpu_features)
/*
 * Internal memory allocation helpers used by library code.
 *
 * libdeflate_aligned_malloc() returns memory aligned to 'alignment' bytes (or
 * NULL on failure); such memory must be released with
 * libdeflate_aligned_free(), not libdeflate_free().
 */
void *libdeflate_malloc(size_t size);
void libdeflate_free(void *ptr);
void *libdeflate_aligned_malloc(size_t alignment, size_t size);
void libdeflate_aligned_free(void *ptr);
#ifdef FREESTANDING
/*
* With -ffreestanding, <string.h> may be missing, and we must provide
* implementations of memset(), memcpy(), memmove(), and memcmp().
* See https://gcc.gnu.org/onlinedocs/gcc/Standards.html
*
* Also, -ffreestanding disables interpreting calls to these functions as
* built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call,
* not be optimized to a single load instruction. For performance reasons we
* don't want that. So, declare these functions as macros that expand to the
* corresponding built-ins. This approach is recommended in the gcc man page.
* We still need the actual function definitions in case gcc calls them.
*/
/* Each prototype is declared first, then shadowed by a builtin-expanding macro. */
void *memset(void *s, int c, size_t n);
#define memset(s, c, n) __builtin_memset((s), (c), (n))
void *memcpy(void *dest, const void *src, size_t n);
#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n))
void *memmove(void *dest, const void *src, size_t n);
#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n))
int memcmp(const void *s1, const void *s2, size_t n);
#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n))
#else
/* Hosted build: use the C library's declarations. */
#include <string.h>
#endif
#endif /* LIB_LIB_COMMON_H */

View file

@ -0,0 +1,180 @@
/*
* matchfinder_common.h - common code for Lempel-Ziv matchfinding
*/
#ifndef LIB_MATCHFINDER_COMMON_H
#define LIB_MATCHFINDER_COMMON_H
#include "lib_common.h"
#include "unaligned.h"
/* The includer must choose the window size (as a power of two) beforehand. */
#ifndef MATCHFINDER_WINDOW_ORDER
# error "MATCHFINDER_WINDOW_ORDER must be defined!"
#endif
/* Window size in bytes. */
#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
/*
 * Type of a stored position.  It is signed: positions that have fallen out of
 * the window are represented by negative values (see matchfinder_rebase()).
 */
#ifdef DEFLATE64
typedef s32 mf_pos_t;
#else
typedef s16 mf_pos_t;
#endif
/* Initial value of every table entry: -WINDOW_SIZE converted to mf_pos_t. */
#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
/*
* Required alignment of the matchfinder buffer pointer and size. The values
* here come from the AVX-2 implementation, which is the worst case.
*/
#define MATCHFINDER_MEM_ALIGNMENT 32
#define MATCHFINDER_SIZE_ALIGNMENT 128
/*
 * Clear any previous definitions, then pull in an architecture-specific
 * header.  NOTE(review): presumably those headers may define
 * matchfinder_init / matchfinder_rebase as macros, since the generic versions
 * below are guarded by #ifndef -- confirm against the arch headers.
 */
#undef matchfinder_init
#undef matchfinder_rebase
#if defined(_aligned_attribute) && !defined(DEFLATE64)
# if defined(__arm__) || defined(__aarch64__)
# include "arm/matchfinder_impl.h"
# elif defined(__i386__) || defined(__x86_64__)
# include "x86/matchfinder_impl.h"
# endif
#endif
/*
 * Generic initializer for the hash-table portion of a matchfinder: every
 * mf_pos_t entry in the first 'size' bytes of 'data' is set to
 * MATCHFINDER_INITVAL.  In effect this is a typed memset().
 *
 * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and
 * 'size' (in bytes) must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
 *
 * Only compiled when no 'matchfinder_init' macro has been defined already.
 */
#ifndef matchfinder_init
static forceinline void
matchfinder_init(mf_pos_t *data, size_t size)
{
	mf_pos_t *entry = data;
	mf_pos_t * const end = data + size / sizeof(*data);

	while (entry != end)
		*entry++ = MATCHFINDER_INITVAL;
}
#endif
/*
* Slide the matchfinder by WINDOW_SIZE bytes.
*
* This must be called just after each WINDOW_SIZE bytes have been run through
* the matchfinder.
*
* This will subtract WINDOW_SIZE bytes from each entry in the array specified.
* The effect is that all entries are updated to be relative to the current
* position, rather than the position WINDOW_SIZE bytes prior.
*
* Underflow is detected and replaced with signed saturation. This ensures that
* once the sliding window has passed over a position, that position forever
* remains out of bounds.
*
* The array passed in must contain all matchfinder data that is
* position-relative. Concretely, this will include the hash table as well as
* the table of positions that is used to link together the sequences in each
* hash bucket. Note that in the latter table, the links are 1-ary in the case
* of "hash chains", and 2-ary in the case of "binary trees". In either case,
* the links need to be rebased in the same way.
*
* 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and
* 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
*
* 'size' is in bytes; it is divided by sizeof(mf_pos_t) to get the number of
* entries. This generic version is only compiled when no 'matchfinder_rebase'
* macro has been defined already.
*/
#ifndef matchfinder_rebase
static forceinline void
matchfinder_rebase(mf_pos_t *data, size_t size)
{
size_t num_entries = size / sizeof(*data);
size_t i;
/* Both operands are compile-time constants, so this selects one of the two
* loops below for the whole build configuration. */
if (MATCHFINDER_WINDOW_SIZE == 32768 && sizeof(mf_pos_t) == 2) {
/* Branchless version for 32768 byte windows. If the value was
* already negative, clear all bits except the sign bit; this
* changes the value to -32768. Otherwise, set the sign bit;
* this is equivalent to subtracting 32768. */
for (i = 0; i < num_entries; i++) {
u16 v = data[i];
u16 sign_bit = v & 0x8000;
/* Mask is all ones when sign_bit == 0 (keep v unchanged), or
* exactly 0x8000 when sign_bit == 0x8000 (keep only the sign bit). */
v &= sign_bit - ((sign_bit >> 15) ^ 1);
v |= 0x8000;
data[i] = v;
}
return;
}
/* Generic version: subtract the window size from in-window entries and
* saturate entries that were already negative at -WINDOW_SIZE. */
for (i = 0; i < num_entries; i++) {
if (data[i] >= 0)
data[i] -= (mf_pos_t)MATCHFINDER_WINDOW_SIZE;
else
data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
}
}
#endif
/*
 * Multiplicative hash of a sequence prefix.
 *
 * @seq holds the bytes to hash in its low-order bits.  It is multiplied by a
 * fixed large constant, the product is truncated to 32 bits, and the top
 * @num_bits bits of that truncated product are returned as the hash code
 * (those bits have the most randomness).
 */
static forceinline u32
lz_hash(u32 seq, unsigned num_bits)
{
	const u32 product = (u32)(seq * 0x1E35A7BD);
	const unsigned discard = 32 - num_bits;

	return product >> discard;
}
/*
 * Return the number of bytes at @matchptr that match the bytes at @strptr, up
 * to a maximum of @max_len.  Initially, @start_len bytes are matched.
 *
 * @strptr:    the sequence being matched against
 * @matchptr:  the candidate match
 * @start_len: number of leading bytes already known to be equal
 * @max_len:   upper bound on the returned length
 *
 * When unaligned accesses are fast, the comparison proceeds a machine word at
 * a time; the first differing word is then resolved to a byte count using the
 * position of its first differing bit.  Otherwise (and for the tail that is
 * shorter than a word) bytes are compared one at a time.
 */
static forceinline unsigned
lz_extend(const u8 * const strptr, const u8 * const matchptr,
	  const unsigned start_len, const unsigned max_len)
{
	unsigned len = start_len;
	machine_word_t v_word;

	if (UNALIGNED_ACCESS_IS_FAST) {
		/* Unrolled fast path: four word compares when there is room. */
		if (likely(max_len - len >= 4 * WORDBYTES)) {
			/*
			 * The last line of this macro must NOT end with a
			 * backslash.  Otherwise the definition silently
			 * continues onto whatever line follows it, and
			 * correctness would depend on a blank line being
			 * present there.
			 */
#define COMPARE_WORD_STEP						\
			v_word = load_word_unaligned(&matchptr[len]) ^	\
				 load_word_unaligned(&strptr[len]);	\
			if (v_word != 0)				\
				goto word_differs;			\
			len += WORDBYTES;

			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
			COMPARE_WORD_STEP
#undef COMPARE_WORD_STEP
		}

		/* Keep comparing whole words while a full word still fits. */
		while (len + WORDBYTES <= max_len) {
			v_word = load_word_unaligned(&matchptr[len]) ^
				 load_word_unaligned(&strptr[len]);
			if (v_word != 0)
				goto word_differs;
			len += WORDBYTES;
		}
	}

	/* Byte-at-a-time comparison for whatever remains. */
	while (len < max_len && matchptr[len] == strptr[len])
		len++;
	return len;

word_differs:
	/*
	 * v_word is the XOR of the two words, so its set bits mark the
	 * differing bits.  The bit index of the first difference in memory
	 * order, divided by 8, is the number of additional matching bytes.
	 */
	if (CPU_IS_LITTLE_ENDIAN())
		len += (bsfw(v_word) >> 3);
	else
		len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
	return len;
}
#endif /* LIB_MATCHFINDER_COMMON_H */

View file

@ -0,0 +1,228 @@
/*
* unaligned.h - inline functions for unaligned memory accesses
*/
#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H
#include "lib_common.h"
/***** Unaligned loads and stores without endianness conversion *****/
/*
* memcpy() is portable, and it usually gets optimized appropriately by modern
* compilers. I.e., each memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled
* to a load or store instruction, not to an actual function call.
*
* We no longer use the "packed struct" approach, as that is nonstandard, has
* unclear semantics, and doesn't receive enough testing
* (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
*
* arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
* where memcpy() generates inefficient code
* (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer
* consider that one case important enough to maintain different code for.
* If you run into it, please just use a newer version of gcc (or use clang).
*
* DEFINE_UNALIGNED_TYPE(type) generates two helpers for 'type':
*   load_<type>_unaligned(p)     - copy sizeof(type) bytes from p into a value
*   store_<type>_unaligned(v, p) - copy the bytes of v to p
* Neither performs any endianness conversion.
*/
#define DEFINE_UNALIGNED_TYPE(type) \
static forceinline type \
load_##type##_unaligned(const void *p) \
{ \
type v; \
memcpy(&v, p, sizeof(v)); \
return v; \
} \
\
static forceinline void \
store_##type##_unaligned(type v, void *p) \
{ \
memcpy(p, &v, sizeof(v)); \
}
/* Instantiate the helpers for the fixed-width types and the machine word. */
DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)
/* Shorter aliases for the machine-word variants. */
#define load_word_unaligned load_machine_word_t_unaligned
#define store_word_unaligned store_machine_word_t_unaligned
/***** Unaligned loads with endianness conversion *****/
/*
 * Read a 16-bit little-endian value from the (possibly unaligned) address @p.
 * Uses a single unaligned load plus byte swap where that is fast; otherwise
 * assembles the value from its two bytes.
 */
static forceinline u16
get_unaligned_le16(const u8 *p)
{
	u16 lo, hi;

	if (UNALIGNED_ACCESS_IS_FAST)
		return le16_bswap(load_u16_unaligned(p));

	lo = p[0];
	hi = p[1];
	return (u16)(lo | (hi << 8));
}
/*
 * Read a 16-bit big-endian value from the (possibly unaligned) address @p.
 * Uses a single unaligned load plus byte swap where that is fast; otherwise
 * assembles the value from its two bytes.
 */
static forceinline u16
get_unaligned_be16(const u8 *p)
{
	u16 hi, lo;

	if (UNALIGNED_ACCESS_IS_FAST)
		return be16_bswap(load_u16_unaligned(p));

	hi = p[0];
	lo = p[1];
	return (u16)((hi << 8) | lo);
}
/*
 * Read a 32-bit little-endian value from the (possibly unaligned) address @p.
 * Uses a single unaligned load plus byte swap where that is fast; otherwise
 * assembles the value from its four bytes.
 */
static forceinline u32
get_unaligned_le32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return le32_bswap(load_u32_unaligned(p));

	/* p[0] is the least significant byte. */
	return (u32)p[0] |
	       ((u32)p[1] << 8) |
	       ((u32)p[2] << 16) |
	       ((u32)p[3] << 24);
}
/*
 * Read a 32-bit big-endian value from the (possibly unaligned) address @p.
 * Uses a single unaligned load plus byte swap where that is fast; otherwise
 * assembles the value from its four bytes.
 */
static forceinline u32
get_unaligned_be32(const u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST)
		return be32_bswap(load_u32_unaligned(p));

	/* p[0] is the most significant byte. */
	return (u32)p[3] |
	       ((u32)p[2] << 8) |
	       ((u32)p[1] << 16) |
	       ((u32)p[0] << 24);
}
/*
 * Read a 64-bit little-endian value from the (possibly unaligned) address @p.
 * Uses a single unaligned load plus byte swap where that is fast; otherwise
 * assembles the value from its eight bytes.
 */
static forceinline u64
get_unaligned_le64(const u8 *p)
{
	u64 v = 0;
	int i;

	if (UNALIGNED_ACCESS_IS_FAST)
		return le64_bswap(load_u64_unaligned(p));

	/* Fold in bytes from most significant (p[7]) to least (p[0]). */
	for (i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}
/*
 * Read a little-endian machine word from the (possibly unaligned) address @p,
 * dispatching on the word size (which must be 32 or 64 bits).
 */
static forceinline machine_word_t
get_unaligned_leword(const u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);

	if (WORDBITS != 32)
		return get_unaligned_le64(p);
	return get_unaligned_le32(p);
}
/***** Unaligned stores with endianness conversion *****/
/*
 * Store the 16-bit value @v in little-endian byte order at the (possibly
 * unaligned) address @p.
 */
static forceinline void
put_unaligned_le16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(le16_bswap(v), p);
		return;
	}
	/* Least significant byte first. */
	p[0] = (u8)v;
	p[1] = (u8)(v >> 8);
}
/*
 * Store the 16-bit value @v in big-endian byte order at the (possibly
 * unaligned) address @p.
 */
static forceinline void
put_unaligned_be16(u16 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u16_unaligned(be16_bswap(v), p);
		return;
	}
	/* Most significant byte first. */
	p[1] = (u8)v;
	p[0] = (u8)(v >> 8);
}
/*
 * Store the 32-bit value @v in little-endian byte order at the (possibly
 * unaligned) address @p.
 */
static forceinline void
put_unaligned_le32(u32 v, u8 *p)
{
	int i;

	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(le32_bswap(v), p);
		return;
	}
	/* Byte i receives bits [8*i, 8*i + 7] of v. */
	for (i = 0; i < 4; i++)
		p[i] = (u8)(v >> (8 * i));
}
/*
 * Store the 32-bit value @v in big-endian byte order at the (possibly
 * unaligned) address @p.
 */
static forceinline void
put_unaligned_be32(u32 v, u8 *p)
{
	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u32_unaligned(be32_bswap(v), p);
		return;
	}
	/* p[3] gets the least significant byte, p[0] the most significant. */
	p[3] = (u8)v;
	p[2] = (u8)(v >> 8);
	p[1] = (u8)(v >> 16);
	p[0] = (u8)(v >> 24);
}
/*
 * Store the 64-bit value @v in little-endian byte order at the (possibly
 * unaligned) address @p.
 */
static forceinline void
put_unaligned_le64(u64 v, u8 *p)
{
	int i;

	if (UNALIGNED_ACCESS_IS_FAST) {
		store_u64_unaligned(le64_bswap(v), p);
		return;
	}
	/* Byte i receives bits [8*i, 8*i + 7] of v. */
	for (i = 0; i < 8; i++)
		p[i] = (u8)(v >> (8 * i));
}
/*
 * Store the machine word @v in little-endian byte order at the (possibly
 * unaligned) address @p, dispatching on the word size (32 or 64 bits).
 */
static forceinline void
put_unaligned_leword(machine_word_t v, u8 *p)
{
	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);

	if (WORDBITS != 32) {
		put_unaligned_le64(v, p);
		return;
	}
	put_unaligned_le32(v, p);
}
/***** 24-bit loads *****/
/*
 * Reduce a natively-loaded 32-bit value to the 24 bits that correspond to the
 * first 3 bytes in memory.
 *
 * @v must have been loaded with the platform's native endianness.  The result
 * has its high-order 8 bits clear; its low-order 24 bits hold the first 3
 * bytes of the original memory location, in a platform-dependent octet order.
 */
static forceinline u32
loaded_u32_to_u24(u32 v)
{
	/* Little endian: the first 3 bytes are the low-order 24 bits. */
	if (CPU_IS_LITTLE_ENDIAN())
		return v & 0xFFFFFF;

	/* Big endian: the first 3 bytes are the high-order 24 bits. */
	return v >> 8;
}
/*
* Load the next 3 bytes from the memory location @p into the 24 low-order bits
* of a 32-bit value. The order in which the 3 bytes will be arranged as octets
* in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES
* bytes must be available at @p; note that this may be more than 3.
*
* LOAD_U24_REQUIRED_NBYTES is defined as a side effect of compiling this
* function: 4 when a single unaligned 32-bit load is used, 3 otherwise.
*/
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
/* One 4-byte load, then discard the byte that is not wanted. */
# define LOAD_U24_REQUIRED_NBYTES 4
return loaded_u32_to_u24(load_u32_unaligned(p));
#else
/* Read exactly 3 bytes, arranged the same way the fast path would. */
# define LOAD_U24_REQUIRED_NBYTES 3
if (CPU_IS_LITTLE_ENDIAN())
return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
else
return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}
#endif /* LIB_UNALIGNED_H */

View file

@ -0,0 +1,142 @@
/*
* utils.c - utility functions for libdeflate
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "lib_common.h"
#include "libdeflate.h"
/*
 * In freestanding builds 'malloc' and 'free' are defined as NULL, so the
 * function pointers below start out NULL; an allocator then has to be
 * installed with libdeflate_set_memory_allocator() before anything is
 * allocated.  Otherwise the C library's malloc()/free() are the defaults.
 */
#ifdef FREESTANDING
# define malloc NULL
# define free NULL
#else
# include <stdlib.h>
#endif
/* The allocator currently used by libdeflate_malloc() / libdeflate_free(). */
static void *(*libdeflate_malloc_func)(size_t) = malloc;
static void (*libdeflate_free_func)(void *) = free;
/*
 * Allocate @size bytes using the currently installed allocation function and
 * return whatever that function returns.
 */
void *
libdeflate_malloc(size_t size)
{
	void *mem = libdeflate_malloc_func(size);

	return mem;
}
/*
 * Release memory obtained from libdeflate_malloc() by passing @ptr to the
 * currently installed free function.
 */
void
libdeflate_free(void *ptr)
{
	libdeflate_free_func(ptr);
}
/*
 * Allocate @size bytes whose address is a multiple of @alignment.
 *
 * The underlying allocation is over-sized by sizeof(void *) + @alignment - 1
 * bytes.  The returned pointer is the first suitably aligned address that
 * leaves room for one pointer in front of it; that slot stores the original
 * pointer so that libdeflate_aligned_free() can release the allocation.
 *
 * Returns NULL if the underlying allocator fails, or if the total size to
 * request would not fit in a size_t (previously the sum silently wrapped
 * around, which could yield an undersized buffer).  An @alignment of 0 is
 * also rejected, since '@alignment - 1' wraps around in that case.
 *
 * The result must be freed with libdeflate_aligned_free().
 */
void *
libdeflate_aligned_malloc(size_t alignment, size_t size)
{
	const size_t size_max = (size_t)-1;
	size_t overhead;
	void *orig_ptr;
	void *aligned_ptr;

	/* Guard sizeof(void *) + (alignment - 1) against wraparound. */
	if (alignment - 1 > size_max - sizeof(void *))
		return NULL;
	overhead = sizeof(void *) + (alignment - 1);

	/* Guard overhead + size against wraparound. */
	if (size > size_max - overhead)
		return NULL;

	orig_ptr = libdeflate_malloc(overhead + size);
	if (!orig_ptr)
		return NULL;

	/* Skip past the bookkeeping slot, then round up to the alignment. */
	aligned_ptr = (void *)ALIGN((uintptr_t)orig_ptr + sizeof(void *),
				    alignment);
	/* Remember the real allocation just before the aligned block. */
	((void **)aligned_ptr)[-1] = orig_ptr;
	return aligned_ptr;
}
/*
 * Free memory returned by libdeflate_aligned_malloc().  The pointer to the
 * underlying allocation is read from the slot just before @ptr and passed to
 * libdeflate_free().  A NULL @ptr is ignored.
 */
void
libdeflate_aligned_free(void *ptr)
{
	void *orig_ptr;

	if (!ptr)
		return;

	orig_ptr = ((void **)ptr)[-1];
	libdeflate_free(orig_ptr);
}
/*
 * Install the allocation and deallocation functions that libdeflate_malloc()
 * and libdeflate_free() will call from now on.  The pointers are stored as
 * given; no validation is performed.
 */
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
				void (*free_func)(void *))
{
	libdeflate_free_func = free_func;
	libdeflate_malloc_func = malloc_func;
}
/*
* Implementations of libc functions for freestanding library builds.
* Normal library builds don't use these. Not optimized yet; usually the
* compiler expands these functions and doesn't actually call them anyway.
*/
#ifdef FREESTANDING
/* Drop the builtin-expanding macro so the real function can be defined. */
#undef memset
/*
 * Fallback memset() for freestanding builds: store the byte value of @c into
 * each of the first @n bytes at @s and return @s.  Defined weak.
 */
void * __attribute__((weak))
memset(void *s, int c, size_t n)
{
	u8 *cursor = s;
	u8 * const end = cursor + n;

	while (cursor != end)
		*cursor++ = c;
	return s;
}
/* Drop the builtin-expanding macro so the real function can be defined. */
#undef memcpy
/*
 * Fallback memcpy() for freestanding builds: copy @n bytes from @src to
 * @dest, lowest address first, and return @dest.  Defined weak.
 */
void * __attribute__((weak))
memcpy(void *dest, const void *src, size_t n)
{
	u8 *out = dest;
	const u8 *in = src;
	size_t remaining = n;

	while (remaining != 0) {
		*out++ = *in++;
		remaining--;
	}
	return dest;
}
/* Drop the builtin-expanding macro so the real function can be defined. */
#undef memmove
/*
 * Fallback memmove() for freestanding builds: copy @n possibly-overlapping
 * bytes from @src to @dest and return @dest.  Defined weak.
 */
void * __attribute__((weak))
memmove(void *dest, const void *src, size_t n)
{
	u8 *out = dest;
	const u8 *in = src;
	size_t remaining;

	/* Destination at or below the source: delegate to memcpy(). */
	if (out <= in)
		return memcpy(out, in, n);

	/* Destination above the source: copy from the last byte backwards so
	 * that overlapping source bytes are read before being overwritten. */
	for (remaining = n; remaining != 0; remaining--)
		out[remaining - 1] = in[remaining - 1];
	return dest;
}
/* Drop the builtin-expanding macro so the real function can be defined. */
#undef memcmp
/*
 * Fallback memcmp() for freestanding builds: compare the first @n bytes of
 * @s1 and @s2 as unsigned bytes.  Returns 0 if they are all equal; otherwise
 * returns the difference between the first pair of bytes that differ
 * (the @s1 byte minus the @s2 byte).  Defined weak.
 */
int __attribute__((weak))
memcmp(const void *s1, const void *s2, size_t n)
{
	const u8 *a = s1;
	const u8 *b = s2;
	const u8 * const a_end = a + n;

	for (; a != a_end; a++, b++) {
		if (*a != *b)
			return (int)*a - (int)*b;
	}
	return 0;
}
#endif /* FREESTANDING */

View file

@ -0,0 +1,337 @@
/*
* x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_X86_ADLER32_IMPL_H
#define LIB_X86_ADLER32_IMPL_H
#include "cpu_features.h"
/*
* The following macros horizontally sum the s1 counters and add them to the
* real s1, and likewise for s2. They do this via a series of reductions, each
* of which halves the vector length, until just one counter remains.
*
* The s1 reductions don't depend on the s2 reductions and vice versa, so for
* efficiency they are interleaved. Also, every other s1 counter is 0 due to
* the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
* 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
*
* Arguments (all three macros): 's1' and 's2' are pointers to the scalar u32
* accumulators that are added to; 'v_s1' and 'v_s2' are the vector counters
* to be summed. The 128-bit variant below takes vectors of four 32-bit
* counters and is the final step of the wider variants.
*/
#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
{ \
__v4su s1_last = (v_s1), s2_last = (v_s2); \
\
/* 128 => 32 bits */ \
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \
s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \
\
*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \
*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \
}
/*
* ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2)
*
* Reduce the 256-bit counter vectors v_s1 and v_s2 by adding their upper and
* lower 128-bit halves lane-wise, then hand the 128-bit results to
* ADLER32_FINISH_VEC_CHUNK_128, which adds the final sums to *s1 and *s2.
*/
#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
{ \
__v4su s1_128bit, s2_128bit; \
\
/* 256 => 128 bits */ \
s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \
(__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \
s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \
(__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \
\
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
}
/*
* ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)
*
* Reduce the 512-bit counter vectors v_s1 and v_s2 by adding their upper and
* lower 256-bit halves lane-wise, then continue the reduction with
* ADLER32_FINISH_VEC_CHUNK_256, which eventually adds the sums to *s1 and *s2.
*/
#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
{ \
__v8su s1_256bit, s2_256bit; \
\
/* 512 => 256 bits */ \
s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
(__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \
s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
(__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \
\
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
}
/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
/*
* Configuration and kernel for the AVX-512BW variant. The macros defined
* here precede the inclusion of "../adler32_vec_template.h" at the end of the
* region; presumably the template consumes them to build adler32_avx512bw()
* around adler32_avx512bw_chunk() -- confirm in the template.
*/
#undef DISPATCH_AVX512BW
#if !defined(DEFAULT_IMPL) && \
/*
* clang before v3.9 is missing some AVX-512BW intrinsics including
* _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using
* AVX-512BW, even when __AVX512BW__ is defined, conditional on
* COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
*/ \
COMPILER_SUPPORTS_AVX512BW_TARGET && \
(defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
# define FUNCNAME adler32_avx512bw
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_SIZE 64
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
# ifdef __AVX512BW__
/* The whole build targets AVX-512BW: no per-function target attribute is
* needed, and defining DEFAULT_IMPL makes the later variants in this file
* compile out (their #if conditions require !defined(DEFAULT_IMPL)). */
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_avx512bw
# else
/* Otherwise compile only this function for avx512bw and let
* arch_select_adler32_func() choose it at runtime. */
# define ATTRIBUTES __attribute__((target("avx512bw")))
# define DISPATCH 1
# define DISPATCH_AVX512BW 1
# endif
# include <immintrin.h>
/*
* Accumulate the Adler-32 contribution of the 64-byte segments in [p, end)
* into *s1 and *s2.
*
* The body is a do-while loop, so at least one segment must be present
* (p != end on entry). Segments are read by dereferencing p as __m512i;
* presumably the caller guarantees IMPL_ALIGNMENT -- confirm in the template.
* No modular reduction is performed here; the sums are simply added to the
* 32-bit values behind s1 and s2.
*/
static forceinline ATTRIBUTES void
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
{
const __m512i zeroes = _mm512_setzero_si512();
/* Per-byte weights: the first byte of a segment is weighted 64, the
* last 1. Adjacent products summed by maddubs are at most
* 255 * (64 + 63) = 32385, which fits in a signed 16-bit lane. */
const __v64qi multipliers = (__v64qi){
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
__v16si v_s1 = (__v16si)zeroes;
__v16si v_s1_sums = (__v16si)zeroes;
__v16si v_s2 = (__v16si)zeroes;
do {
/* Load the next 64-byte segment */
__m512i bytes = *p++;
/* Multiply the bytes by 64...1 (the number of times they need
* to be added to s2) and add adjacent products */
__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
bytes, (__m512i)multipliers);
/* Keep sum of all previous s1 counters, for adding to s2 later.
* This allows delaying the multiplication by 64 to the end.
* This must happen before v_s1 is updated for this segment. */
v_s1_sums += v_s1;
/* Add the sum of each group of 8 bytes to the corresponding s1
* counter */
v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
/* Add the sum of each group of 4 products of the bytes by
* 64...1 to the corresponding s2 counter */
v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
(__m512i)ones);
} while (p != end);
/* Finish the s2 counters by adding the sum of the s1 values at the
* beginning of each segment, multiplied by the segment size (64) */
v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* AVX-512BW implementation */
/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
/*
* Configuration and kernel for the AVX2 variant. It is skipped entirely when
* an earlier variant already defined DEFAULT_IMPL. The macros defined here
* precede the inclusion of "../adler32_vec_template.h" at the end of the
* region; presumably the template consumes them to build adler32_avx2()
* around adler32_avx2_chunk() -- confirm in the template.
*/
#undef DISPATCH_AVX2
#if !defined(DEFAULT_IMPL) && \
(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
# define FUNCNAME adler32_avx2
# define FUNCNAME_CHUNK adler32_avx2_chunk
# define IMPL_ALIGNMENT 32
# define IMPL_SEGMENT_SIZE 32
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
# ifdef __AVX2__
/* AVX2 is a compile-time baseline: use it as the default implementation,
* which also compiles out the SSE2 variant below. */
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_avx2
# else
/* Build just this function for avx2 and select it at runtime in
* arch_select_adler32_func(). */
# define ATTRIBUTES __attribute__((target("avx2")))
# define DISPATCH 1
# define DISPATCH_AVX2 1
# endif
# include <immintrin.h>
/*
* Accumulate the Adler-32 contribution of the 32-byte segments in [p, end)
* into *s1 and *s2.
*
* The body is a do-while loop, so at least one segment must be present
* (p != end on entry). Segments are read by dereferencing p as __m256i;
* presumably the caller guarantees IMPL_ALIGNMENT -- confirm in the template.
* No modular reduction is performed here.
*/
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
const __m256i zeroes = _mm256_setzero_si256();
/* Per-byte weights: first byte of a segment counts 32 times, last once */
const __v32qu multipliers = (__v32qu){
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __v16hu ones = (__v16hu)_mm256_set1_epi16(1);
__v8su v_s1 = (__v8su)zeroes;
__v8su v_s1_sums = (__v8su)zeroes;
__v8su v_s2 = (__v8su)zeroes;
do {
/* Load the next 32-byte segment */
__m256i bytes = *p++;
/* Multiply the bytes by 32...1 (the number of times they need
* to be added to s2) and add adjacent products */
__v16hu sums = (__v16hu)_mm256_maddubs_epi16(
bytes, (__m256i)multipliers);
/* Keep sum of all previous s1 counters, for adding to s2 later.
* This allows delaying the multiplication by 32 to the end.
* This must happen before v_s1 is updated for this segment. */
v_s1_sums += v_s1;
/* Add the sum of each group of 8 bytes to the corresponding s1
* counter */
v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes);
/* Add the sum of each group of 4 products of the bytes by
* 32...1 to the corresponding s2 counter */
v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
} while (p != end);
/* Finish the s2 counters by adding the sum of the s1 values at the
* beginning of each segment, multiplied by the segment size (32) */
v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* AVX2 implementation */
/* SSE2 implementation */
/*
* Configuration and kernel for the SSE2 variant. It is skipped entirely when
* an earlier variant already defined DEFAULT_IMPL. The macros defined here
* precede the inclusion of "../adler32_vec_template.h" at the end of the
* region; presumably the template consumes them to build adler32_sse2()
* around adler32_sse2_chunk() -- confirm in the template.
*
* Note that a segment is 32 bytes (two 16-byte vectors) although the
* required alignment is only 16.
*/
#undef DISPATCH_SSE2
#if !defined(DEFAULT_IMPL) && \
(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
# define FUNCNAME adler32_sse2
# define FUNCNAME_CHUNK adler32_sse2_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_SIZE 32
/*
* The 16-bit precision byte counters must not be allowed to undergo *signed*
* overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
* would behave incorrectly.
*
* Each counter grows by at most 0xFF per 32-byte segment, so at most
* 0x7FFF / 0xFF segments may be accumulated per chunk.
*/
# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF))
# ifdef __SSE2__
/* SSE2 is a compile-time baseline: use it as the default implementation */
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_sse2
# else
/* Build just this function for sse2 and select it at runtime in
* arch_select_adler32_func(). */
# define ATTRIBUTES __attribute__((target("sse2")))
# define DISPATCH 1
# define DISPATCH_SSE2 1
# endif
# include <emmintrin.h>
/*
* Accumulate the Adler-32 contribution of the 32-byte segments in [p, end)
* into *s1 and *s2.
*
* p advances by two __m128i per iteration and the loop ends when p == end,
* so the range must hold a non-zero, even number of 16-byte vectors.
* No modular reduction is performed here.
*/
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
const __m128i zeroes = _mm_setzero_si128();
/* s1 counters: 32-bit, sum of bytes */
__v4su v_s1 = (__v4su)zeroes;
/* s2 counters: 32-bit, sum of s1 values */
__v4su v_s2 = (__v4su)zeroes;
/*
* Thirty-two 16-bit counters for byte sums. Each accumulates the bytes
* that eventually need to be multiplied by a number 32...1 for addition
* into s2.
*/
__v8hu v_byte_sums_a = (__v8hu)zeroes;
__v8hu v_byte_sums_b = (__v8hu)zeroes;
__v8hu v_byte_sums_c = (__v8hu)zeroes;
__v8hu v_byte_sums_d = (__v8hu)zeroes;
do {
/* Load the next 32 bytes */
const __m128i bytes1 = *p++;
const __m128i bytes2 = *p++;
/*
* Accumulate the previous s1 counters into the s2 counters.
* Logically, this really should be v_s2 += v_s1 * 32, but we
* can do the multiplication (or left shift) later.
*/
v_s2 += v_s1;
/*
* s1 update: use "Packed Sum of Absolute Differences" to add
* the bytes horizontally with 8 bytes per sum. Then add the
* sums to the s1 counters.
*/
v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes);
v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes);
/*
* Also accumulate the bytes into 32 separate counters that have
* 16-bit precision. Unpacking against zero widens each byte to
* a 16-bit lane.
*/
v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes);
v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes);
v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes);
v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes);
} while (p != end);
/* Finish calculating the s2 counters */
/* Apply the deferred multiplication by the segment size (32 == 1 << 5) */
v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5);
/* Weight each per-position byte sum by its distance from the segment
* end (32 for the first byte down to 1 for the last) */
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a,
(__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 });
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b,
(__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 });
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c,
(__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 });
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d,
(__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 });
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* SSE2 implementation */
#ifdef DISPATCH
/*
 * Choose the runtime-dispatched Adler-32 implementation for this CPU.
 *
 * Only variants that were compiled for runtime dispatch (DISPATCH_* defined)
 * are considered. Returns NULL when none of them is usable on the running
 * processor.
 */
static inline adler32_func_t
arch_select_adler32_func(void)
{
	const u32 cpu = get_cpu_features();
	adler32_func_t best = NULL;

	/*
	 * Candidates are visited from narrowest to widest vector width, each
	 * match overriding the previous one, so the widest supported variant
	 * wins.
	 */
#ifdef DISPATCH_SSE2
	if ((cpu & X86_CPU_FEATURE_SSE2) != 0)
		best = adler32_sse2;
#endif
#ifdef DISPATCH_AVX2
	if ((cpu & X86_CPU_FEATURE_AVX2) != 0)
		best = adler32_avx2;
#endif
#ifdef DISPATCH_AVX512BW
	if ((cpu & X86_CPU_FEATURE_AVX512BW) != 0)
		best = adler32_avx512bw;
#endif
	return best;
}
#endif /* DISPATCH */
#endif /* LIB_X86_ADLER32_IMPL_H */

View file

@ -0,0 +1,152 @@
/*
* x86/cpu_features.c - feature detection for x86 processors
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "../cpu_features_common.h" /* must be included first */
#include "cpu_features.h"
#if X86_CPU_FEATURES_ENABLED
/*
* Cached CPU feature bitmask (X86_CPU_FEATURE_* flags). Zero means the CPU
* has not been probed yet; setup_cpu_features() always ORs in
* X86_CPU_FEATURES_KNOWN, so the value is non-zero once probing has run.
*/
volatile u32 _cpu_features = 0;
/* With old GCC versions we have to manually save and restore the x86_32 PIC
* register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
#if defined(__i386__) && defined(__PIC__)
/* Let the compiler pick any register except the inputs (early-clobber) */
# define EBX_CONSTRAINT "=&r"
#else
/* ebx is freely usable: bind the output directly to it */
# define EBX_CONSTRAINT "=b"
#endif
/*
* Execute the CPUID instruction.
*
* leaf and subleaf are passed in eax and ecx. The resulting eax, ebx, ecx
* and edx values are stored to *a, *b, *c and *d respectively.
*
* When operand %1 (the ebx output) is not ebx itself, the ".ifnc" blocks
* first copy ebx into %1 and, after CPUID, exchange the two so that ebx is
* restored and %1 receives the value CPUID left in ebx.
*/
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
"cpuid \n"
".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
: "a" (leaf), "c" (subleaf));
}
/*
* Read an extended control register.
*
* index selects the register and is passed in ecx. The 64-bit result is
* assembled from the edx (high half) and eax (low half) outputs.
*/
static inline u64
read_xcr(u32 index)
{
u32 edx, eax;
/* Execute the "xgetbv" instruction. Old versions of binutils do not
* recognize this instruction, so list the raw bytes instead. */
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
return ((u64)edx << 32) | eax;
}
/* Single-bit mask helper; any earlier definition of BIT is replaced */
#undef BIT
#define BIT(nr) (1UL << (nr))
/*
* XCR0 bits tested by setup_cpu_features() before AVX / AVX-512 features are
* reported. Per the names, they cover the SSE, AVX, opmask, upper ZMM halves
* and upper 16 ZMM register state components.
*/
#define XCR0_BIT_SSE BIT(1)
#define XCR0_BIT_AVX BIT(2)
#define XCR0_BIT_OPMASK BIT(5)
#define XCR0_BIT_ZMM_HI256 BIT(6)
#define XCR0_BIT_HI16_ZMM BIT(7)
/* Non-zero if bit number nr of reg is set */
#define IS_SET(reg, nr) ((reg) & BIT(nr))
/* True only if every bit of mask is set in reg */
#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask))
/*
* Pairs each feature flag with a textual name. The table is passed to
* disable_cpu_features_for_testing() by setup_cpu_features().
*/
static const struct cpu_feature x86_cpu_feature_table[] = {
{X86_CPU_FEATURE_SSE2, "sse2"},
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
{X86_CPU_FEATURE_AVX, "avx"},
{X86_CPU_FEATURE_AVX2, "avx2"},
{X86_CPU_FEATURE_BMI2, "bmi2"},
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
};
/* Initialize _cpu_features with bits for interesting processor features. */
void setup_cpu_features(void)
{
u32 features = 0;
u32 dummy1, dummy2, dummy3, dummy4;
u32 max_function;
u32 features_1, features_2, features_3, features_4;
bool os_avx_support = false;
bool os_avx512_support = false;
/* Get maximum supported function */
cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
if (max_function < 1)
goto out;
/* Standard feature flags */
cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
if (IS_SET(features_1, 26))
features |= X86_CPU_FEATURE_SSE2;
if (IS_SET(features_2, 1))
features |= X86_CPU_FEATURE_PCLMUL;
if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
u64 xcr0 = read_xcr(0);
os_avx_support = IS_ALL_SET(xcr0,
XCR0_BIT_SSE |
XCR0_BIT_AVX);
os_avx512_support = IS_ALL_SET(xcr0,
XCR0_BIT_SSE |
XCR0_BIT_AVX |
XCR0_BIT_OPMASK |
XCR0_BIT_ZMM_HI256 |
XCR0_BIT_HI16_ZMM);
}
if (os_avx_support && IS_SET(features_2, 28))
features |= X86_CPU_FEATURE_AVX;
if (max_function < 7)
goto out;
/* Extended feature flags */
cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
if (os_avx_support && IS_SET(features_3, 5))
features |= X86_CPU_FEATURE_AVX2;
if (IS_SET(features_3, 8))
features |= X86_CPU_FEATURE_BMI2;
if (os_avx512_support && IS_SET(features_3, 30))
features |= X86_CPU_FEATURE_AVX512BW;
out:
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
ARRAY_LEN(x86_cpu_feature_table));
_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}
#endif /* X86_CPU_FEATURES_ENABLED */

View file

@ -0,0 +1,41 @@
/*
* x86/cpu_features.h - feature detection for x86 processors
*/
#ifndef LIB_X86_CPU_FEATURES_H
#define LIB_X86_CPU_FEATURES_H
#include "../lib_common.h"
/*
* Runtime feature detection is enabled only on x86 targets whose compiler
* supports the function-level "target" attribute. X86_CPU_FEATURES_ENABLED is
* always defined (to 1 or 0), so it can be used directly in #if expressions.
*/
#if (defined(__i386__) || defined(__x86_64__)) && \
COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# define X86_CPU_FEATURES_ENABLED 1
#else
# define X86_CPU_FEATURES_ENABLED 0
#endif
#if X86_CPU_FEATURES_ENABLED
/* Bit flags returned by get_cpu_features(), one bit per detected feature */
#define X86_CPU_FEATURE_SSE2 0x00000001
#define X86_CPU_FEATURE_PCLMUL 0x00000002
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512BW 0x00000020
/* Set by setup_cpu_features() so the cached mask is non-zero after probing */
#define X86_CPU_FEATURES_KNOWN 0x80000000
/* Cached feature mask; 0 until setup_cpu_features() has run */
extern volatile u32 _cpu_features;
/* Probes the CPU and stores the result in _cpu_features */
void setup_cpu_features(void);
/*
 * Return the X86_CPU_FEATURE_* mask for the running processor, probing the
 * CPU on first use (that is, while the cached mask is still zero).
 */
static inline u32 get_cpu_features(void)
{
	/* Fast path: the mask has already been computed */
	if (_cpu_features != 0)
		return _cpu_features;

	setup_cpu_features();
	return _cpu_features;
}
#endif /* X86_CPU_FEATURES_ENABLED */
#endif /* LIB_X86_CPU_FEATURES_H */

View file

@ -0,0 +1,92 @@
/*
* x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_X86_CRC32_IMPL_H
#define LIB_X86_CRC32_IMPL_H
#include "cpu_features.h"
/*
* Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
* function doesn't use any AVX intrinsics specifically, it can benefit a lot
* from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
* MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
* in the newer three-operand form rather than the older two-operand form.
*
* Note: this is only needed if __AVX__ is *not* defined, since otherwise the
* "regular" PCLMUL implementation would already be AVX enabled.
*/
/*
* Instantiate crc32_pclmul_template.h as crc32_pclmul_avx(), compiled for the
* "pclmul,avx" target. This variant is always runtime-dispatched: it is only
* built when __AVX__ is not a compile-time baseline, and it never defines
* DEFAULT_IMPL.
*/
#undef DISPATCH_PCLMUL_AVX
#if !defined(DEFAULT_IMPL) && !defined(__AVX__) && \
X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET && \
(defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
# define FUNCNAME crc32_pclmul_avx
# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
# define ATTRIBUTES __attribute__((target("pclmul,avx")))
# define DISPATCH 1
# define DISPATCH_PCLMUL_AVX 1
# include "crc32_pclmul_template.h"
#endif
/* PCLMUL implementation */
/*
* Instantiate crc32_pclmul_template.h as crc32_pclmul(). When __PCLMUL__ is
* a compile-time baseline it becomes DEFAULT_IMPL and needs no target
* attribute; otherwise it is compiled for the "pclmul" target and selected at
* runtime by arch_select_crc32_func().
*/
#undef DISPATCH_PCLMUL
#if !defined(DEFAULT_IMPL) && \
(defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
# define FUNCNAME crc32_pclmul
# define FUNCNAME_ALIGNED crc32_pclmul_aligned
# ifdef __PCLMUL__
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_pclmul
# else
# define ATTRIBUTES __attribute__((target("pclmul")))
# define DISPATCH 1
# define DISPATCH_PCLMUL 1
# endif
# include "crc32_pclmul_template.h"
#endif
#ifdef DISPATCH
/*
 * Choose the runtime-dispatched CRC-32 implementation for this CPU.
 *
 * The PCLMUL+AVX build is preferred over the plain PCLMUL build when both are
 * available. Returns NULL when no dispatched variant can run on this
 * processor.
 */
static inline crc32_func_t
arch_select_crc32_func(void)
{
	const u32 cpu = get_cpu_features();
	crc32_func_t best = NULL;

	/* Lower-priority candidate first; a later match overrides it */
#ifdef DISPATCH_PCLMUL
	if ((cpu & X86_CPU_FEATURE_PCLMUL) != 0)
		best = crc32_pclmul;
#endif
#ifdef DISPATCH_PCLMUL_AVX
	if ((cpu & X86_CPU_FEATURE_PCLMUL) != 0 &&
	    (cpu & X86_CPU_FEATURE_AVX) != 0)
		best = crc32_pclmul_avx;
#endif
	return best;
}
#endif /* DISPATCH */
#endif /* LIB_X86_CRC32_IMPL_H */

View file

@ -0,0 +1,262 @@
/*
* x86/crc32_pclmul_template.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <immintrin.h>
/*
* CRC-32 folding with PCLMULQDQ.
*
* The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
* producing an abbreviated message which is congruent to the original message
* modulo the generator polynomial G(x).
*
* Folding each 512 bits is implemented as eight 64-bit folds, each of which
* uses one carryless multiplication instruction. It's expected that CPUs may
* be able to execute some of these multiplications in parallel.
*
* Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
* be 95 bits from a constant distance D later in the message. The relevant
* portion of the message can be written as:
*
* M(x) = A(x)*x^D + B(x)
*
* ... where + and * represent addition and multiplication, respectively, of
* polynomials over GF(2). Note that when implemented on a computer, these
* operations are equivalent to XOR and carryless multiplication, respectively.
*
* For the purpose of CRC calculation, only the remainder modulo the generator
* polynomial G(x) matters:
*
* M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
*
* Since the modulo operation can be applied anywhere in a sequence of additions
* and multiplications without affecting the result, this is equivalent to:
*
* M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
*
* For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
* a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
* multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
* product. Then, adding (XOR-ing) the product to B(x) produces a polynomial
* with the same length as B(x) but with the same remainder as 'A(x)*x^D +
* B(x)'. This is the basic fold operation with 64 bits.
*
* Note that the carryless multiplication instruction PCLMULQDQ actually takes
* two 64-bit inputs and produces a 127-bit product in the low-order bits of a
* 128-bit XMM register. This works fine, but care must be taken to account for
* "bit endianness". With the CRC version implemented here, bits are always
* ordered such that the lowest-order bit represents the coefficient of highest
* power of x and the highest-order bit represents the coefficient of the lowest
* power of x. This is backwards from the more intuitive order. Still,
* carryless multiplication works essentially the same either way. It just must
* be accounted for that when we XOR the 95-bit product in the low-order 95 bits
* of a 128-bit XMM register into 128-bits of later data held in another XMM
* register, we'll really be XOR-ing the product into the mathematically higher
* degree end of those later bits, not the lower degree end as may be expected.
*
* So given that caveat and the fact that we process 512 bits per iteration, the
* 'D' values we need for the two 64-bit halves of each 128 bits of data are:
*
* D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
* i.e. the lower order bits in the XMM register
*
* D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
* i.e. the higher order bits in the XMM register
*
* The required 'x^D mod G(x)' values were precomputed.
*
* When <= 512 bits remain in the message, we finish up by folding across
* smaller distances. This works similarly; the distance D is just different,
* so different constant multipliers must be used. Finally, once the remaining
* message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
* (explained later).
*
* For more information see the original paper from Intel:
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* December 2009
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*/
/*
* Update a CRC-32 remainder over whole 16-byte segments using PCLMULQDQ
* folding.
*
* remainder: CRC state of the data processed so far; it is XOR'ed into the
* first 32 bits of the first segment.
* p: first segment. It is dereferenced as __m128i, so it is presumably
* required to be 16-byte aligned (IMPL_ALIGNMENT is 16) -- confirm
* against the including template.
* nr_segs: number of 16-byte segments. It must be at least 1, because the
* first segment is loaded unconditionally.
*
* Returns the updated 32-bit remainder.
*/
static u32 ATTRIBUTES
FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
{
/* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
/* multipliers_4, _2 and _1 fold across 4, 2 and 1 segments respectively */
const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
/* Keeps only the low 32 bits of a 128-bit vector */
const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
const __v2di barrett_reduction_constants =
(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
const __m128i * const end = p + nr_segs;
/* End of the part that is a whole number of 4-segment (512-bit) groups */
const __m128i * const end512 = p + (nr_segs & ~3);
__m128i x0, x1, x2, x3;
/*
* Account for the current 'remainder', i.e. the CRC of the part of the
* message already processed. Explanation: rewrite the message
* polynomial M(x) in terms of the first part A(x), the second part
* B(x), and the length of the second part in bits |B(x)| >= 32:
*
* M(x) = A(x)*x^|B(x)| + B(x)
*
* Then the CRC of M(x) is:
*
* CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
* = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
* = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
*
* Note: all arithmetic is modulo G(x), the generator polynomial; that's
* why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
*
* So the CRC of the full message is the CRC of the second part of the
* message where the first 32 bits of the second part of the message
* have been XOR'ed with the CRC of the first part of the message.
*/
x0 = *p++;
x0 ^= (__m128i)(__v4si){ remainder };
/* p is now one past the first segment, so this is true exactly when
* nr_segs < 4 (end512 == start) */
if (p > end512) /* only 128, 256, or 384 bits of input? */
goto _128_bits_at_a_time;
x1 = *p++;
x2 = *p++;
x3 = *p++;
/* Fold 512 bits at a time */
for (; p != end512; p += 4) {
__m128i y0, y1, y2, y3;
y0 = p[0];
y1 = p[1];
y2 = p[2];
y3 = p[3];
/*
* Note: the immediate constant for PCLMULQDQ specifies which
* 64-bit halves of the 128-bit vectors to multiply:
*
* 0x00 means low halves (higher degree polynomial terms for us)
* 0x11 means high halves (lower degree polynomial terms for us)
*/
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
x0 = y0;
x1 = y1;
x2 = y2;
x3 = y3;
}
/* Fold 512 bits => 128 bits */
/* First x0 -> x2 and x1 -> x3 (a distance of two segments) ... */
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
/* ... then x2 -> x3 (a distance of one segment) */
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
x0 = x3;
_128_bits_at_a_time:
/* Handles inputs shorter than 4 segments and the up to 3 segments left
* after the 512-bit loop */
while (p != end) {
/* Fold 128 bits into next 128 bits */
x1 = *p++;
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
x0 = x1;
}
/* Now there are just 128 bits left, stored in 'x0'. */
/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/
x0 = _mm_srli_si128(x0, 8) ^
_mm_clmulepi64_si128(x0, multipliers_1, 0x10);
/* Fold 96 => 64 bits */
x0 = _mm_srli_si128(x0, 4) ^
_mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
/*
* Finally, reduce 64 => 32 bits using Barrett reduction.
*
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
*
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
* = (A(x)*x^32) mod G(x) + B(x)
*
* Then, by the Division Algorithm there exists a unique q(x) such that:
*
* A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
*
* Since the left-hand side is of maximum degree 31, the right-hand side
* must be too. This implies that we can apply 'mod x^32' to the
* right-hand side without changing its value:
*
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
*
* Note that '+' is equivalent to '-' in polynomials over GF(2).
*
* We also know that:
*
* / A(x)*x^32 \
* q(x) = floor ( --------- )
* \ G(x) /
*
* To compute this efficiently, we can multiply the top and bottom by
* x^32 and move the division by G(x) to the top:
*
* / A(x) * floor(x^64 / G(x)) \
* q(x) = floor ( ------------------------- )
* \ x^32 /
*
* Note that floor(x^64 / G(x)) is a constant.
*
* So finally we have:
*
* / A(x) * floor(x^64 / G(x)) \
* R(x) = B(x) + G(x)*floor ( ------------------------- )
* \ x^32 /
*/
/* Keep the unreduced value; it is XOR'ed back in below */
x1 = x0;
/* Multiply by the low, then by the high 64-bit half of the constants */
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
/* The result is bits 32..63 of the combined value */
return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
}
/*
* Parameters for the generic wrapper included below: FUNCNAME_ALIGNED works on
* 16-byte (__m128i) units. Presumably crc32_vec_template.h uses these to build
* FUNCNAME on top of FUNCNAME_ALIGNED -- confirm in the template.
*/
#define IMPL_ALIGNMENT 16
#define IMPL_SEGMENT_SIZE 16
#include "../crc32_vec_template.h"

View file

@ -0,0 +1,35 @@
#ifndef LIB_X86_DECOMPRESS_IMPL_H
#define LIB_X86_DECOMPRESS_IMPL_H
#include "cpu_features.h"
/*
* Include the BMI2-optimized version?
*
* It is only built as a separate, runtime-dispatched function when BMI2 is
* not already a compile-time baseline (__BMI2__ undefined) and the compiler
* can target it per function. The decompressor template is instantiated as
* deflate_decompress_bmi2(); the GDeflate template is used instead of the
* plain DEFLATE one when GDEFLATE is defined.
*/
#undef DISPATCH_BMI2
#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_BMI2_TARGET
# define FUNCNAME deflate_decompress_bmi2
# define ATTRIBUTES __attribute__((target("bmi2")))
# define DISPATCH 1
# define DISPATCH_BMI2 1
#ifdef GDEFLATE
# include "../gdeflate_decompress_template.h"
#else
# include "../decompress_template.h"
#endif
#endif
#ifdef DISPATCH
/*
 * Choose the runtime-dispatched decompressor for this CPU.
 *
 * Returns deflate_decompress_bmi2 when that variant was built and the
 * processor reports BMI2; otherwise returns NULL.
 */
static inline decompress_func_t
arch_select_decompress_func(void)
{
	const u32 cpu = get_cpu_features();
	decompress_func_t chosen = NULL;

#ifdef DISPATCH_BMI2
	if ((cpu & X86_CPU_FEATURE_BMI2) != 0)
		chosen = deflate_decompress_bmi2;
#endif
	return chosen;
}
#endif /* DISPATCH */
#endif /* LIB_X86_DECOMPRESS_IMPL_H */

View file

@ -0,0 +1,122 @@
/*
* x86/matchfinder_impl.h - x86 implementations of matchfinder functions
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef LIB_X86_MATCHFINDER_IMPL_H
#define LIB_X86_MATCHFINDER_IMPL_H
#ifdef __AVX2__
# include <immintrin.h>
/*
 * AVX2 version of matchfinder_init: fill 'size' bytes starting at 'data' with
 * the 16-bit value MATCHFINDER_INITVAL, four 32-byte vectors per iteration.
 *
 * The loop always runs at least once and stops only when the remaining byte
 * count reaches exactly zero, so 'size' must be a non-zero multiple of 128.
 */
static forceinline void
matchfinder_init_avx2(mf_pos_t *data, size_t size)
{
	const __m256i fill = _mm256_set1_epi16(MATCHFINDER_INITVAL);
	__m256i *dst = (__m256i *)data;
	size_t remaining = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(__m256i) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(__m256i)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		remaining -= 4 * sizeof(__m256i);
	} while (remaining != 0);
}
#define matchfinder_init matchfinder_init_avx2
/*
 * AVX2 version of matchfinder_rebase: add -MATCHFINDER_WINDOW_SIZE to every
 * 16-bit entry in the 'size' bytes at 'data', using signed saturation.
 *
 * The loop always runs at least once and stops only when the remaining byte
 * count reaches exactly zero, so 'size' must be a non-zero multiple of 128.
 */
static forceinline void
matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
{
	const __m256i delta = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
	__m256i *vec = (__m256i *)data;
	size_t remaining = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(__m256i) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(__m256i)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		/* PADDSW: signed saturating add on each 16-bit lane */
		__m256i r0 = _mm256_adds_epi16(vec[0], delta);
		__m256i r1 = _mm256_adds_epi16(vec[1], delta);
		__m256i r2 = _mm256_adds_epi16(vec[2], delta);
		__m256i r3 = _mm256_adds_epi16(vec[3], delta);

		vec[0] = r0;
		vec[1] = r1;
		vec[2] = r2;
		vec[3] = r3;
		vec += 4;
		remaining -= 4 * sizeof(__m256i);
	} while (remaining != 0);
}
#define matchfinder_rebase matchfinder_rebase_avx2
#elif defined(__SSE2__)
# include <emmintrin.h>
/*
 * SSE2 version of matchfinder_init: fill 'size' bytes starting at 'data' with
 * the 16-bit value MATCHFINDER_INITVAL, four 16-byte vectors per iteration.
 *
 * The loop always runs at least once and stops only when the remaining byte
 * count reaches exactly zero, so 'size' must be a non-zero multiple of 64.
 */
static forceinline void
matchfinder_init_sse2(mf_pos_t *data, size_t size)
{
	const __m128i fill = _mm_set1_epi16(MATCHFINDER_INITVAL);
	__m128i *dst = (__m128i *)data;
	size_t remaining = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(__m128i) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(__m128i)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		*dst++ = fill;
		remaining -= 4 * sizeof(__m128i);
	} while (remaining != 0);
}
#define matchfinder_init matchfinder_init_sse2
/*
 * SSE2 version of matchfinder_rebase: add -MATCHFINDER_WINDOW_SIZE to every
 * 16-bit entry in the 'size' bytes at 'data', using signed saturation.
 *
 * The loop always runs at least once and stops only when the remaining byte
 * count reaches exactly zero, so 'size' must be a non-zero multiple of 64.
 */
static forceinline void
matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
{
	const __m128i delta = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
	__m128i *vec = (__m128i *)data;
	size_t remaining = size;

	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(__m128i) == 0);
	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(__m128i)) == 0);
	STATIC_ASSERT(sizeof(mf_pos_t) == 2);

	do {
		/* PADDSW: signed saturating add on each 16-bit lane */
		__m128i r0 = _mm_adds_epi16(vec[0], delta);
		__m128i r1 = _mm_adds_epi16(vec[1], delta);
		__m128i r2 = _mm_adds_epi16(vec[2], delta);
		__m128i r3 = _mm_adds_epi16(vec[3], delta);

		vec[0] = r0;
		vec[1] = r1;
		vec[2] = r2;
		vec[3] = r3;
		vec += 4;
		remaining -= 4 * sizeof(__m128i);
	} while (remaining != 0);
}
#define matchfinder_rebase matchfinder_rebase_sse2
#endif /* __SSE2__ */
#endif /* LIB_X86_MATCHFINDER_IMPL_H */

View file

@ -0,0 +1,87 @@
/*
* zlib_compress.c - compress with a zlib wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "deflate_compress.h"
#include "unaligned.h"
#include "zlib_constants.h"
#include "libdeflate.h"
/*
 * Compress 'in_nbytes' bytes from 'in' into 'out' using the zlib wrapper:
 * a 2-byte header (CMF and FLG), the raw DEFLATE stream, and a 4-byte
 * big-endian ADLER32 of the uncompressed input.
 *
 * Returns the total number of bytes written, or 0 if 'out_nbytes_avail' does
 * not exceed the wrapper overhead or the DEFLATE stream did not fit in the
 * space left after reserving that overhead.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_zlib_compress(struct libdeflate_compressor *c,
			 const void *in, size_t in_nbytes,
			 void *out, size_t out_nbytes_avail)
{
	u8 *dst = out;
	u16 header;
	unsigned level;
	unsigned hint;
	size_t payload_size;

	/* The buffer must hold more than just the header and footer. */
	if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD)
		return 0;

	/*
	 * Translate the compressor's level into the 2-bit level hint stored
	 * in the header: 0-1 fastest, 2-5 fast, 6-7 default, 8+ slowest.
	 */
	level = deflate_get_compression_level(c);
	if (level >= 8)
		hint = ZLIB_SLOWEST_COMPRESSION;
	else if (level >= 6)
		hint = ZLIB_DEFAULT_COMPRESSION;
	else if (level >= 2)
		hint = ZLIB_FAST_COMPRESSION;
	else
		hint = ZLIB_FASTEST_COMPRESSION;

	/* 2 byte header: CMF and FLG */
	header = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12);
	header |= hint << 6;
	/* Fill in the check bits so the 16-bit header is a multiple of 31. */
	header |= 31 - (header % 31);
	put_unaligned_be16(header, dst);
	dst += 2;

	/* Compressed data, with room for the footer kept in reserve */
	payload_size = libdeflate_deflate_compress(c, in, in_nbytes, dst,
					out_nbytes_avail - ZLIB_MIN_OVERHEAD);
	if (payload_size == 0)
		return 0;
	dst += payload_size;

	/* ADLER32 of the uncompressed data */
	put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), dst);
	dst += 4;

	return dst - (u8 *)out;
}
/*
 * Upper bound on the size of the output of libdeflate_zlib_compress() for
 * 'in_nbytes' bytes of input: the raw DEFLATE bound plus the fixed zlib
 * header and footer overhead.
 */
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_zlib_compress_bound(struct libdeflate_compressor *c,
			       size_t in_nbytes)
{
	const size_t deflate_bound =
		libdeflate_deflate_compress_bound(c, in_nbytes);

	return ZLIB_MIN_OVERHEAD + deflate_bound;
}

View file

@ -0,0 +1,21 @@
/*
* zlib_constants.h - constants for the zlib wrapper format
*/
#ifndef LIB_ZLIB_CONSTANTS_H
#define LIB_ZLIB_CONSTANTS_H
/* Size in bytes of the minimal zlib header: the CMF and FLG bytes. */
#define ZLIB_MIN_HEADER_SIZE 2
/* Size in bytes of the zlib footer, which holds the ADLER32 checksum. */
#define ZLIB_FOOTER_SIZE 4
/* Fixed wrapper overhead around the DEFLATE stream (header plus footer). */
#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE)
/* Value of the CM (compression method) header field that selects DEFLATE. */
#define ZLIB_CM_DEFLATE 8
/* Value of the CINFO header field for a 32K window; also the largest value
* the decompressor accepts. */
#define ZLIB_CINFO_32K_WINDOW 7
/* Values of the 2-bit compression level hint stored in the header. */
#define ZLIB_FASTEST_COMPRESSION 0
#define ZLIB_FAST_COMPRESSION 1
#define ZLIB_DEFAULT_COMPRESSION 2
#define ZLIB_SLOWEST_COMPRESSION 3
#endif /* LIB_ZLIB_CONSTANTS_H */

View file

@ -0,0 +1,108 @@
/*
* zlib_decompress.c - decompress with a zlib wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "unaligned.h"
#include "zlib_constants.h"
#include "libdeflate.h"
/*
 * Decompress a zlib-wrapped stream from 'in' into 'out'.
 *
 * The 2-byte header is validated, the enclosed DEFLATE stream is handed to
 * libdeflate_deflate_decompress_ex(), and the ADLER32 that follows it is
 * checked against the decompressed output.
 *
 * On success, the number of input bytes consumed (header, DEFLATE stream and
 * footer) is stored in *actual_in_nbytes_ret if that pointer is non-NULL.
 * 'actual_out_nbytes_ret' is passed through to the DEFLATE decompressor.
 *
 * Returns LIBDEFLATE_SUCCESS, LIBDEFLATE_BAD_DATA for a malformed header or
 * checksum mismatch, or whatever error the DEFLATE decompressor reported.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d,
			      const void *in, size_t in_nbytes,
			      void *out, size_t out_nbytes_avail,
			      size_t *actual_in_nbytes_ret,
			      size_t *actual_out_nbytes_ret)
{
	const u8 *src = in;
	const u8 * const src_end = src + in_nbytes;
	u16 header;
	size_t deflate_in_nbytes;
	size_t produced;
	enum libdeflate_result status;

	/* Too short to contain even the header and the footer. */
	if (in_nbytes < ZLIB_MIN_OVERHEAD)
		return LIBDEFLATE_BAD_DATA;

	/* 2 byte header: CMF and FLG */
	header = get_unaligned_be16(src);
	src += 2;

	/*
	 * Reject the header if FCHECK is wrong, CM is not DEFLATE, CINFO is
	 * above the 32K window value, or FDICT (preset dictionary) is set.
	 */
	if ((header % 31) != 0 ||
	    ((header >> 8) & 0xF) != ZLIB_CM_DEFLATE ||
	    (header >> 12) > ZLIB_CINFO_32K_WINDOW ||
	    ((header >> 5) & 1))
		return LIBDEFLATE_BAD_DATA;

	/* Compressed data: everything between the header and the footer */
	status = libdeflate_deflate_decompress_ex(d, src,
					src_end - ZLIB_FOOTER_SIZE - src,
					out, out_nbytes_avail,
					&deflate_in_nbytes,
					actual_out_nbytes_ret);
	if (status != LIBDEFLATE_SUCCESS)
		return status;

	/* Without a size out-parameter, the whole buffer is treated as output. */
	produced = actual_out_nbytes_ret ? *actual_out_nbytes_ret
					 : out_nbytes_avail;
	src += deflate_in_nbytes;

	/* ADLER32 */
	if (libdeflate_adler32(1, out, produced) != get_unaligned_be32(src))
		return LIBDEFLATE_BAD_DATA;
	src += 4;

	if (actual_in_nbytes_ret)
		*actual_in_nbytes_ret = src - (const u8 *)in;
	return LIBDEFLATE_SUCCESS;
}
/*
 * Convenience form of libdeflate_zlib_decompress_ex() for callers that do
 * not need to know how many input bytes were consumed.
 */
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
			   const void *in, size_t in_nbytes,
			   void *out, size_t out_nbytes_avail,
			   size_t *actual_out_nbytes_ret)
{
	/* NULL: the consumed input size is not reported back. */
	size_t * const unused_in_nbytes_ret = NULL;

	return libdeflate_zlib_decompress_ex(d, in, in_nbytes,
					     out, out_nbytes_avail,
					     unused_in_nbytes_ret,
					     actual_out_nbytes_ret);
}

View file

@ -0,0 +1,540 @@
/*
* libdeflate.h - public header for libdeflate
*/
#ifndef LIBDEFLATE_H
#define LIBDEFLATE_H
#ifdef __cplusplus
extern "C" {
#endif
#define LIBDEFLATE_VERSION_MAJOR 1
#define LIBDEFLATE_VERSION_MINOR 8
#define LIBDEFLATE_VERSION_STRING "1.8"
#include <stddef.h>
#include <stdint.h>
/*
* On Windows, if you want to link to the DLL version of libdeflate, then
* #define LIBDEFLATE_DLL. Note that the calling convention is "stdcall".
*/
#ifdef LIBDEFLATE_DLL
# ifdef BUILDING_LIBDEFLATE
# define LIBDEFLATEEXPORT LIBEXPORT
# elif defined(_WIN32) || defined(__CYGWIN__)
# define LIBDEFLATEEXPORT __declspec(dllimport)
# endif
#endif
#ifndef LIBDEFLATEEXPORT
# define LIBDEFLATEEXPORT
#endif
#if defined(_WIN32) && !defined(_WIN64)
# define LIBDEFLATEAPI_ABI __stdcall
#else
# define LIBDEFLATEAPI_ABI
#endif
#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \
defined(_WIN32) && !defined(_WIN64)
/*
* On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4.
* Realign the stack when entering libdeflate to avoid crashing in SSE/AVX
* code when called from an MSVC-compiled application.
*/
# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer))
#else
# define LIBDEFLATEAPI_STACKALIGN
#endif
#define LIBDEFLATEAPI LIBDEFLATEAPI_ABI LIBDEFLATEAPI_STACKALIGN
/* ========================================================================== */
/* Compression */
/* ========================================================================== */
struct libdeflate_compressor;
/*
* libdeflate_alloc_compressor() allocates a new compressor that supports
* DEFLATE, zlib, and gzip compression. 'compression_level' is the compression
* level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
* medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means
* "no compression", specifically "create a valid stream, but only emit
* uncompressed blocks" (this will expand the data slightly).
*
* The return value is a pointer to the new compressor, or NULL if out of memory
* or if the compression level is invalid (i.e. outside the range [0, 12]).
*
* Note: for compression, the sliding window size is defined at compilation time
* to 32768, the largest size permissible in the DEFLATE format. It cannot be
* changed at runtime.
*
* A single compressor is not safe to use by multiple threads concurrently.
* However, different threads may use different compressors concurrently.
*/
LIBDEFLATEEXPORT struct libdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_compressor(int compression_level);
/*
* libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
* data. The function attempts to compress 'in_nbytes' bytes of data located at
* 'in' and write the results to 'out', which has space for 'out_nbytes_avail'
* bytes. The return value is the compressed size in bytes, or 0 if the data
* could not be compressed to 'out_nbytes_avail' bytes or fewer.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
* number of bytes of compressed data that may be produced by compressing any
* buffer of length less than or equal to 'in_nbytes' using
* libdeflate_deflate_compress() with the specified compressor. Mathematically,
* this bound will necessarily be a number greater than or equal to 'in_nbytes'.
* It may be an overestimate of the true upper bound. The return value is
* guaranteed to be the same for all invocations with the same compressor and
* same 'in_nbytes'.
*
* As a special case, 'compressor' may be NULL. This causes the bound to be
* taken across *any* libdeflate_compressor that could ever be allocated with
* this build of the library, with any options.
*
* Note that this function is not necessary in many applications. With
* block-based compression, it is usually preferable to separately store the
* uncompressed size of each block and to store any blocks that did not compress
* to less than their original size uncompressed. In that scenario, there is no
* need to know the worst-case compressed size, since the maximum number of
* bytes of compressed data that may be used would always be one less than the
* input length. You can just pass a buffer of that size to
* libdeflate_deflate_compress() and store the data uncompressed if
* libdeflate_deflate_compress() returns 0, indicating that the compressed data
* did not fit into the provided output buffer.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
/*
* Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
* format.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* Like libdeflate_deflate_compress_bound(), but assumes the data will be
* compressed with libdeflate_zlib_compress() rather than with
* libdeflate_deflate_compress().
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
/*
* Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
* format.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* Like libdeflate_deflate_compress_bound(), but assumes the data will be
* compressed with libdeflate_gzip_compress() rather than with
* libdeflate_deflate_compress().
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
/*
* libdeflate_free_compressor() frees a compressor that was allocated with
* libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is
* taken.
*/
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_compressor(struct libdeflate_compressor *compressor);
/* ========================================================================== */
/* Decompression */
/* ========================================================================== */
struct libdeflate_decompressor;
/*
* libdeflate_alloc_decompressor() allocates a new decompressor that can be used
* for DEFLATE, zlib, and gzip decompression. The return value is a pointer to
* the new decompressor, or NULL if out of memory.
*
* This function takes no parameters, and the returned decompressor is valid for
* decompressing data that was compressed at any compression level and with any
* sliding window size.
*
* A single decompressor is not safe to use by multiple threads concurrently.
* However, different threads may use different decompressors concurrently.
*/
LIBDEFLATEEXPORT struct libdeflate_decompressor * LIBDEFLATEAPI
libdeflate_alloc_decompressor(void);
/*
* Result of a call to libdeflate_deflate_decompress(),
* libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
*/
enum libdeflate_result {
/* Decompression was successful. */
LIBDEFLATE_SUCCESS = 0,
/* Decompression failed because the compressed data was invalid, corrupt,
* or otherwise unsupported. */
LIBDEFLATE_BAD_DATA = 1,
/* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
* decompressed to fewer than 'out_nbytes_avail' bytes. */
LIBDEFLATE_SHORT_OUTPUT = 2,
/* The data would have decompressed to more than 'out_nbytes_avail'
* bytes. */
LIBDEFLATE_INSUFFICIENT_SPACE = 3,
};
/*
* libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
* from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The
* uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
* bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
* Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* a nonzero result code is returned, then the contents of the output buffer are
* undefined.
*
* Decompression stops at the end of the DEFLATE stream (as indicated by the
* BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
*
* libdeflate_deflate_decompress() can be used in cases where the actual
* uncompressed size is known (recommended) or unknown (not recommended):
*
* - If the actual uncompressed size is known, then pass the actual
* uncompressed size as 'out_nbytes_avail' and pass NULL for
* 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail
* with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
* specified number of bytes.
*
* - If the actual uncompressed size is unknown, then provide a non-NULL
* 'actual_out_nbytes_ret' and provide a buffer with some size
* 'out_nbytes_avail' that you think is large enough to hold all the
* uncompressed data. In this case, if the data decompresses to less than
* or equal to 'out_nbytes_avail' bytes, then
* libdeflate_deflate_decompress() will write the actual uncompressed size
* to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise,
* it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
* not large enough but no other problems were encountered, or another
* nonzero result code if decompression failed for another reason.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
* then the actual compressed size of the DEFLATE stream (aligned to the next
* byte boundary) is written to *actual_in_nbytes_ret.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
* instead of raw DEFLATE.
*
* Decompression will stop at the end of the zlib stream, even if it is shorter
* than 'in_nbytes'. If you need to know exactly where the zlib stream ended,
* use libdeflate_zlib_decompress_ex().
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
* succeeds (indicating that the first zlib-compressed stream in the input
* buffer was decompressed), then the actual number of input bytes consumed is
* written to *actual_in_nbytes_ret.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
* instead of raw DEFLATE.
*
* If multiple gzip-compressed members are concatenated, then only the first
* will be decompressed. Use libdeflate_gzip_decompress_ex() if you need
* multi-member support.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
* succeeds (indicating that the first gzip-compressed member in the input
* buffer was decompressed), then the actual number of input bytes consumed is
* written to *actual_in_nbytes_ret.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* libdeflate_free_decompressor() frees a decompressor that was allocated with
* libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action
* is taken.
*/
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
/* ========================================================================== */
/* Checksums */
/* ========================================================================== */
/*
* libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
* data and returns the updated checksum. When starting a new checksum, the
* required initial value for 'adler' is 1. This value is also returned when
* 'buffer' is specified as NULL.
*/
LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
libdeflate_adler32(uint32_t adler, const void *buffer, size_t len);
/*
* libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
* and returns the updated checksum. When starting a new checksum, the required
* initial value for 'crc' is 0. This value is also returned when 'buffer' is
* specified as NULL.
*/
LIBDEFLATEEXPORT uint32_t LIBDEFLATEAPI
libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
/* ========================================================================== */
/* Custom memory allocator */
/* ========================================================================== */
/*
* Install a custom memory allocator which libdeflate will use for all memory
* allocations. 'malloc_func' is a function that must behave like malloc(), and
* 'free_func' is a function that must behave like free().
*
* There must not be any libdeflate_compressor or libdeflate_decompressor
* structures in existence when calling this function.
*/
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
void (*free_func)(void *));
/* ========================================================================== */
/* NVIDIA GDEFLATE-related API */
/* ========================================================================== */
/* ========================================================================== */
/* Compression */
/* ========================================================================== */
struct libdeflate_gdeflate_compressor;
/*
* Describes one caller-allocated output page.  It is the element type of the
* 'out_pages' arrays taken by libdeflate_gdeflate_compress(),
* libdeflate_gdeflate_compress_ex() and libdeflate_gdeflate_decompress_ex().
*/
struct libdeflate_gdeflate_out_page {
/* Buffer for compressed GDEFLATE page data. */
void *data;
/* Buffer size in bytes. If compression succeeded this field will
* contain the size of compressed GDEFLATE page.
*/
size_t nbytes;
};
/*
* Describes one read-only input page.  It is the element type of the
* 'in_pages' arrays taken by the GDEFLATE decompression functions.
*
* NOTE(review): libdeflate_gdeflate_compress_ex() also takes an array of this
* struct as its input, where 'data' presumably refers to uncompressed bytes
* rather than a compressed page - confirm against the implementation.
*/
struct libdeflate_gdeflate_in_page {
/* Compressed GDEFLATE page data. */
const void* data;
/* Size in bytes of compressed GDEFLATE page. */
size_t nbytes;
};
/*
* libdeflate_alloc_gdeflate_compressor() allocates a new compressor that
* supports GDEFLATE compression. 'compression_level' is the compression
* level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
* medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means
* "no compression", specifically "create a valid stream, but only emit
* uncompressed blocks" (this will expand the data slightly).
*
* The return value is a pointer to the new compressor, or NULL if out of memory
* or if the compression level is invalid (i.e. outside the range [0, 12]).
*
* Note: for compression, the sliding window size is defined at compilation time
* to 65536, the largest size permissible in the GDEFLATE format. It cannot be
* changed at runtime.
*
* A single compressor is not safe to use by multiple threads concurrently.
* However, different threads may use different compressors concurrently.
*/
LIBDEFLATEEXPORT struct libdeflate_gdeflate_compressor * LIBDEFLATEAPI
libdeflate_alloc_gdeflate_compressor(int compression_level);
/*
* libdeflate_gdeflate_compress() performs raw GDEFLATE compression on a buffer
* of data. The function attempts to compress 'in_nbytes' bytes of data located
* at 'in' and writes page results to 'out_pages', which has preallocated 'data'
* with 'nbytes' space for each page. To determine the number of pages
* 'out_npages' the input data will be split into and the upper bound on
* compressed data use the libdeflate_gdeflate_compress_bound() function. The
* return value is the compressed size in bytes, or 0 if the data could not be
* compressed. If compression succeeded the size of each compressed page will
* be written to 'nbytes' field of 'out_pages'.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress(struct libdeflate_gdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
struct libdeflate_gdeflate_out_page *out_pages,
size_t out_npages);
/*
* libdeflate_gdeflate_compress_ex() performs raw GDEFLATE compression on a
* set of pages. The function attempts to compress 'npages' pages from
* 'in_pages' and writes the page results to 'out_pages', each of which has a
* preallocated 'data' buffer with 'nbytes' of space. To determine the number
* of pages the input data will be split into, and the upper bound on the
* compressed data size, use the libdeflate_gdeflate_compress_bound() function.
* The return value is the number of processed pages. If compression succeeded,
* the size of each compressed page will be written to the 'nbytes' field of
* the corresponding entry in 'out_pages'.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress_ex(struct libdeflate_gdeflate_compressor *compressor,
const struct libdeflate_gdeflate_in_page* in_pages,
struct libdeflate_gdeflate_out_page* out_pages, size_t npages);
/*
* libdeflate_gdeflate_compress_bound() returns a worst-case upper bound on the
* number of bytes of compressed data that may be produced by compressing any
* buffer of length less than or equal to 'in_nbytes' using
* libdeflate_gdeflate_compress() with the specified compressor. Mathematically,
* this bound will necessarily be a number greater than or equal to 'in_nbytes'.
* It may be an overestimate of the true upper bound. The return value is
* guaranteed to be the same for all invocations with the same compressor and
* same 'in_nbytes'. The 'out_npages' will contain the number of GDEFLATE pages
* the input data will be split into. This number should be used to preallocate
* the page array for libdeflate_gdeflate_compress(). The upper bound on the
* number of bytes of compressed data in a single page can be found by dividing
* the function result by the 'out_npages' value.
*
* As a special case, 'compressor' may be NULL. This causes the bound to be
* taken across *any* libdeflate_compressor that could ever be allocated with
* this build of the library, with any options.
*/
LIBDEFLATEEXPORT size_t LIBDEFLATEAPI
libdeflate_gdeflate_compress_bound(struct libdeflate_gdeflate_compressor *comp,
size_t in_nbytes, size_t *out_npages);
/*
* libdeflate_free_gdeflate_compressor() frees a compressor that was allocated
* with libdeflate_alloc_gdeflate_compressor(). If a NULL pointer is passed in,
* no action is taken.
*/
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_gdeflate_compressor(struct libdeflate_gdeflate_compressor *comp);
/* ========================================================================== */
/* Decompression */
/* ========================================================================== */
struct libdeflate_gdeflate_decompressor;
/*
* libdeflate_alloc_gdeflate_decompressor() allocates a new decompressor that
* can be used for GDEFLATE decompression. The return value is a pointer to
* the new decompressor, or NULL if out of memory.
*
* This function takes no parameters, and the returned decompressor is valid for
* decompressing data that was compressed at any compression level and with any
* sliding window size.
*
* A single decompressor is not safe to use by multiple threads concurrently.
* However, different threads may use different decompressors concurrently.
*/
LIBDEFLATEEXPORT struct libdeflate_gdeflate_decompressor * LIBDEFLATEAPI
libdeflate_alloc_gdeflate_decompressor(void);
/*
* libdeflate_gdeflate_decompress() decompresses the GDEFLATE-compressed pages
* from the 'in_pages' array with 'in_npages' members. The uncompressed data is
* written to 'out', a buffer with size 'out_nbytes_avail' bytes.
* If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
* Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* a nonzero result code is returned, then the contents of the output buffer are
* undefined.
*
* libdeflate_gdeflate_decompress() can be used only in cases where the actual
* uncompressed size is known.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gdeflate_decompress(struct libdeflate_gdeflate_decompressor *decomp,
struct libdeflate_gdeflate_in_page *in_pages,
size_t in_npages, void *out,
size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* libdeflate_gdeflate_decompress_ex() decompresses the GDEFLATE-compressed pages
* from the 'in_pages' array with 'npages' members. The uncompressed data is
* written to 'out_pages'.
* If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
* Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* a nonzero result code is returned, then the contents of the output buffer are
* undefined.
*
* libdeflate_gdeflate_decompress_ex() can be used only in cases where the actual
* uncompressed size is known.
*/
LIBDEFLATEEXPORT enum libdeflate_result LIBDEFLATEAPI
libdeflate_gdeflate_decompress_ex(struct libdeflate_gdeflate_decompressor *decomp,
struct libdeflate_gdeflate_in_page *in_pages,
struct libdeflate_gdeflate_out_page* out_pages, size_t npages);
/*
* libdeflate_free_gdeflate_decompressor() frees a decompressor that was
* allocated with libdeflate_alloc_gdeflate_decompressor(). If a NULL pointer
* is passed in, no action is taken.
*/
LIBDEFLATEEXPORT void LIBDEFLATEAPI
libdeflate_free_gdeflate_decompressor(struct libdeflate_gdeflate_decompressor *decomp);
#ifdef __cplusplus
}
#endif
#endif /* LIBDEFLATE_H */

View file

@ -4,29 +4,76 @@ namespace fs = std::filesystem;
using namespace Crafter; using namespace Crafter;
extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> args) { extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> args) {
GitProjectSpec mathSpec{ std::vector<std::string> depArgs(args.begin(), args.end());
.source = { .url = "https://forgejo.catcrafts.net/Catcrafts/Crafter.Math.git" },
.args = std::vector<std::string>(args.begin(), args.end()), bool useLocal = false;
}; for (std::string_view a : args) {
Configuration* math = GitProject(mathSpec); if (a == "--local") { useLocal = true; break; }
}
Configuration* math = useLocal
? LocalProject({
.projectFile = "../Crafter.Math/project.cpp",
.args = depArgs,
})
: GitProject({
.source = { .url = "https://forgejo.catcrafts.net/Catcrafts/Crafter.Math.git" },
.args = depArgs,
});
Configuration cfg; Configuration cfg;
cfg.path = "./"; cfg.path = "./";
cfg.name = "Crafter.Asset"; cfg.name = "Crafter.Asset";
// Default to library so consumers (Crafter.Graphics, examples) link against
// libCrafter.Asset.a — the Compression module's non-template entry points
// need to be in a real archive, not just PCM-inline. `--exe` (via the
// example's main.cpp) flips us to Executable to produce crafter-asset.
cfg.type = ConfigurationType::LibraryStatic;
ApplyStandardArgs(cfg, args); ApplyStandardArgs(cfg, args);
bool wantExe = false;
for (std::string_view a : args) {
if (a == "--exe") { wantExe = true; break; }
}
if (wantExe) cfg.type = ConfigurationType::Executable;
cfg.outputName = (cfg.type == ConfigurationType::Executable) ? "crafter-asset" : "Crafter.Asset"; cfg.outputName = (cfg.type == ConfigurationType::Executable) ? "crafter-asset" : "Crafter.Asset";
cfg.dependencies = { math }; cfg.dependencies = { math };
std::array<fs::path, 3> ifaces = { // Vendored GDeflate (Apache-2.0, Microsoft DirectStorage reference) +
// libdeflate (MIT, NVIDIA fork). The C++ wrappers live inline in the
// Compression module impl; libdeflate's .c sources are compiled here.
cfg.compileFlags.push_back("-Ilib/gdeflate/libdeflate");
const std::array<fs::path, 12> libdeflateSources = {
"lib/gdeflate/libdeflate/lib/adler32",
"lib/gdeflate/libdeflate/lib/crc32",
"lib/gdeflate/libdeflate/lib/deflate_compress",
"lib/gdeflate/libdeflate/lib/deflate_decompress",
"lib/gdeflate/libdeflate/lib/gdeflate_compress",
"lib/gdeflate/libdeflate/lib/gdeflate_decompress",
"lib/gdeflate/libdeflate/lib/gzip_compress",
"lib/gdeflate/libdeflate/lib/gzip_decompress",
"lib/gdeflate/libdeflate/lib/utils",
"lib/gdeflate/libdeflate/lib/zlib_compress",
"lib/gdeflate/libdeflate/lib/zlib_decompress",
"lib/gdeflate/libdeflate/lib/x86/cpu_features",
};
for (const fs::path& p : libdeflateSources) cfg.cFiles.push_back(p);
std::array<fs::path, 4> ifaces = {
"interfaces/Crafter.Asset", "interfaces/Crafter.Asset",
"interfaces/Crafter.Asset-Compression",
"interfaces/Crafter.Asset-Texture", "interfaces/Crafter.Asset-Texture",
"interfaces/Crafter.Asset-Mesh", "interfaces/Crafter.Asset-Mesh",
}; };
if (cfg.type == ConfigurationType::Executable) { if (cfg.type == ConfigurationType::Executable) {
std::array<fs::path, 1> impls = { "implementations/main" }; std::array<fs::path, 2> impls = {
"implementations/Crafter.Asset-Compression",
"implementations/main",
};
cfg.GetInterfacesAndImplementations(ifaces, impls); cfg.GetInterfacesAndImplementations(ifaces, impls);
} else { } else {
std::array<fs::path, 0> impls = {}; std::array<fs::path, 1> impls = {
"implementations/Crafter.Asset-Compression",
};
cfg.GetInterfacesAndImplementations(ifaces, impls); cfg.GetInterfacesAndImplementations(ifaces, impls);
} }