/*
 * SPDX-FileCopyrightText: Copyright (c) 2020, 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) Microsoft Corporation. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
||
|
|
|
||
|
|
#include "GDeflate.h"
#include "TileStream.h"
#include "Utils.h"
#include "config.h"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <atomic>
#include <memory>
#include <thread>
#include <vector>

#include "libdeflate/libdeflate.h"
|
||
|
|
|
||
|
|
template<>
|
||
|
|
struct std::default_delete<libdeflate_gdeflate_compressor>
|
||
|
|
{
|
||
|
|
void operator()(libdeflate_gdeflate_compressor* p) const
|
||
|
|
{
|
||
|
|
libdeflate_free_gdeflate_compressor(p);
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
namespace GDeflate
|
||
|
|
{
|
||
|
|
// Upper limit on the number of helper threads DoCompress starts. The calling
// thread compresses tiles as well and is not included in this count.
constexpr uint32_t kMaxWorkers = 31;

// DoCompress starts one helper thread per (started) group of this many tiles,
// subject to kMaxWorkers and the reported hardware concurrency.
constexpr uint32_t kMinTilesPerWorker = 64;
|
||
|
|
|
||
|
|
// Bounds-checked write cursor over a caller-provided byte buffer.
//
// The wrapper does not own any memory: the user assigns `ptr` and `size`
// before writing, and `pos` tracks the offset of the next write.
struct OutputStreamWrapper
{
    uint8_t* ptr = nullptr; // first byte of the destination buffer
    size_t size = 0;        // capacity of the destination buffer, in bytes
    size_t pos = 0;         // offset at which the next StreamOut() writes

    // Copies `n` objects of type T from `d` to the current position and moves
    // the position past them.
    //
    // Returns true when the bytes were written. When the request does not fit
    // into the space left in the buffer (or its byte count cannot be
    // represented in size_t) a diagnostic is printed, nothing is written, the
    // position stays where it was and false is returned.
    template<typename T>
    bool StreamOut(T* d, size_t n = 1)
    {
        // Largest element count whose byte size still fits into size_t.
        const size_t maxCount = ~static_cast<size_t>(0) / sizeof(T);

        // Compare against the space that is left (`size - pos`) rather than
        // forming `pos + dataSize`, which could wrap around and wrongly pass.
        if (n > maxCount || pos > size || sizeof(T) * n > size - pos)
        {
            printf("Fatal: stream overrun!\n");
            return false;
        }

        const size_t dataSize = sizeof(T) * n;

        // A zero-length write is a no-op; skipping it also keeps memcpy from
        // ever seeing a null source or destination pointer.
        if (dataSize != 0)
        {
            memcpy(ptr + pos, d, dataSize);
            pos += dataSize;
        }

        return true;
    }

    // Moves the write position to absolute offset `n`.
    //
    // Offsets up to and including `size` are accepted. Anything larger prints
    // a diagnostic, leaves the position unchanged and returns false.
    bool SetPos(size_t n)
    {
        if (n > size)
        {
            printf("Fatal: stream overrun!\n");
            return false;
        }

        pos = n;
        return true;
    }
};
|
||
|
|
|
||
|
|
// State shared between the thread that drives a compression run and the
// worker jobs that compress individual tiles.
//
// Every member carries a default initializer so that a default-constructed
// context starts from a well-defined state.
struct CompressionContext
{
    const uint8_t* inputPtr = nullptr; // start of the uncompressed input
    size_t inputSize = 0;              // length of the input, in bytes

    // Result slot for one tile.
    struct Tile
    {
        std::vector<uint8_t> data;   // compressed bytes of the tile
        size_t uncompressedSize = 0; // number of input bytes the tile covers
    };

    // One slot per tile, indexed by tile number.
    std::vector<Tile> tiles;

    // Next tile index to hand out; jobs claim work with fetch_add.
    std::atomic_uint32_t globalIndex{0};
    // Total number of tiles; claimed indices at or above this mean "no work left".
    uint32_t numItems = 0;

    // Set by a job when compressing a tile fails.
    std::atomic_bool failed{false};
};
|
||
|
|
|
||
|
|
static bool DoCompress(
|
||
|
|
uint8_t* output,
|
||
|
|
size_t* outputSize,
|
||
|
|
const uint8_t* in,
|
||
|
|
size_t inSize,
|
||
|
|
uint32_t level,
|
||
|
|
uint32_t flags)
|
||
|
|
{
|
||
|
|
if (outputSize == nullptr || output == nullptr || in == nullptr || inSize == 0)
|
||
|
|
return false;
|
||
|
|
|
||
|
|
if (inSize > kDefaultTileSize * TileStream::kMaxTiles)
|
||
|
|
return false;
|
||
|
|
|
||
|
|
CompressionContext context{};
|
||
|
|
|
||
|
|
context.inputPtr = in;
|
||
|
|
context.inputSize = inSize;
|
||
|
|
context.numItems = static_cast<uint32_t>((inSize + kDefaultTileSize - 1) / kDefaultTileSize);
|
||
|
|
context.tiles.resize(context.numItems);
|
||
|
|
context.failed = false;
|
||
|
|
|
||
|
|
auto TileCompressionJob = [&context, level]()
|
||
|
|
{
|
||
|
|
size_t pageCount = 0;
|
||
|
|
const size_t scratchSize = libdeflate_gdeflate_compress_bound(nullptr, kDefaultTileSize, &pageCount);
|
||
|
|
assert(pageCount == 1);
|
||
|
|
|
||
|
|
void* scratch = alloca(scratchSize);
|
||
|
|
|
||
|
|
std::unique_ptr<libdeflate_gdeflate_compressor> compressor(libdeflate_alloc_gdeflate_compressor(level));
|
||
|
|
|
||
|
|
while (true)
|
||
|
|
{
|
||
|
|
const uint32_t tileIndex = context.globalIndex.fetch_add(1, std::memory_order_relaxed);
|
||
|
|
|
||
|
|
if (tileIndex >= context.numItems)
|
||
|
|
break;
|
||
|
|
|
||
|
|
const size_t tilePos = tileIndex * kDefaultTileSize;
|
||
|
|
|
||
|
|
size_t remaining = context.inputSize - tilePos;
|
||
|
|
size_t uncompressedSize = std::min<size_t>(remaining, kDefaultTileSize);
|
||
|
|
|
||
|
|
auto& tile = context.tiles[tileIndex];
|
||
|
|
tile.uncompressedSize = uncompressedSize;
|
||
|
|
|
||
|
|
libdeflate_gdeflate_out_page compressedPage{scratch, scratchSize};
|
||
|
|
|
||
|
|
size_t result = libdeflate_gdeflate_compress(
|
||
|
|
compressor.get(),
|
||
|
|
context.inputPtr + tilePos,
|
||
|
|
uncompressedSize,
|
||
|
|
&compressedPage,
|
||
|
|
1);
|
||
|
|
|
||
|
|
if (result == 0)
|
||
|
|
{
|
||
|
|
context.failed = true;
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
|
||
|
|
tile.data.resize(compressedPage.nbytes);
|
||
|
|
memcpy(tile.data.data(), compressedPage.data, compressedPage.nbytes);
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
std::thread workers[kMaxWorkers + 1];
|
||
|
|
const uint32_t maxWorkers = std::min(kMaxWorkers, std::thread::hardware_concurrency());
|
||
|
|
|
||
|
|
uint32_t numWorkersLeft =
|
||
|
|
std::min(maxWorkers, (context.numItems + kMinTilesPerWorker - 1) / kMinTilesPerWorker);
|
||
|
|
|
||
|
|
if (flags & COMPRESS_SINGLE_THREAD)
|
||
|
|
numWorkersLeft = 0;
|
||
|
|
|
||
|
|
for (auto& worker : workers)
|
||
|
|
{
|
||
|
|
if (numWorkersLeft == 0)
|
||
|
|
break;
|
||
|
|
|
||
|
|
worker = std::thread([TileCompressionJob]() { TileCompressionJob(); });
|
||
|
|
|
||
|
|
--numWorkersLeft;
|
||
|
|
}
|
||
|
|
|
||
|
|
TileCompressionJob();
|
||
|
|
|
||
|
|
for (auto& worker : workers)
|
||
|
|
{
|
||
|
|
if (worker.joinable())
|
||
|
|
worker.join();
|
||
|
|
}
|
||
|
|
|
||
|
|
// Compression failed
|
||
|
|
if (context.failed)
|
||
|
|
return false;
|
||
|
|
|
||
|
|
// Compression is done. Prepare the output stream.
|
||
|
|
|
||
|
|
std::vector<uint32_t> tilePtrs;
|
||
|
|
size_t dataPos = 0;
|
||
|
|
|
||
|
|
for (auto const& tile : context.tiles)
|
||
|
|
{
|
||
|
|
tilePtrs.push_back(static_cast<uint32_t>(dataPos));
|
||
|
|
dataPos += tile.data.size();
|
||
|
|
}
|
||
|
|
|
||
|
|
// tilePtrs[0] is used to store the size of the last tile; all the other
|
||
|
|
// elements are offsets to the tile data.
|
||
|
|
tilePtrs[0] = static_cast<uint32_t>(context.tiles.back().data.size());
|
||
|
|
|
||
|
|
assert(tilePtrs.size() <= TileStream::kMaxTiles);
|
||
|
|
assert(tilePtrs.size() == context.tiles.size());
|
||
|
|
assert(tilePtrs.size() == context.numItems);
|
||
|
|
|
||
|
|
OutputStreamWrapper outputStream;
|
||
|
|
outputStream.ptr = output;
|
||
|
|
outputStream.size = *outputSize;
|
||
|
|
|
||
|
|
// calculate uncompressed size
|
||
|
|
size_t uncompressedSize = tilePtrs.size() * kDefaultTileSize;
|
||
|
|
size_t tailSize = context.inputSize - (tilePtrs.size() - 1) * kDefaultTileSize;
|
||
|
|
if (tailSize < kDefaultTileSize)
|
||
|
|
uncompressedSize -= kDefaultTileSize - tailSize;
|
||
|
|
|
||
|
|
assert(uncompressedSize == inSize);
|
||
|
|
|
||
|
|
TileStream header(uncompressedSize);
|
||
|
|
|
||
|
|
assert(tilePtrs.size() == header.numTiles);
|
||
|
|
|
||
|
|
outputStream.StreamOut(&header);
|
||
|
|
outputStream.StreamOut(tilePtrs.data(), tilePtrs.size());
|
||
|
|
|
||
|
|
size_t dataOffset = outputStream.pos;
|
||
|
|
|
||
|
|
for (size_t i = 0; i < tilePtrs.size(); ++i)
|
||
|
|
{
|
||
|
|
CompressionContext::Tile const& tile = context.tiles[i];
|
||
|
|
uint32_t tileOffset = (i == 0) ? 0 : tilePtrs[i];
|
||
|
|
|
||
|
|
outputStream.SetPos(dataOffset + tileOffset);
|
||
|
|
outputStream.StreamOut(tile.data.data(), tile.data.size());
|
||
|
|
}
|
||
|
|
|
||
|
|
*outputSize = outputStream.pos;
|
||
|
|
|
||
|
|
return true;
|
||
|
|
}
|
||
|
|
|
||
|
|
size_t CompressBound(size_t size)
|
||
|
|
{
|
||
|
|
size_t numTiles = std::min(size_t(TileStream::kMaxTiles), (size + kDefaultTileSize - 1) / kDefaultTileSize);
|
||
|
|
numTiles = std::max(size_t(1), numTiles);
|
||
|
|
|
||
|
|
const size_t tileSize = kDefaultTileSize +
|
||
|
|
// Tile header. Ideally need to make it a part of compressor API.
|
||
|
|
(sizeof(uint32_t) + 4 * 208 + 4 * 8);
|
||
|
|
|
||
|
|
return numTiles * tileSize + sizeof(TileStream) + sizeof(uint64_t);
|
||
|
|
}
|
||
|
|
|
||
|
|
bool Compress(uint8_t* output, size_t* outputSize, const uint8_t* in, size_t inSize, uint32_t level, uint32_t flags)
|
||
|
|
{
|
||
|
|
return DoCompress(output, outputSize, in, inSize, level, flags);
|
||
|
|
}
|
||
|
|
|
||
|
|
} // namespace GDeflate
|