asset compression
This commit is contained in:
parent
b9b9ecb84c
commit
30a283c1b3
57 changed files with 13237 additions and 8 deletions
337
lib/gdeflate/libdeflate/lib/x86/adler32_impl.h
Normal file
337
lib/gdeflate/libdeflate/lib/x86/adler32_impl.h
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
/*
|
||||
* x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
|
||||
*
|
||||
* Copyright 2016 Eric Biggers
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef LIB_X86_ADLER32_IMPL_H
|
||||
#define LIB_X86_ADLER32_IMPL_H
|
||||
|
||||
#include "cpu_features.h"
|
||||
|
||||
/*
|
||||
* The following macros horizontally sum the s1 counters and add them to the
|
||||
* real s1, and likewise for s2. They do this via a series of reductions, each
|
||||
* of which halves the vector length, until just one counter remains.
|
||||
*
|
||||
* The s1 reductions don't depend on the s2 reductions and vice versa, so for
|
||||
* efficiency they are interleaved. Also, every other s1 counter is 0 due to
|
||||
* the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
|
||||
* 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
|
||||
*/
|
||||
|
||||
#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
|
||||
{ \
|
||||
__v4su s1_last = (v_s1), s2_last = (v_s2); \
|
||||
\
|
||||
/* 128 => 32 bits */ \
|
||||
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \
|
||||
s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \
|
||||
s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \
|
||||
\
|
||||
*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \
|
||||
*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \
|
||||
}
|
||||
|
||||
#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
|
||||
{ \
|
||||
__v4su s1_128bit, s2_128bit; \
|
||||
\
|
||||
/* 256 => 128 bits */ \
|
||||
s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \
|
||||
(__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1); \
|
||||
s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \
|
||||
(__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1); \
|
||||
\
|
||||
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
|
||||
}
|
||||
|
||||
#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
|
||||
{ \
|
||||
__v8su s1_256bit, s2_256bit; \
|
||||
\
|
||||
/* 512 => 256 bits */ \
|
||||
s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
|
||||
(__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \
|
||||
s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
|
||||
(__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \
|
||||
\
|
||||
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
|
||||
}
|
||||
|
||||
/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
|
||||
#undef DISPATCH_AVX512BW
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
/*
|
||||
* clang before v3.9 is missing some AVX-512BW intrinsics including
|
||||
* _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using
|
||||
* AVX-512BW, even when __AVX512BW__ is defined, conditional on
|
||||
* COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
|
||||
*/ \
|
||||
COMPILER_SUPPORTS_AVX512BW_TARGET && \
|
||||
(defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
|
||||
# define FUNCNAME adler32_avx512bw
|
||||
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
|
||||
# define IMPL_ALIGNMENT 64
|
||||
# define IMPL_SEGMENT_SIZE 64
|
||||
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
|
||||
# ifdef __AVX512BW__
|
||||
# define ATTRIBUTES
|
||||
# define DEFAULT_IMPL adler32_avx512bw
|
||||
# else
|
||||
# define ATTRIBUTES __attribute__((target("avx512bw")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_AVX512BW 1
|
||||
# endif
|
||||
# include <immintrin.h>
|
||||
static forceinline ATTRIBUTES void
|
||||
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
|
||||
u32 *s1, u32 *s2)
|
||||
{
|
||||
const __m512i zeroes = _mm512_setzero_si512();
|
||||
const __v64qi multipliers = (__v64qi){
|
||||
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
|
||||
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
|
||||
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
|
||||
};
|
||||
const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
|
||||
__v16si v_s1 = (__v16si)zeroes;
|
||||
__v16si v_s1_sums = (__v16si)zeroes;
|
||||
__v16si v_s2 = (__v16si)zeroes;
|
||||
|
||||
do {
|
||||
/* Load the next 64-byte segment */
|
||||
__m512i bytes = *p++;
|
||||
|
||||
/* Multiply the bytes by 64...1 (the number of times they need
|
||||
* to be added to s2) and add adjacent products */
|
||||
__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
|
||||
bytes, (__m512i)multipliers);
|
||||
|
||||
/* Keep sum of all previous s1 counters, for adding to s2 later.
|
||||
* This allows delaying the multiplication by 64 to the end. */
|
||||
v_s1_sums += v_s1;
|
||||
|
||||
/* Add the sum of each group of 8 bytes to the corresponding s1
|
||||
* counter */
|
||||
v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
|
||||
|
||||
/* Add the sum of each group of 4 products of the bytes by
|
||||
* 64...1 to the corresponding s2 counter */
|
||||
v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
|
||||
(__m512i)ones);
|
||||
} while (p != end);
|
||||
|
||||
/* Finish the s2 counters by adding the sum of the s1 values at the
|
||||
* beginning of each segment, multiplied by the segment size (64) */
|
||||
v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
|
||||
|
||||
/* Add the counters to the real s1 and s2 */
|
||||
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
|
||||
}
|
||||
# include "../adler32_vec_template.h"
|
||||
#endif /* AVX-512BW implementation */
|
||||
|
||||
/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
|
||||
#undef DISPATCH_AVX2
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
|
||||
# define FUNCNAME adler32_avx2
|
||||
# define FUNCNAME_CHUNK adler32_avx2_chunk
|
||||
# define IMPL_ALIGNMENT 32
|
||||
# define IMPL_SEGMENT_SIZE 32
|
||||
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
|
||||
# ifdef __AVX2__
|
||||
# define ATTRIBUTES
|
||||
# define DEFAULT_IMPL adler32_avx2
|
||||
# else
|
||||
# define ATTRIBUTES __attribute__((target("avx2")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_AVX2 1
|
||||
# endif
|
||||
# include <immintrin.h>
|
||||
static forceinline ATTRIBUTES void
|
||||
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
|
||||
{
|
||||
const __m256i zeroes = _mm256_setzero_si256();
|
||||
const __v32qu multipliers = (__v32qu){
|
||||
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
|
||||
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
|
||||
};
|
||||
const __v16hu ones = (__v16hu)_mm256_set1_epi16(1);
|
||||
__v8su v_s1 = (__v8su)zeroes;
|
||||
__v8su v_s1_sums = (__v8su)zeroes;
|
||||
__v8su v_s2 = (__v8su)zeroes;
|
||||
|
||||
do {
|
||||
/* Load the next 32-byte segment */
|
||||
__m256i bytes = *p++;
|
||||
|
||||
/* Multiply the bytes by 32...1 (the number of times they need
|
||||
* to be added to s2) and add adjacent products */
|
||||
__v16hu sums = (__v16hu)_mm256_maddubs_epi16(
|
||||
bytes, (__m256i)multipliers);
|
||||
|
||||
/* Keep sum of all previous s1 counters, for adding to s2 later.
|
||||
* This allows delaying the multiplication by 32 to the end. */
|
||||
v_s1_sums += v_s1;
|
||||
|
||||
/* Add the sum of each group of 8 bytes to the corresponding s1
|
||||
* counter */
|
||||
v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes);
|
||||
|
||||
/* Add the sum of each group of 4 products of the bytes by
|
||||
* 32...1 to the corresponding s2 counter */
|
||||
v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
|
||||
} while (p != end);
|
||||
|
||||
/* Finish the s2 counters by adding the sum of the s1 values at the
|
||||
* beginning of each segment, multiplied by the segment size (32) */
|
||||
v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
|
||||
|
||||
/* Add the counters to the real s1 and s2 */
|
||||
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
|
||||
}
|
||||
# include "../adler32_vec_template.h"
|
||||
#endif /* AVX2 implementation */
|
||||
|
||||
/* SSE2 implementation */
|
||||
#undef DISPATCH_SSE2
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
|
||||
# define FUNCNAME adler32_sse2
|
||||
# define FUNCNAME_CHUNK adler32_sse2_chunk
|
||||
# define IMPL_ALIGNMENT 16
|
||||
# define IMPL_SEGMENT_SIZE 32
|
||||
/*
|
||||
* The 16-bit precision byte counters must not be allowed to undergo *signed*
|
||||
* overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
|
||||
* would behave incorrectly.
|
||||
*/
|
||||
# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF))
|
||||
# ifdef __SSE2__
|
||||
# define ATTRIBUTES
|
||||
# define DEFAULT_IMPL adler32_sse2
|
||||
# else
|
||||
# define ATTRIBUTES __attribute__((target("sse2")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_SSE2 1
|
||||
# endif
|
||||
# include <emmintrin.h>
|
||||
static forceinline ATTRIBUTES void
|
||||
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
|
||||
{
|
||||
const __m128i zeroes = _mm_setzero_si128();
|
||||
|
||||
/* s1 counters: 32-bit, sum of bytes */
|
||||
__v4su v_s1 = (__v4su)zeroes;
|
||||
|
||||
/* s2 counters: 32-bit, sum of s1 values */
|
||||
__v4su v_s2 = (__v4su)zeroes;
|
||||
|
||||
/*
|
||||
* Thirty-two 16-bit counters for byte sums. Each accumulates the bytes
|
||||
* that eventually need to be multiplied by a number 32...1 for addition
|
||||
* into s2.
|
||||
*/
|
||||
__v8hu v_byte_sums_a = (__v8hu)zeroes;
|
||||
__v8hu v_byte_sums_b = (__v8hu)zeroes;
|
||||
__v8hu v_byte_sums_c = (__v8hu)zeroes;
|
||||
__v8hu v_byte_sums_d = (__v8hu)zeroes;
|
||||
|
||||
do {
|
||||
/* Load the next 32 bytes */
|
||||
const __m128i bytes1 = *p++;
|
||||
const __m128i bytes2 = *p++;
|
||||
|
||||
/*
|
||||
* Accumulate the previous s1 counters into the s2 counters.
|
||||
* Logically, this really should be v_s2 += v_s1 * 32, but we
|
||||
* can do the multiplication (or left shift) later.
|
||||
*/
|
||||
v_s2 += v_s1;
|
||||
|
||||
/*
|
||||
* s1 update: use "Packed Sum of Absolute Differences" to add
|
||||
* the bytes horizontally with 8 bytes per sum. Then add the
|
||||
* sums to the s1 counters.
|
||||
*/
|
||||
v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes);
|
||||
v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes);
|
||||
|
||||
/*
|
||||
* Also accumulate the bytes into 32 separate counters that have
|
||||
* 16-bit precision.
|
||||
*/
|
||||
v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes);
|
||||
v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes);
|
||||
v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes);
|
||||
v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes);
|
||||
|
||||
} while (p != end);
|
||||
|
||||
/* Finish calculating the s2 counters */
|
||||
v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5);
|
||||
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a,
|
||||
(__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 });
|
||||
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b,
|
||||
(__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 });
|
||||
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c,
|
||||
(__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 });
|
||||
v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d,
|
||||
(__m128i)(__v8hu){ 8, 7, 6, 5, 4, 3, 2, 1 });
|
||||
|
||||
/* Add the counters to the real s1 and s2 */
|
||||
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
|
||||
}
|
||||
# include "../adler32_vec_template.h"
|
||||
#endif /* SSE2 implementation */
|
||||
|
||||
#ifdef DISPATCH
|
||||
static inline adler32_func_t
|
||||
arch_select_adler32_func(void)
|
||||
{
|
||||
u32 features = get_cpu_features();
|
||||
|
||||
#ifdef DISPATCH_AVX512BW
|
||||
if (features & X86_CPU_FEATURE_AVX512BW)
|
||||
return adler32_avx512bw;
|
||||
#endif
|
||||
#ifdef DISPATCH_AVX2
|
||||
if (features & X86_CPU_FEATURE_AVX2)
|
||||
return adler32_avx2;
|
||||
#endif
|
||||
#ifdef DISPATCH_SSE2
|
||||
if (features & X86_CPU_FEATURE_SSE2)
|
||||
return adler32_sse2;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
#endif /* DISPATCH */
|
||||
|
||||
#endif /* LIB_X86_ADLER32_IMPL_H */
|
||||
152
lib/gdeflate/libdeflate/lib/x86/cpu_features.c
Normal file
152
lib/gdeflate/libdeflate/lib/x86/cpu_features.c
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
/*
|
||||
* x86/cpu_features.c - feature detection for x86 processors
|
||||
*
|
||||
* Copyright 2016 Eric Biggers
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "../cpu_features_common.h" /* must be included first */
|
||||
#include "cpu_features.h"
|
||||
|
||||
#if X86_CPU_FEATURES_ENABLED
|
||||
|
||||
volatile u32 _cpu_features = 0;
|
||||
|
||||
/* With old GCC versions we have to manually save and restore the x86_32 PIC
|
||||
* register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
|
||||
#if defined(__i386__) && defined(__PIC__)
|
||||
# define EBX_CONSTRAINT "=&r"
|
||||
#else
|
||||
# define EBX_CONSTRAINT "=b"
|
||||
#endif
|
||||
|
||||
/* Execute the CPUID instruction. */
|
||||
static inline void
|
||||
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
|
||||
{
|
||||
__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
|
||||
"cpuid \n"
|
||||
".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
|
||||
: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
|
||||
: "a" (leaf), "c" (subleaf));
|
||||
}
|
||||
|
||||
/* Read an extended control register. */
|
||||
static inline u64
|
||||
read_xcr(u32 index)
|
||||
{
|
||||
u32 edx, eax;
|
||||
|
||||
/* Execute the "xgetbv" instruction. Old versions of binutils do not
|
||||
* recognize this instruction, so list the raw bytes instead. */
|
||||
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
|
||||
|
||||
return ((u64)edx << 32) | eax;
|
||||
}
|
||||
|
||||
#undef BIT
|
||||
#define BIT(nr) (1UL << (nr))
|
||||
|
||||
#define XCR0_BIT_SSE BIT(1)
|
||||
#define XCR0_BIT_AVX BIT(2)
|
||||
#define XCR0_BIT_OPMASK BIT(5)
|
||||
#define XCR0_BIT_ZMM_HI256 BIT(6)
|
||||
#define XCR0_BIT_HI16_ZMM BIT(7)
|
||||
|
||||
#define IS_SET(reg, nr) ((reg) & BIT(nr))
|
||||
#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask))
|
||||
|
||||
static const struct cpu_feature x86_cpu_feature_table[] = {
|
||||
{X86_CPU_FEATURE_SSE2, "sse2"},
|
||||
{X86_CPU_FEATURE_PCLMUL, "pclmul"},
|
||||
{X86_CPU_FEATURE_AVX, "avx"},
|
||||
{X86_CPU_FEATURE_AVX2, "avx2"},
|
||||
{X86_CPU_FEATURE_BMI2, "bmi2"},
|
||||
{X86_CPU_FEATURE_AVX512BW, "avx512bw"},
|
||||
};
|
||||
|
||||
/* Initialize _cpu_features with bits for interesting processor features. */
|
||||
void setup_cpu_features(void)
|
||||
{
|
||||
u32 features = 0;
|
||||
u32 dummy1, dummy2, dummy3, dummy4;
|
||||
u32 max_function;
|
||||
u32 features_1, features_2, features_3, features_4;
|
||||
bool os_avx_support = false;
|
||||
bool os_avx512_support = false;
|
||||
|
||||
/* Get maximum supported function */
|
||||
cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
|
||||
if (max_function < 1)
|
||||
goto out;
|
||||
|
||||
/* Standard feature flags */
|
||||
cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
|
||||
|
||||
if (IS_SET(features_1, 26))
|
||||
features |= X86_CPU_FEATURE_SSE2;
|
||||
|
||||
if (IS_SET(features_2, 1))
|
||||
features |= X86_CPU_FEATURE_PCLMUL;
|
||||
|
||||
if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
|
||||
u64 xcr0 = read_xcr(0);
|
||||
|
||||
os_avx_support = IS_ALL_SET(xcr0,
|
||||
XCR0_BIT_SSE |
|
||||
XCR0_BIT_AVX);
|
||||
|
||||
os_avx512_support = IS_ALL_SET(xcr0,
|
||||
XCR0_BIT_SSE |
|
||||
XCR0_BIT_AVX |
|
||||
XCR0_BIT_OPMASK |
|
||||
XCR0_BIT_ZMM_HI256 |
|
||||
XCR0_BIT_HI16_ZMM);
|
||||
}
|
||||
|
||||
if (os_avx_support && IS_SET(features_2, 28))
|
||||
features |= X86_CPU_FEATURE_AVX;
|
||||
|
||||
if (max_function < 7)
|
||||
goto out;
|
||||
|
||||
/* Extended feature flags */
|
||||
cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
|
||||
|
||||
if (os_avx_support && IS_SET(features_3, 5))
|
||||
features |= X86_CPU_FEATURE_AVX2;
|
||||
|
||||
if (IS_SET(features_3, 8))
|
||||
features |= X86_CPU_FEATURE_BMI2;
|
||||
|
||||
if (os_avx512_support && IS_SET(features_3, 30))
|
||||
features |= X86_CPU_FEATURE_AVX512BW;
|
||||
|
||||
out:
|
||||
disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
|
||||
ARRAY_LEN(x86_cpu_feature_table));
|
||||
|
||||
_cpu_features = features | X86_CPU_FEATURES_KNOWN;
|
||||
}
|
||||
|
||||
#endif /* X86_CPU_FEATURES_ENABLED */
|
||||
41
lib/gdeflate/libdeflate/lib/x86/cpu_features.h
Normal file
41
lib/gdeflate/libdeflate/lib/x86/cpu_features.h
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* x86/cpu_features.h - feature detection for x86 processors
|
||||
*/
|
||||
|
||||
#ifndef LIB_X86_CPU_FEATURES_H
|
||||
#define LIB_X86_CPU_FEATURES_H
|
||||
|
||||
#include "../lib_common.h"
|
||||
|
||||
#if (defined(__i386__) || defined(__x86_64__)) && \
|
||||
COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
|
||||
# define X86_CPU_FEATURES_ENABLED 1
|
||||
#else
|
||||
# define X86_CPU_FEATURES_ENABLED 0
|
||||
#endif
|
||||
|
||||
#if X86_CPU_FEATURES_ENABLED
|
||||
|
||||
#define X86_CPU_FEATURE_SSE2 0x00000001
|
||||
#define X86_CPU_FEATURE_PCLMUL 0x00000002
|
||||
#define X86_CPU_FEATURE_AVX 0x00000004
|
||||
#define X86_CPU_FEATURE_AVX2 0x00000008
|
||||
#define X86_CPU_FEATURE_BMI2 0x00000010
|
||||
#define X86_CPU_FEATURE_AVX512BW 0x00000020
|
||||
|
||||
#define X86_CPU_FEATURES_KNOWN 0x80000000
|
||||
|
||||
extern volatile u32 _cpu_features;
|
||||
|
||||
void setup_cpu_features(void);
|
||||
|
||||
static inline u32 get_cpu_features(void)
|
||||
{
|
||||
if (_cpu_features == 0)
|
||||
setup_cpu_features();
|
||||
return _cpu_features;
|
||||
}
|
||||
|
||||
#endif /* X86_CPU_FEATURES_ENABLED */
|
||||
|
||||
#endif /* LIB_X86_CPU_FEATURES_H */
|
||||
92
lib/gdeflate/libdeflate/lib/x86/crc32_impl.h
Normal file
92
lib/gdeflate/libdeflate/lib/x86/crc32_impl.h
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
|
||||
*
|
||||
* Copyright 2016 Eric Biggers
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef LIB_X86_CRC32_IMPL_H
|
||||
#define LIB_X86_CRC32_IMPL_H
|
||||
|
||||
#include "cpu_features.h"
|
||||
|
||||
/*
|
||||
* Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
|
||||
* function doesn't use any AVX intrinsics specifically, it can benefit a lot
|
||||
* from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
|
||||
* MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
|
||||
* in the newer three-operand form rather than the older two-operand form.
|
||||
*
|
||||
* Note: this is only needed if __AVX__ is *not* defined, since otherwise the
|
||||
* "regular" PCLMUL implementation would already be AVX enabled.
|
||||
*/
|
||||
#undef DISPATCH_PCLMUL_AVX
|
||||
#if !defined(DEFAULT_IMPL) && !defined(__AVX__) && \
|
||||
X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET && \
|
||||
(defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
|
||||
# define FUNCNAME crc32_pclmul_avx
|
||||
# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
|
||||
# define ATTRIBUTES __attribute__((target("pclmul,avx")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_PCLMUL_AVX 1
|
||||
# include "crc32_pclmul_template.h"
|
||||
#endif
|
||||
|
||||
/* PCLMUL implementation */
|
||||
#undef DISPATCH_PCLMUL
|
||||
#if !defined(DEFAULT_IMPL) && \
|
||||
(defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
|
||||
# define FUNCNAME crc32_pclmul
|
||||
# define FUNCNAME_ALIGNED crc32_pclmul_aligned
|
||||
# ifdef __PCLMUL__
|
||||
# define ATTRIBUTES
|
||||
# define DEFAULT_IMPL crc32_pclmul
|
||||
# else
|
||||
# define ATTRIBUTES __attribute__((target("pclmul")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_PCLMUL 1
|
||||
# endif
|
||||
# include "crc32_pclmul_template.h"
|
||||
#endif
|
||||
|
||||
#ifdef DISPATCH
|
||||
static inline crc32_func_t
|
||||
arch_select_crc32_func(void)
|
||||
{
|
||||
u32 features = get_cpu_features();
|
||||
|
||||
#ifdef DISPATCH_PCLMUL_AVX
|
||||
if ((features & X86_CPU_FEATURE_PCLMUL) &&
|
||||
(features & X86_CPU_FEATURE_AVX))
|
||||
return crc32_pclmul_avx;
|
||||
#endif
|
||||
#ifdef DISPATCH_PCLMUL
|
||||
if (features & X86_CPU_FEATURE_PCLMUL)
|
||||
return crc32_pclmul;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
#endif /* DISPATCH */
|
||||
|
||||
#endif /* LIB_X86_CRC32_IMPL_H */
|
||||
262
lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h
Normal file
262
lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
/*
|
||||
* x86/crc32_pclmul_template.h
|
||||
*
|
||||
* Copyright 2016 Eric Biggers
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
/*
|
||||
* CRC-32 folding with PCLMULQDQ.
|
||||
*
|
||||
* The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
|
||||
* producing an abbreviated message which is congruent the original message
|
||||
* modulo the generator polynomial G(x).
|
||||
*
|
||||
* Folding each 512 bits is implemented as eight 64-bit folds, each of which
|
||||
* uses one carryless multiplication instruction. It's expected that CPUs may
|
||||
* be able to execute some of these multiplications in parallel.
|
||||
*
|
||||
* Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
|
||||
* be 95 bits from a constant distance D later in the message. The relevant
|
||||
* portion of the message can be written as:
|
||||
*
|
||||
* M(x) = A(x)*x^D + B(x)
|
||||
*
|
||||
* ... where + and * represent addition and multiplication, respectively, of
|
||||
* polynomials over GF(2). Note that when implemented on a computer, these
|
||||
* operations are equivalent to XOR and carryless multiplication, respectively.
|
||||
*
|
||||
* For the purpose of CRC calculation, only the remainder modulo the generator
|
||||
* polynomial G(x) matters:
|
||||
*
|
||||
* M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
|
||||
*
|
||||
* Since the modulo operation can be applied anywhere in a sequence of additions
|
||||
* and multiplications without affecting the result, this is equivalent to:
|
||||
*
|
||||
* M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
|
||||
*
|
||||
* For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
|
||||
* a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
|
||||
* multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
|
||||
* product. Then, adding (XOR-ing) the product to B(x) produces a polynomial
|
||||
* with the same length as B(x) but with the same remainder as 'A(x)*x^D +
|
||||
* B(x)'. This is the basic fold operation with 64 bits.
|
||||
*
|
||||
* Note that the carryless multiplication instruction PCLMULQDQ actually takes
|
||||
* two 64-bit inputs and produces a 127-bit product in the low-order bits of a
|
||||
* 128-bit XMM register. This works fine, but care must be taken to account for
|
||||
* "bit endianness". With the CRC version implemented here, bits are always
|
||||
* ordered such that the lowest-order bit represents the coefficient of highest
|
||||
* power of x and the highest-order bit represents the coefficient of the lowest
|
||||
* power of x. This is backwards from the more intuitive order. Still,
|
||||
* carryless multiplication works essentially the same either way. It just must
|
||||
* be accounted for that when we XOR the 95-bit product in the low-order 95 bits
|
||||
* of a 128-bit XMM register into 128-bits of later data held in another XMM
|
||||
* register, we'll really be XOR-ing the product into the mathematically higher
|
||||
* degree end of those later bits, not the lower degree end as may be expected.
|
||||
*
|
||||
* So given that caveat and the fact that we process 512 bits per iteration, the
|
||||
* 'D' values we need for the two 64-bit halves of each 128 bits of data are:
|
||||
*
|
||||
* D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
|
||||
* i.e. the lower order bits in the XMM register
|
||||
*
|
||||
* D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
|
||||
* i.e. the higher order bits in the XMM register
|
||||
*
|
||||
* The required 'x^D mod G(x)' values were precomputed.
|
||||
*
|
||||
* When <= 512 bits remain in the message, we finish up by folding across
|
||||
* smaller distances. This works similarly; the distance D is just different,
|
||||
* so different constant multipliers must be used. Finally, once the remaining
|
||||
* message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
|
||||
* (explained later).
|
||||
*
|
||||
* For more information see the original paper from Intel:
|
||||
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
||||
* December 2009
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
*/
|
||||
static u32 ATTRIBUTES
|
||||
FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
|
||||
{
|
||||
/* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
|
||||
const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
|
||||
const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
|
||||
const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
|
||||
const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
|
||||
const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
|
||||
const __v2di barrett_reduction_constants =
|
||||
(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
|
||||
|
||||
const __m128i * const end = p + nr_segs;
|
||||
const __m128i * const end512 = p + (nr_segs & ~3);
|
||||
__m128i x0, x1, x2, x3;
|
||||
|
||||
/*
|
||||
* Account for the current 'remainder', i.e. the CRC of the part of the
|
||||
* message already processed. Explanation: rewrite the message
|
||||
* polynomial M(x) in terms of the first part A(x), the second part
|
||||
* B(x), and the length of the second part in bits |B(x)| >= 32:
|
||||
*
|
||||
* M(x) = A(x)*x^|B(x)| + B(x)
|
||||
*
|
||||
* Then the CRC of M(x) is:
|
||||
*
|
||||
* CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
|
||||
* = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
|
||||
* = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
|
||||
*
|
||||
* Note: all arithmetic is modulo G(x), the generator polynomial; that's
|
||||
* why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
|
||||
*
|
||||
* So the CRC of the full message is the CRC of the second part of the
|
||||
* message where the first 32 bits of the second part of the message
|
||||
* have been XOR'ed with the CRC of the first part of the message.
|
||||
*/
|
||||
x0 = *p++;
|
||||
x0 ^= (__m128i)(__v4si){ remainder };
|
||||
|
||||
if (p > end512) /* only 128, 256, or 384 bits of input? */
|
||||
goto _128_bits_at_a_time;
|
||||
x1 = *p++;
|
||||
x2 = *p++;
|
||||
x3 = *p++;
|
||||
|
||||
/* Fold 512 bits at a time */
|
||||
for (; p != end512; p += 4) {
|
||||
__m128i y0, y1, y2, y3;
|
||||
|
||||
y0 = p[0];
|
||||
y1 = p[1];
|
||||
y2 = p[2];
|
||||
y3 = p[3];
|
||||
|
||||
/*
|
||||
* Note: the immediate constant for PCLMULQDQ specifies which
|
||||
* 64-bit halves of the 128-bit vectors to multiply:
|
||||
*
|
||||
* 0x00 means low halves (higher degree polynomial terms for us)
|
||||
* 0x11 means high halves (lower degree polynomial terms for us)
|
||||
*/
|
||||
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
|
||||
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
|
||||
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
|
||||
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
|
||||
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
|
||||
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
|
||||
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
|
||||
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
|
||||
|
||||
x0 = y0;
|
||||
x1 = y1;
|
||||
x2 = y2;
|
||||
x3 = y3;
|
||||
}
|
||||
|
||||
/* Fold 512 bits => 128 bits */
|
||||
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
|
||||
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
|
||||
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
|
||||
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
|
||||
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
|
||||
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
|
||||
x0 = x3;
|
||||
|
||||
_128_bits_at_a_time:
|
||||
while (p != end) {
|
||||
/* Fold 128 bits into next 128 bits */
|
||||
x1 = *p++;
|
||||
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
|
||||
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
|
||||
x0 = x1;
|
||||
}
|
||||
|
||||
/* Now there are just 128 bits left, stored in 'x0'. */
|
||||
|
||||
/*
|
||||
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
|
||||
* which is equivalent to multiplying by x^32. This is needed because
|
||||
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
|
||||
*/
|
||||
x0 = _mm_srli_si128(x0, 8) ^
|
||||
_mm_clmulepi64_si128(x0, multipliers_1, 0x10);
|
||||
|
||||
/* Fold 96 => 64 bits */
|
||||
x0 = _mm_srli_si128(x0, 4) ^
|
||||
_mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
|
||||
|
||||
/*
|
||||
* Finally, reduce 64 => 32 bits using Barrett reduction.
|
||||
*
|
||||
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
|
||||
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
|
||||
*
|
||||
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
|
||||
* = (A(x)*x^32) mod G(x) + B(x)
|
||||
*
|
||||
* Then, by the Division Algorithm there exists a unique q(x) such that:
|
||||
*
|
||||
* A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
|
||||
*
|
||||
* Since the left-hand side is of maximum degree 31, the right-hand side
|
||||
* must be too. This implies that we can apply 'mod x^32' to the
|
||||
* right-hand side without changing its value:
|
||||
*
|
||||
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
|
||||
*
|
||||
* Note that '+' is equivalent to '-' in polynomials over GF(2).
|
||||
*
|
||||
* We also know that:
|
||||
*
|
||||
* / A(x)*x^32 \
|
||||
* q(x) = floor ( --------- )
|
||||
* \ G(x) /
|
||||
*
|
||||
* To compute this efficiently, we can multiply the top and bottom by
|
||||
* x^32 and move the division by G(x) to the top:
|
||||
*
|
||||
* / A(x) * floor(x^64 / G(x)) \
|
||||
* q(x) = floor ( ------------------------- )
|
||||
* \ x^32 /
|
||||
*
|
||||
* Note that floor(x^64 / G(x)) is a constant.
|
||||
*
|
||||
* So finally we have:
|
||||
*
|
||||
* / A(x) * floor(x^64 / G(x)) \
|
||||
* R(x) = B(x) + G(x)*floor ( ------------------------- )
|
||||
* \ x^32 /
|
||||
*/
|
||||
x1 = x0;
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
|
||||
return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
|
||||
}
|
||||
|
||||
#define IMPL_ALIGNMENT 16
|
||||
#define IMPL_SEGMENT_SIZE 16
|
||||
#include "../crc32_vec_template.h"
|
||||
35
lib/gdeflate/libdeflate/lib/x86/decompress_impl.h
Normal file
35
lib/gdeflate/libdeflate/lib/x86/decompress_impl.h
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
#ifndef LIB_X86_DECOMPRESS_IMPL_H
|
||||
#define LIB_X86_DECOMPRESS_IMPL_H
|
||||
|
||||
#include "cpu_features.h"
|
||||
|
||||
/* Include the BMI2-optimized version? */
|
||||
#undef DISPATCH_BMI2
|
||||
#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
|
||||
COMPILER_SUPPORTS_BMI2_TARGET
|
||||
# define FUNCNAME deflate_decompress_bmi2
|
||||
# define ATTRIBUTES __attribute__((target("bmi2")))
|
||||
# define DISPATCH 1
|
||||
# define DISPATCH_BMI2 1
|
||||
#ifdef GDEFLATE
|
||||
# include "../gdeflate_decompress_template.h"
|
||||
#else
|
||||
# include "../decompress_template.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef DISPATCH
|
||||
static inline decompress_func_t
|
||||
arch_select_decompress_func(void)
|
||||
{
|
||||
u32 features = get_cpu_features();
|
||||
|
||||
#ifdef DISPATCH_BMI2
|
||||
if (features & X86_CPU_FEATURE_BMI2)
|
||||
return deflate_decompress_bmi2;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
#endif /* DISPATCH */
|
||||
|
||||
#endif /* LIB_X86_DECOMPRESS_IMPL_H */
|
||||
122
lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h
Normal file
122
lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
/*
|
||||
* x86/matchfinder_impl.h - x86 implementations of matchfinder functions
|
||||
*
|
||||
* Copyright 2016 Eric Biggers
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef LIB_X86_MATCHFINDER_IMPL_H
|
||||
#define LIB_X86_MATCHFINDER_IMPL_H
|
||||
|
||||
#ifdef __AVX2__
|
||||
# include <immintrin.h>
|
||||
static forceinline void
|
||||
matchfinder_init_avx2(mf_pos_t *data, size_t size)
|
||||
{
|
||||
__m256i *p = (__m256i *)data;
|
||||
__m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
|
||||
|
||||
STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
|
||||
STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
|
||||
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
||||
|
||||
do {
|
||||
p[0] = v;
|
||||
p[1] = v;
|
||||
p[2] = v;
|
||||
p[3] = v;
|
||||
p += 4;
|
||||
size -= 4 * sizeof(*p);
|
||||
} while (size != 0);
|
||||
}
|
||||
#define matchfinder_init matchfinder_init_avx2
|
||||
|
||||
static forceinline void
|
||||
matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
|
||||
{
|
||||
__m256i *p = (__m256i *)data;
|
||||
__m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
|
||||
|
||||
STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
|
||||
STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
|
||||
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
||||
|
||||
do {
|
||||
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
|
||||
p[0] = _mm256_adds_epi16(p[0], v);
|
||||
p[1] = _mm256_adds_epi16(p[1], v);
|
||||
p[2] = _mm256_adds_epi16(p[2], v);
|
||||
p[3] = _mm256_adds_epi16(p[3], v);
|
||||
p += 4;
|
||||
size -= 4 * sizeof(*p);
|
||||
} while (size != 0);
|
||||
}
|
||||
#define matchfinder_rebase matchfinder_rebase_avx2
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
# include <emmintrin.h>
|
||||
static forceinline void
|
||||
matchfinder_init_sse2(mf_pos_t *data, size_t size)
|
||||
{
|
||||
__m128i *p = (__m128i *)data;
|
||||
__m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL);
|
||||
|
||||
STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
|
||||
STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
|
||||
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
||||
|
||||
do {
|
||||
p[0] = v;
|
||||
p[1] = v;
|
||||
p[2] = v;
|
||||
p[3] = v;
|
||||
p += 4;
|
||||
size -= 4 * sizeof(*p);
|
||||
} while (size != 0);
|
||||
}
|
||||
#define matchfinder_init matchfinder_init_sse2
|
||||
|
||||
static forceinline void
|
||||
matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
|
||||
{
|
||||
__m128i *p = (__m128i *)data;
|
||||
__m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
|
||||
|
||||
STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
|
||||
STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
|
||||
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
|
||||
|
||||
do {
|
||||
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
|
||||
p[0] = _mm_adds_epi16(p[0], v);
|
||||
p[1] = _mm_adds_epi16(p[1], v);
|
||||
p[2] = _mm_adds_epi16(p[2], v);
|
||||
p[3] = _mm_adds_epi16(p[3], v);
|
||||
p += 4;
|
||||
size -= 4 * sizeof(*p);
|
||||
} while (size != 0);
|
||||
}
|
||||
#define matchfinder_rebase matchfinder_rebase_sse2
|
||||
#endif /* __SSE2__ */
|
||||
|
||||
#endif /* LIB_X86_MATCHFINDER_IMPL_H */
|
||||
Loading…
Add table
Add a link
Reference in a new issue