asset compression

2026-05-11 18:37:30 +02:00 · 2026-05-11 18:37:30 +02:00 · 30a283c1b3
commit 30a283c1b3
parent b9b9ecb84c
57 changed files with 13237 additions and 8 deletions
--- a/lib/gdeflate/libdeflate/lib/x86/adler32_impl.h
+++ b/lib/gdeflate/libdeflate/lib/x86/adler32_impl.h
@ -0,0 +1,337 @@
+/*
+ * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_ADLER32_IMPL_H
+#define LIB_X86_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * The following macros horizontally sum the s1 counters and add them to the
+ * real s1, and likewise for s2.  They do this via a series of reductions, each
+ * of which halves the vector length, until just one counter remains.
+ *
+ * The s1 reductions don't depend on the s2 reductions and vice versa, so for
+ * efficiency they are interleaved.  Also, every other s1 counter is 0 due to
+ * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
+ * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
+ */
+
+#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2)		    \
+{									    \
+	__v4su s1_last = (v_s1), s2_last = (v_s2);			    \
+									    \
+	/* 128 => 32 bits */						    \
+	s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x31);	    \
+	s1_last += (__v4su)_mm_shuffle_epi32((__m128i)s1_last, 0x02);	    \
+	s2_last += (__v4su)_mm_shuffle_epi32((__m128i)s2_last, 0x02);	    \
+									    \
+	*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last);		    \
+	*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last);		    \
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2)		    \
+{									    \
+	__v4su s1_128bit, s2_128bit;					    \
+									    \
+	/* 256 => 128 bits */						    \
+	s1_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 0) +  \
+		    (__v4su)_mm256_extracti128_si256((__m256i)(v_s1), 1);   \
+	s2_128bit = (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 0) +  \
+		    (__v4su)_mm256_extracti128_si256((__m256i)(v_s2), 1);   \
+									    \
+	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit);	    \
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)		    \
+{									    \
+	__v8su s1_256bit, s2_256bit;					    \
+									    \
+	/* 512 => 256 bits */						    \
+	s1_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
+		    (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1);  \
+	s2_256bit = (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
+		    (__v8su)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1);  \
+									    \
+	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit);	    \
+}
+
+/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
+#undef DISPATCH_AVX512BW
+#if !defined(DEFAULT_IMPL) &&						\
+    /*
+     * clang before v3.9 is missing some AVX-512BW intrinsics including
+     * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512.  So just make using
+     * AVX-512BW, even when __AVX512BW__ is defined, conditional on
+     * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
+     */									\
+    COMPILER_SUPPORTS_AVX512BW_TARGET &&				\
+    (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED &&		\
+			       COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
+#  define FUNCNAME		adler32_avx512bw
+#  define FUNCNAME_CHUNK	adler32_avx512bw_chunk
+#  define IMPL_ALIGNMENT	64
+#  define IMPL_SEGMENT_SIZE	64
+#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
+#  ifdef __AVX512BW__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL	adler32_avx512bw
+#  else
+#    define ATTRIBUTES		__attribute__((target("avx512bw")))
+#    define DISPATCH		1
+#    define DISPATCH_AVX512BW	1
+#  endif
+#  include <immintrin.h>
+static forceinline ATTRIBUTES void
+adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
+		       u32 *s1, u32 *s2)
+{
+	const __m512i zeroes = _mm512_setzero_si512();
+	const __v64qi multipliers = (__v64qi){
+		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+	};
+	const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
+	__v16si v_s1 = (__v16si)zeroes;
+	__v16si v_s1_sums = (__v16si)zeroes;
+	__v16si v_s2 = (__v16si)zeroes;
+
+	do {
+		/* Load the next 64-byte segment */
+		__m512i bytes = *p++;
+
+		/* Multiply the bytes by 64...1 (the number of times they need
+		 * to be added to s2) and add adjacent products */
+		__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
+						bytes, (__m512i)multipliers);
+
+		/* Keep sum of all previous s1 counters, for adding to s2 later.
+		 * This allows delaying the multiplication by 64 to the end. */
+		v_s1_sums += v_s1;
+
+		/* Add the sum of each group of 8 bytes to the corresponding s1
+		 * counter */
+		v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
+
+		/* Add the sum of each group of 4 products of the bytes by
+		 * 64...1 to the corresponding s2 counter */
+		v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
+						   (__m512i)ones);
+	} while (p != end);
+
+	/* Finish the s2 counters by adding the sum of the s1 values at the
+	 * beginning of each segment, multiplied by the segment size (64) */
+	v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
+
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* AVX-512BW implementation */
+
+/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
+#undef DISPATCH_AVX2
+#if !defined(DEFAULT_IMPL) &&	\
+	(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED &&	\
+			       COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
+#  define FUNCNAME		adler32_avx2
+#  define FUNCNAME_CHUNK	adler32_avx2_chunk
+#  define IMPL_ALIGNMENT	32
+#  define IMPL_SEGMENT_SIZE	32
+#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
+#  ifdef __AVX2__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL	adler32_avx2
+#  else
+#    define ATTRIBUTES		__attribute__((target("avx2")))
+#    define DISPATCH		1
+#    define DISPATCH_AVX2	1
+#  endif
+#  include <immintrin.h>
+static forceinline ATTRIBUTES void
+adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
+{
+	const __m256i zeroes = _mm256_setzero_si256();
+	const __v32qu multipliers = (__v32qu){
+		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
+	};
+	const __v16hu ones = (__v16hu)_mm256_set1_epi16(1);
+	__v8su v_s1 = (__v8su)zeroes;
+	__v8su v_s1_sums = (__v8su)zeroes;
+	__v8su v_s2 = (__v8su)zeroes;
+
+	do {
+		/* Load the next 32-byte segment */
+		__m256i bytes = *p++;
+
+		/* Multiply the bytes by 32...1 (the number of times they need
+		 * to be added to s2) and add adjacent products */
+		__v16hu sums = (__v16hu)_mm256_maddubs_epi16(
+						bytes, (__m256i)multipliers);
+
+		/* Keep sum of all previous s1 counters, for adding to s2 later.
+		 * This allows delaying the multiplication by 32 to the end. */
+		v_s1_sums += v_s1;
+
+		/* Add the sum of each group of 8 bytes to the corresponding s1
+		 * counter */
+		v_s1 += (__v8su)_mm256_sad_epu8(bytes, zeroes);
+
+		/* Add the sum of each group of 4 products of the bytes by
+		 * 32...1 to the corresponding s2 counter */
+		v_s2 += (__v8su)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
+	} while (p != end);
+
+	/* Finish the s2 counters by adding the sum of the s1 values at the
+	 * beginning of each segment, multiplied by the segment size (32) */
+	v_s2 += (__v8su)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
+
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* AVX2 implementation */
+
+/* SSE2 implementation */
+#undef DISPATCH_SSE2
+#if !defined(DEFAULT_IMPL) &&	\
+	(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED &&	\
+			       COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
+#  define FUNCNAME		adler32_sse2
+#  define FUNCNAME_CHUNK	adler32_sse2_chunk
+#  define IMPL_ALIGNMENT	16
+#  define IMPL_SEGMENT_SIZE	32
+/*
+ * The 16-bit precision byte counters must not be allowed to undergo *signed*
+ * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
+ * would behave incorrectly.
+ */
+#  define IMPL_MAX_CHUNK_SIZE	(32 * (0x7FFF / 0xFF))
+#  ifdef __SSE2__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL	adler32_sse2
+#  else
+#    define ATTRIBUTES		__attribute__((target("sse2")))
+#    define DISPATCH		1
+#    define DISPATCH_SSE2	1
+#  endif
+#  include <emmintrin.h>
+static forceinline ATTRIBUTES void
+adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
+{
+	const __m128i zeroes = _mm_setzero_si128();
+
+	/* s1 counters: 32-bit, sum of bytes */
+	__v4su v_s1 = (__v4su)zeroes;
+
+	/* s2 counters: 32-bit, sum of s1 values */
+	__v4su v_s2 = (__v4su)zeroes;
+
+	/*
+	 * Thirty-two 16-bit counters for byte sums.  Each accumulates the bytes
+	 * that eventually need to be multiplied by a number 32...1 for addition
+	 * into s2.
+	 */
+	__v8hu v_byte_sums_a = (__v8hu)zeroes;
+	__v8hu v_byte_sums_b = (__v8hu)zeroes;
+	__v8hu v_byte_sums_c = (__v8hu)zeroes;
+	__v8hu v_byte_sums_d = (__v8hu)zeroes;
+
+	do {
+		/* Load the next 32 bytes */
+		const __m128i bytes1 = *p++;
+		const __m128i bytes2 = *p++;
+
+		/*
+		 * Accumulate the previous s1 counters into the s2 counters.
+		 * Logically, this really should be v_s2 += v_s1 * 32, but we
+		 * can do the multiplication (or left shift) later.
+		 */
+		v_s2 += v_s1;
+
+		/*
+		 * s1 update: use "Packed Sum of Absolute Differences" to add
+		 * the bytes horizontally with 8 bytes per sum.  Then add the
+		 * sums to the s1 counters.
+		 */
+		v_s1 += (__v4su)_mm_sad_epu8(bytes1, zeroes);
+		v_s1 += (__v4su)_mm_sad_epu8(bytes2, zeroes);
+
+		/*
+		 * Also accumulate the bytes into 32 separate counters that have
+		 * 16-bit precision.
+		 */
+		v_byte_sums_a += (__v8hu)_mm_unpacklo_epi8(bytes1, zeroes);
+		v_byte_sums_b += (__v8hu)_mm_unpackhi_epi8(bytes1, zeroes);
+		v_byte_sums_c += (__v8hu)_mm_unpacklo_epi8(bytes2, zeroes);
+		v_byte_sums_d += (__v8hu)_mm_unpackhi_epi8(bytes2, zeroes);
+
+	} while (p != end);
+
+	/* Finish calculating the s2 counters */
+	v_s2 = (__v4su)_mm_slli_epi32((__m128i)v_s2, 5);
+	v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_a,
+				       (__m128i)(__v8hu){ 32, 31, 30, 29, 28, 27, 26, 25 });
+	v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_b,
+				       (__m128i)(__v8hu){ 24, 23, 22, 21, 20, 19, 18, 17 });
+	v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_c,
+				       (__m128i)(__v8hu){ 16, 15, 14, 13, 12, 11, 10, 9 });
+	v_s2 += (__v4su)_mm_madd_epi16((__m128i)v_byte_sums_d,
+				       (__m128i)(__v8hu){ 8,  7,  6,  5,  4,  3,  2,  1 });
+
+	/* Add the counters to the real s1 and s2 */
+	ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
+}
+#  include "../adler32_vec_template.h"
+#endif /* SSE2 implementation */
+
+#ifdef DISPATCH
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+	u32 features = get_cpu_features();
+
+#ifdef DISPATCH_AVX512BW
+	if (features & X86_CPU_FEATURE_AVX512BW)
+		return adler32_avx512bw;
+#endif
+#ifdef DISPATCH_AVX2
+	if (features & X86_CPU_FEATURE_AVX2)
+		return adler32_avx2;
+#endif
+#ifdef DISPATCH_SSE2
+	if (features & X86_CPU_FEATURE_SSE2)
+		return adler32_sse2;
+#endif
+	return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_ADLER32_IMPL_H */
--- a/lib/gdeflate/libdeflate/lib/x86/cpu_features.c
+++ b/lib/gdeflate/libdeflate/lib/x86/cpu_features.c
@ -0,0 +1,152 @@
+/*
+ * x86/cpu_features.c - feature detection for x86 processors
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "../cpu_features_common.h" /* must be included first */
+#include "cpu_features.h"
+
+#if X86_CPU_FEATURES_ENABLED
+
+volatile u32 _cpu_features = 0;
+
+/* With old GCC versions we have to manually save and restore the x86_32 PIC
+ * register (ebx).  See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602  */
+#if defined(__i386__) && defined(__PIC__)
+#  define EBX_CONSTRAINT "=&r"
+#else
+#  define EBX_CONSTRAINT "=b"
+#endif
+
+/* Execute the CPUID instruction.  */
+static inline void
+cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+	__asm__(".ifnc %%ebx, %1; mov  %%ebx, %1; .endif\n"
+		"cpuid                                  \n"
+		".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
+		: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+		: "a" (leaf), "c" (subleaf));
+}
+
+/* Read an extended control register.  */
+static inline u64
+read_xcr(u32 index)
+{
+	u32 edx, eax;
+
+	/* Execute the "xgetbv" instruction.  Old versions of binutils do not
+	 * recognize this instruction, so list the raw bytes instead.  */
+	__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
+
+	return ((u64)edx << 32) | eax;
+}
+
+#undef BIT
+#define BIT(nr)			(1UL << (nr))
+
+#define XCR0_BIT_SSE		BIT(1)
+#define XCR0_BIT_AVX		BIT(2)
+#define XCR0_BIT_OPMASK		BIT(5)
+#define XCR0_BIT_ZMM_HI256	BIT(6)
+#define XCR0_BIT_HI16_ZMM	BIT(7)
+
+#define IS_SET(reg, nr)		((reg) & BIT(nr))
+#define IS_ALL_SET(reg, mask)	(((reg) & (mask)) == (mask))
+
+static const struct cpu_feature x86_cpu_feature_table[] = {
+	{X86_CPU_FEATURE_SSE2,		"sse2"},
+	{X86_CPU_FEATURE_PCLMUL,	"pclmul"},
+	{X86_CPU_FEATURE_AVX,		"avx"},
+	{X86_CPU_FEATURE_AVX2,		"avx2"},
+	{X86_CPU_FEATURE_BMI2,		"bmi2"},
+	{X86_CPU_FEATURE_AVX512BW,	"avx512bw"},
+};
+
+/* Initialize _cpu_features with bits for interesting processor features. */
+void setup_cpu_features(void)
+{
+	u32 features = 0;
+	u32 dummy1, dummy2, dummy3, dummy4;
+	u32 max_function;
+	u32 features_1, features_2, features_3, features_4;
+	bool os_avx_support = false;
+	bool os_avx512_support = false;
+
+	/* Get maximum supported function  */
+	cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
+	if (max_function < 1)
+		goto out;
+
+	/* Standard feature flags  */
+	cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
+
+	if (IS_SET(features_1, 26))
+		features |= X86_CPU_FEATURE_SSE2;
+
+	if (IS_SET(features_2, 1))
+		features |= X86_CPU_FEATURE_PCLMUL;
+
+	if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
+		u64 xcr0 = read_xcr(0);
+
+		os_avx_support = IS_ALL_SET(xcr0,
+					    XCR0_BIT_SSE |
+					    XCR0_BIT_AVX);
+
+		os_avx512_support = IS_ALL_SET(xcr0,
+					       XCR0_BIT_SSE |
+					       XCR0_BIT_AVX |
+					       XCR0_BIT_OPMASK |
+					       XCR0_BIT_ZMM_HI256 |
+					       XCR0_BIT_HI16_ZMM);
+	}
+
+	if (os_avx_support && IS_SET(features_2, 28))
+		features |= X86_CPU_FEATURE_AVX;
+
+	if (max_function < 7)
+		goto out;
+
+	/* Extended feature flags  */
+	cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
+
+	if (os_avx_support && IS_SET(features_3, 5))
+		features |= X86_CPU_FEATURE_AVX2;
+
+	if (IS_SET(features_3, 8))
+		features |= X86_CPU_FEATURE_BMI2;
+
+	if (os_avx512_support && IS_SET(features_3, 30))
+		features |= X86_CPU_FEATURE_AVX512BW;
+
+out:
+	disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
+					 ARRAY_LEN(x86_cpu_feature_table));
+
+	_cpu_features = features | X86_CPU_FEATURES_KNOWN;
+}
+
+#endif /* X86_CPU_FEATURES_ENABLED */
--- a/lib/gdeflate/libdeflate/lib/x86/cpu_features.h
+++ b/lib/gdeflate/libdeflate/lib/x86/cpu_features.h
@ -0,0 +1,41 @@
+/*
+ * x86/cpu_features.h - feature detection for x86 processors
+ */
+
+#ifndef LIB_X86_CPU_FEATURES_H
+#define LIB_X86_CPU_FEATURES_H
+
+#include "../lib_common.h"
+
+#if (defined(__i386__) || defined(__x86_64__)) && \
+	COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
+#  define X86_CPU_FEATURES_ENABLED 1
+#else
+#  define X86_CPU_FEATURES_ENABLED 0
+#endif
+
+#if X86_CPU_FEATURES_ENABLED
+
+#define X86_CPU_FEATURE_SSE2		0x00000001
+#define X86_CPU_FEATURE_PCLMUL		0x00000002
+#define X86_CPU_FEATURE_AVX		0x00000004
+#define X86_CPU_FEATURE_AVX2		0x00000008
+#define X86_CPU_FEATURE_BMI2		0x00000010
+#define X86_CPU_FEATURE_AVX512BW	0x00000020
+
+#define X86_CPU_FEATURES_KNOWN		0x80000000
+
+extern volatile u32 _cpu_features;
+
+void setup_cpu_features(void);
+
+static inline u32 get_cpu_features(void)
+{
+	if (_cpu_features == 0)
+		setup_cpu_features();
+	return _cpu_features;
+}
+
+#endif /* X86_CPU_FEATURES_ENABLED */
+
+#endif /* LIB_X86_CPU_FEATURES_H */
--- a/lib/gdeflate/libdeflate/lib/x86/crc32_impl.h
+++ b/lib/gdeflate/libdeflate/lib/x86/crc32_impl.h
@ -0,0 +1,92 @@
+/*
+ * x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_CRC32_IMPL_H
+#define LIB_X86_CRC32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * Include the PCLMUL/AVX implementation?  Although our PCLMUL-optimized CRC-32
+ * function doesn't use any AVX intrinsics specifically, it can benefit a lot
+ * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
+ * MB/s.  I expect this is related to the PCLMULQDQ instructions being assembled
+ * in the newer three-operand form rather than the older two-operand form.
+ *
+ * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
+ * "regular" PCLMUL implementation would already be AVX enabled.
+ */
+#undef DISPATCH_PCLMUL_AVX
+#if !defined(DEFAULT_IMPL) && !defined(__AVX__) &&	\
+	X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET &&	\
+	(defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
+#  define FUNCNAME		crc32_pclmul_avx
+#  define FUNCNAME_ALIGNED	crc32_pclmul_avx_aligned
+#  define ATTRIBUTES		__attribute__((target("pclmul,avx")))
+#  define DISPATCH		1
+#  define DISPATCH_PCLMUL_AVX	1
+#  include "crc32_pclmul_template.h"
+#endif
+
+/* PCLMUL implementation */
+#undef DISPATCH_PCLMUL
+#if !defined(DEFAULT_IMPL) &&	\
+	(defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED &&	\
+				 COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
+#  define FUNCNAME		crc32_pclmul
+#  define FUNCNAME_ALIGNED	crc32_pclmul_aligned
+#  ifdef __PCLMUL__
+#    define ATTRIBUTES
+#    define DEFAULT_IMPL	crc32_pclmul
+#  else
+#    define ATTRIBUTES		__attribute__((target("pclmul")))
+#    define DISPATCH		1
+#    define DISPATCH_PCLMUL	1
+#  endif
+#  include "crc32_pclmul_template.h"
+#endif
+
+#ifdef DISPATCH
+static inline crc32_func_t
+arch_select_crc32_func(void)
+{
+	u32 features = get_cpu_features();
+
+#ifdef DISPATCH_PCLMUL_AVX
+	if ((features & X86_CPU_FEATURE_PCLMUL) &&
+	    (features & X86_CPU_FEATURE_AVX))
+		return crc32_pclmul_avx;
+#endif
+#ifdef DISPATCH_PCLMUL
+	if (features & X86_CPU_FEATURE_PCLMUL)
+		return crc32_pclmul;
+#endif
+	return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_CRC32_IMPL_H */
--- a/lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h
+++ b/lib/gdeflate/libdeflate/lib/x86/crc32_pclmul_template.h
@ -0,0 +1,262 @@
+/*
+ * x86/crc32_pclmul_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <immintrin.h>
+
+/*
+ * CRC-32 folding with PCLMULQDQ.
+ *
+ * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
+ * producing an abbreviated message which is congruent the original message
+ * modulo the generator polynomial G(x).
+ *
+ * Folding each 512 bits is implemented as eight 64-bit folds, each of which
+ * uses one carryless multiplication instruction.  It's expected that CPUs may
+ * be able to execute some of these multiplications in parallel.
+ *
+ * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
+ * be 95 bits from a constant distance D later in the message.  The relevant
+ * portion of the message can be written as:
+ *
+ *	M(x) = A(x)*x^D + B(x)
+ *
+ * ... where + and * represent addition and multiplication, respectively, of
+ * polynomials over GF(2).  Note that when implemented on a computer, these
+ * operations are equivalent to XOR and carryless multiplication, respectively.
+ *
+ * For the purpose of CRC calculation, only the remainder modulo the generator
+ * polynomial G(x) matters:
+ *
+ *	M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
+ *
+ * Since the modulo operation can be applied anywhere in a sequence of additions
+ * and multiplications without affecting the result, this is equivalent to:
+ *
+ *	M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
+ *
+ * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
+ * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
+ * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
+ * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
+ * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
+ * B(x)'.  This is the basic fold operation with 64 bits.
+ *
+ * Note that the carryless multiplication instruction PCLMULQDQ actually takes
+ * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
+ * 128-bit XMM register.  This works fine, but care must be taken to account for
+ * "bit endianness".  With the CRC version implemented here, bits are always
+ * ordered such that the lowest-order bit represents the coefficient of highest
+ * power of x and the highest-order bit represents the coefficient of the lowest
+ * power of x.  This is backwards from the more intuitive order.  Still,
+ * carryless multiplication works essentially the same either way.  It just must
+ * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
+ * of a 128-bit XMM register into 128-bits of later data held in another XMM
+ * register, we'll really be XOR-ing the product into the mathematically higher
+ * degree end of those later bits, not the lower degree end as may be expected.
+ *
+ * So given that caveat and the fact that we process 512 bits per iteration, the
+ * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
+ *
+ *	D = (512 + 95) - 64	 for the higher-degree half of each 128 bits,
+ *				 i.e. the lower order bits in the XMM register
+ *
+ *	D = (512 + 95) - 128	 for the lower-degree half of each 128 bits,
+ *				 i.e. the higher order bits in the XMM register
+ *
+ * The required 'x^D mod G(x)' values were precomputed.
+ *
+ * When <= 512 bits remain in the message, we finish up by folding across
+ * smaller distances.  This works similarly; the distance D is just different,
+ * so different constant multipliers must be used.  Finally, once the remaining
+ * message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
+ * (explained later).
+ *
+ * For more information see the original paper from Intel:
+ *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *	December 2009
+ *	http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ */
+static u32 ATTRIBUTES
+FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
+{
+	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
+	const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
+	const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
+	const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
+	const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
+	const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
+	const __v2di barrett_reduction_constants =
+			(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
+
+	const __m128i * const end = p + nr_segs;
+	const __m128i * const end512 = p + (nr_segs & ~3);
+	__m128i x0, x1, x2, x3;
+
+	/*
+	 * Account for the current 'remainder', i.e. the CRC of the part of the
+	 * message already processed.  Explanation: rewrite the message
+	 * polynomial M(x) in terms of the first part A(x), the second part
+	 * B(x), and the length of the second part in bits |B(x)| >= 32:
+	 *
+	 *	M(x) = A(x)*x^|B(x)| + B(x)
+	 *
+	 * Then the CRC of M(x) is:
+	 *
+	 *	CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
+	 *	          = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
+	 *	          = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
+	 *
+	 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
+	 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
+	 *
+	 * So the CRC of the full message is the CRC of the second part of the
+	 * message where the first 32 bits of the second part of the message
+	 * have been XOR'ed with the CRC of the first part of the message.
+	 */
+	x0 = *p++;
+	x0 ^= (__m128i)(__v4si){ remainder };
+
+	if (p > end512) /* only 128, 256, or 384 bits of input? */
+		goto _128_bits_at_a_time;
+	x1 = *p++;
+	x2 = *p++;
+	x3 = *p++;
+
+	/* Fold 512 bits at a time */
+	for (; p != end512; p += 4) {
+		__m128i y0, y1, y2, y3;
+
+		y0 = p[0];
+		y1 = p[1];
+		y2 = p[2];
+		y3 = p[3];
+
+		/*
+		 * Note: the immediate constant for PCLMULQDQ specifies which
+		 * 64-bit halves of the 128-bit vectors to multiply:
+		 *
+		 * 0x00 means low halves (higher degree polynomial terms for us)
+		 * 0x11 means high halves (lower degree polynomial terms for us)
+		 */
+		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
+		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
+		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
+		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
+		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
+		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
+		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
+		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
+
+		x0 = y0;
+		x1 = y1;
+		x2 = y2;
+		x3 = y3;
+	}
+
+	/* Fold 512 bits => 128 bits */
+	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
+	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
+	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
+	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
+	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
+	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
+	x0 = x3;
+
+_128_bits_at_a_time:
+	while (p != end) {
+		/* Fold 128 bits into next 128 bits */
+		x1 = *p++;
+		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
+		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
+		x0 = x1;
+	}
+
+	/* Now there are just 128 bits left, stored in 'x0'. */
+
+	/*
+	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
+	 * which is equivalent to multiplying by x^32.  This is needed because
+	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+	 */
+	x0 = _mm_srli_si128(x0, 8) ^
+	     _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
+
+	/* Fold 96 => 64 bits */
+	x0 = _mm_srli_si128(x0, 4) ^
+	     _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
+
+        /*
+	 * Finally, reduce 64 => 32 bits using Barrett reduction.
+	 *
+	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
+	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
+	 *
+	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
+	 *	     = (A(x)*x^32) mod G(x) + B(x)
+	 *
+	 * Then, by the Division Algorithm there exists a unique q(x) such that:
+	 *
+	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+	 *
+	 * Since the left-hand side is of maximum degree 31, the right-hand side
+	 * must be too.  This implies that we can apply 'mod x^32' to the
+	 * right-hand side without changing its value:
+	 *
+	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+	 *
+	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
+	 *
+	 * We also know that:
+	 *
+	 *	              / A(x)*x^32 \
+	 *	q(x) = floor (  ---------  )
+	 *	              \    G(x)   /
+	 *
+	 * To compute this efficiently, we can multiply the top and bottom by
+	 * x^32 and move the division by G(x) to the top:
+	 *
+	 *	              / A(x) * floor(x^64 / G(x)) \
+	 *	q(x) = floor (  -------------------------  )
+	 *	              \           x^32            /
+	 *
+	 * Note that floor(x^64 / G(x)) is a constant.
+	 *
+	 * So finally we have:
+	 *
+	 *	                          / A(x) * floor(x^64 / G(x)) \
+	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
+	 *	                          \           x^32            /
+	 */
+	x1 = x0;
+	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
+	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
+	return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
+}
+
+#define IMPL_ALIGNMENT		16
+#define IMPL_SEGMENT_SIZE	16
+#include "../crc32_vec_template.h"
--- a/lib/gdeflate/libdeflate/lib/x86/decompress_impl.h
+++ b/lib/gdeflate/libdeflate/lib/x86/decompress_impl.h
@ -0,0 +1,35 @@
+#ifndef LIB_X86_DECOMPRESS_IMPL_H
+#define LIB_X86_DECOMPRESS_IMPL_H
+
+#include "cpu_features.h"
+
+/* Include the BMI2-optimized version? */
+#undef DISPATCH_BMI2
+#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
+	COMPILER_SUPPORTS_BMI2_TARGET
+#  define FUNCNAME	deflate_decompress_bmi2
+#  define ATTRIBUTES	__attribute__((target("bmi2")))
+#  define DISPATCH	1
+#  define DISPATCH_BMI2	1
+#ifdef GDEFLATE
+#  include "../gdeflate_decompress_template.h"
+#else
+#  include "../decompress_template.h"
+#endif
+#endif
+
+#ifdef DISPATCH
+static inline decompress_func_t
+arch_select_decompress_func(void)
+{
+	u32 features = get_cpu_features();
+
+#ifdef DISPATCH_BMI2
+	if (features & X86_CPU_FEATURE_BMI2)
+		return deflate_decompress_bmi2;
+#endif
+	return NULL;
+}
+#endif /* DISPATCH */
+
+#endif /* LIB_X86_DECOMPRESS_IMPL_H */
--- a/lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h
+++ b/lib/gdeflate/libdeflate/lib/x86/matchfinder_impl.h
@ -0,0 +1,122 @@
+/*
+ * x86/matchfinder_impl.h - x86 implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_MATCHFINDER_IMPL_H
+#define LIB_X86_MATCHFINDER_IMPL_H
+
+#ifdef __AVX2__
+#  include <immintrin.h>
+static forceinline void
+matchfinder_init_avx2(mf_pos_t *data, size_t size)
+{
+	__m256i *p = (__m256i *)data;
+	__m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		p[0] = v;
+		p[1] = v;
+		p[2] = v;
+		p[3] = v;
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_init matchfinder_init_avx2
+
+static forceinline void
+matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
+{
+	__m256i *p = (__m256i *)data;
+	__m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		/* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+		p[0] = _mm256_adds_epi16(p[0], v);
+		p[1] = _mm256_adds_epi16(p[1], v);
+		p[2] = _mm256_adds_epi16(p[2], v);
+		p[3] = _mm256_adds_epi16(p[3], v);
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_avx2
+
+#elif defined(__SSE2__)
+#  include <emmintrin.h>
+static forceinline void
+matchfinder_init_sse2(mf_pos_t *data, size_t size)
+{
+	__m128i *p = (__m128i *)data;
+	__m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		p[0] = v;
+		p[1] = v;
+		p[2] = v;
+		p[3] = v;
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_init matchfinder_init_sse2
+
+static forceinline void
+matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
+{
+	__m128i *p = (__m128i *)data;
+	__m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		/* PADDSW: Add Packed Signed Integers With Signed Saturation  */
+		p[0] = _mm_adds_epi16(p[0], v);
+		p[1] = _mm_adds_epi16(p[1], v);
+		p[2] = _mm_adds_epi16(p[2], v);
+		p[3] = _mm_adds_epi16(p[3], v);
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_sse2
+#endif /* __SSE2__ */
+
+#endif /* LIB_X86_MATCHFINDER_IMPL_H */