aboutsummaryrefslogtreecommitdiff
path: root/src/liblzma/check/crc_common.h
blob: 867e53d91ff187a0a32f38248bcfc18160343d1a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
///////////////////////////////////////////////////////////////////////////////
//
/// \file       crc_common.h
/// \brief      Some functions and macros for CRC32 and CRC64
//
//  Authors:    Lasse Collin
//              Ilya Kurdyukov
//              Hans Jansen
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifdef WORDS_BIGENDIAN
#	define A(x) ((x) >> 24)
#	define B(x) (((x) >> 16) & 0xFF)
#	define C(x) (((x) >> 8) & 0xFF)
#	define D(x) ((x) & 0xFF)

#	define S8(x) ((x) << 8)
#	define S32(x) ((x) << 32)

#else
#	define A(x) ((x) & 0xFF)
#	define B(x) (((x) >> 8) & 0xFF)
#	define C(x) (((x) >> 16) & 0xFF)
#	define D(x) ((x) >> 24)

#	define S8(x) ((x) >> 8)
#	define S32(x) ((x) >> 32)
#endif


#undef CRC_GENERIC
#undef CRC_CLMUL
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS

// If CLMUL cannot be used then only the generic slice-by-four is built.
#if !defined(HAVE_USABLE_CLMUL)
#	define CRC_GENERIC 1

// If CLMUL is allowed unconditionally in the compiler options then the
// generic version can be omitted. Note that this doesn't work with MSVC
// as I don't know how to detect the features here.
//
// NOTE: Keep this this in sync with crc32_table.c.
#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
		|| (defined(__e2k__) && __iset__ >= 6)
#	define CRC_CLMUL 1

// Otherwise build both and detect at runtime which version to use.
#else
#	define CRC_GENERIC 1
#	define CRC_CLMUL 1

/*
	// The generic code is much faster with 1-8-byte inputs and has
	// similar performance up to 16 bytes  at least in microbenchmarks
	// (it depends on input buffer alignment too). If both versions are
	// built, this #define will use the generic version for inputs up to
	// 16 bytes and CLMUL for bigger inputs. It saves a little in code
	// size since the special cases for 0-16-byte inputs will be omitted
	// from the CLMUL code.
#	define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
*/

#	if defined(_MSC_VER)
#		include <intrin.h>
#	elif defined(HAVE_CPUID_H)
#		include <cpuid.h>
#	endif
#endif

////////////////////////
// Detect CPU support //
////////////////////////

#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
static inline bool
is_clmul_supported(void)
{
	int success = 1;
	uint32_t r[4]; // eax, ebx, ecx, edx

#if defined(_MSC_VER)
	// This needs <intrin.h> with MSVC. ICC has it as a built-in
	// on all platforms.
	__cpuid(r, 1);
#elif defined(HAVE_CPUID_H)
	// Compared to just using __asm__ to run CPUID, this also checks
	// that CPUID is supported and saves and restores ebx as that is
	// needed with GCC < 5 with position-independent code (PIC).
	success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
#else
	// Just a fallback that shouldn't be needed.
	__asm__("cpuid\n\t"
			: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
			: "a"(1), "c"(0));
#endif

	// Returns true if these are supported:
	// CLMUL (bit 1 in ecx)
	// SSSE3 (bit 9 in ecx)
	// SSE4.1 (bit 19 in ecx)
	const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
	return success && (r[2] & ecx_mask) == ecx_mask;

	// Alternative methods that weren't used:
	//   - ICC's _may_i_use_cpu_feature: the other methods should work too.
	//   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
	//
	// CPUID decding is needed with MSVC anyway and older GCC. This keeps
	// the feature checks in the build system simpler too. The nice thing
	// about __builtin_cpu_supports would be that it generates very short
	// code as is it only reads a variable set at startup but a few bytes
	// doesn't matter here.
}
#endif


#define MASK_L(in, mask, r) r = _mm_shuffle_epi8(in, mask);
#define MASK_H(in, mask, r) \
	r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign));
#define MASK_LH(in, mask, low, high) \
	MASK_L(in, mask, low) MASK_H(in, mask, high)

#ifdef CRC_CLMUL

#include <immintrin.h>


#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
#if lzma_has_attribute(__no_sanitize_address__)
__attribute__((__no_sanitize_address__))
#endif
static inline void
crc_simd_body(const uint8_t *buf, size_t size, __m128i *v0, __m128i *v1,
		__m128i vfold16, __m128i crc2vec)
{
#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
#	pragma GCC diagnostic push
#	pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
	// Memory addresses A to D and the distances between them:
	//
	//     A           B     C         D
	//     [skip_start][size][skip_end]
	//     [     size2      ]
	//
	// A and D are 16-byte aligned. B and C are 1-byte aligned.
	// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
	//
	// A = aligned_buf will initially point to this address.
	// B = The address pointed by the caller-supplied buf.
	// C = buf + size == aligned_buf + size2
	// D = buf + size + skip_end == aligned_buf + size2 + skip_end
	uintptr_t skip_start = (uintptr_t)buf & 15;
	uintptr_t skip_end = -(uintptr_t)(buf + size) & 15;

	// Create a vector with 8-bit values 0 to 15.
	// This is used to construct control masks
	// for _mm_blendv_epi8 and _mm_shuffle_epi8.
	__m128i vramp = _mm_setr_epi32(
			0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);

	// This is used to inverse the control mask of _mm_shuffle_epi8
	// so that bytes that wouldn't be picked with the original mask
	// will be picked and vice versa.
	__m128i vsign = _mm_set1_epi8(-0x80);

	// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8
	// The first skip_start or skip_end bytes in the vectors will hav
	// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi
	// will produce zeros for these positions. (Bitwise-xor of thes
	// masks with vsign will produce the opposite behavior.)
	__m128i mask_start = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start));
	__m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end));

	// If size2 <= 16 then the whole input fits into a single 16-byte
	// vector. If size2 > 16 then at least two 16-byte vectors must
	// be processed. If size2 > 16 && size <= 16 then there is only
	// one 16-byte vector's worth of input but it is unaligned in memory.
	//
	// NOTE: There is no integer overflow here if the arguments
	// are valid. If this overflowed, buf + size would too.
	uintptr_t size2 = skip_start + size;
	const __m128i *aligned_buf = (const __m128i*)((uintptr_t)buf & -16);
	__m128i v2, v3, vcrc, data0;

	vcrc = crc2vec;

	// Get the first 1-16 bytes into data0. If loading less than 16
	// bytes, the bytes are loaded to the high bits of the vector and
	// the least significant positions are filled with zeros.
	data0 = _mm_load_si128(aligned_buf);
	data0 = _mm_blendv_epi8(data0, _mm_setzero_si128(), mask_start);
	aligned_buf++;
	if (size2 <= 16) {
		//  There are 1-16 bytes of input and it is all
		//  in data0. Copy the input bytes to v3. If there
		//  are fewer than 16 bytes, the low bytes in v3
		//  will be filled with zeros. That is, the input
		//  bytes are stored to the same position as
		//  (part of) initial_crc is in v0.
		__m128i mask_low = _mm_add_epi8(
				vramp, _mm_set1_epi8(size - 16));
		MASK_LH(vcrc, mask_low, *v0, *v1)
		MASK_L(data0, mask_end, v3)
		*v0 = _mm_xor_si128(*v0, v3);
		*v1 = _mm_alignr_epi8(*v1, *v0, 8);
	} else {
		__m128i data1 = _mm_load_si128(aligned_buf);
		if (size <= 16) {
			//  Collect the 2-16 input bytes from data0 and data1
			//  to v2 and v3, and bitwise-xor them with the
			//  low bits of initial_crc in v0. Note that the
			//  the second xor is below this else-block as it
			//  is shared with the other branch.
			__m128i mask_low = _mm_add_epi8(
					vramp, _mm_set1_epi8(size - 16));
			MASK_LH(vcrc, mask_low, *v0, *v1);
			MASK_H(data0, mask_end, v2)
			MASK_L(data1, mask_end, v3)
			*v0 = _mm_xor_si128(*v0, v2);
			*v0 = _mm_xor_si128(*v0, v3);

			*v1 = _mm_alignr_epi8(*v1, *v0, 8);
		} else {
			const __m128i *end = (const __m128i*)(
					(char*)aligned_buf++ - 16 + size2);
			MASK_LH(vcrc, mask_start, *v0, *v1)
			*v0 = _mm_xor_si128(*v0, data0);
			*v1 = _mm_xor_si128(*v1, data1);
			while (aligned_buf < end) {
                                *v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x00)); \
	                        *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x11));
                                *v1 = _mm_load_si128(aligned_buf++);
                        }

			if (aligned_buf != end) {
				MASK_H(*v0, mask_end, v2)
				MASK_L(*v0, mask_end, *v0)
				MASK_L(*v1, mask_end, v3)
				*v1 = _mm_or_si128(v2, v3);
			}
			*v1 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x00));
	                *v0 = _mm_xor_si128(*v1, _mm_clmulepi64_si128(*v0, vfold16, 0x11));

			*v1 = _mm_srli_si128(*v0, 8);
		}
	}
}
#endif