aboutsummaryrefslogtreecommitdiff
path: root/src/liblzma/check/crc64_x86.s
blob: f9dc595f9faf3021989b0273dbd9669d10e91612 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*
 * Speed-optimized CRC64 using slicing-by-four algorithm
 * Instruction set: i386
 * Optimized for:   i686
 *
 * This code has been put into the public domain by its authors:
 * Igor Pavlov <http://7-zip.org/>
 * Lasse Collin <lasse.collin@tukaani.org>
 *
 * This code needs lzma_crc64_table, which can be created using the
 * following C code:

uint64_t lzma_crc64_table[4][256];

void
init_table(void)
{
	static const uint64_t poly64 = UINT64_C(0xC96C5795D7870F42);

	for (size_t s = 0; s < 4; ++s) {
		for (size_t b = 0; b < 256; ++b) {
			uint64_t r = s == 0 ? b : lzma_crc64_table[s - 1][b];

			for (size_t i = 0; i < 8; ++i) {
				if (r & 1)
					r = (r >> 1) ^ poly64;
				else
					r >>= 1;
			}

			lzma_crc64_table[s][b] = r;
		}
	}
}

 * The prototype of the CRC64 function:
 * extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
 */

	.text
	.globl	lzma_crc64
	.type	lzma_crc64, @function

	.align	16
lzma_crc64:
	/*
	 * uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
	 *
	 * All arguments are read from the stack; the 64-bit result is
	 * left in %edx:%eax. %ebx, %esi, %edi, and %ebp are saved on entry
	 * and restored before returning; %ecx is used as scratch.
	 *
	 * Note that the "size" argument slot at 0x18(%esp) is overwritten
	 * and reused as a local variable (see .L_slice).
	 *
	 * Register usage:
	 * %eax crc LSB
	 * %edx crc MSB
	 * %esi buf
	 * %edi size or buf + size
	 * %ebx lzma_crc64_table
	 * %ebp Table index
	 * %ecx Temporary
	 */
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	pushl	%ebp

	/*
	 * Four registers (16 bytes) were pushed, so the return address is
	 * at 0x10(%esp) and the first argument at 0x14(%esp).
	 */
	movl	0x14(%esp), %esi /* buf */
	movl	0x18(%esp), %edi /* size */
	movl	0x1C(%esp), %eax /* crc LSB */
	movl	0x20(%esp), %edx /* crc MSB */

	/*
	 * Store the address of lzma_crc64_table to %ebx. This is needed to
	 * get position-independent code (PIC): the call/pop pair fetches
	 * the current instruction pointer, the addl turns it into the GOT
	 * address, and the movl loads the table address from the GOT.
	 */
	call	.L_PIC
.L_PIC:
	popl	%ebx
	addl	$_GLOBAL_OFFSET_TABLE_+[.-.L_PIC], %ebx
	movl	lzma_crc64_table@GOT(%ebx), %ebx

	/* Complement the initial value. */
	notl	%eax
	notl	%edx

.L_align:
	/*
	 * Check if there is enough input to use slicing-by-four.
	 * We need eight bytes, because the loop pre-reads four bytes.
	 *
	 * The size is unsigned (size_t), so an unsigned condition is used.
	 * A signed test would treat sizes of 2 GiB or more as negative and
	 * push the whole buffer through the slow byte-at-a-time loop.
	 */
	cmpl	$8, %edi
	jb	.L_rest

	/* Check if we have reached alignment of four bytes. */
	testl	$3, %esi
	jz	.L_slice

	/*
	 * Calculate CRC of the next input byte:
	 * index = (crc & 0xFF) ^ *buf++;
	 * crc = (crc >> 8) ^ table[0][index];
	 */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_align

.L_slice:
	/*
	 * If we get here, there's at least eight bytes of aligned input
	 * available. Make %edi multiple of four bytes. Store the possible
	 * remainder over the "size" variable in the argument stack.
	 */
	movl	%edi, 0x18(%esp)
	andl	$-4, %edi
	subl	%edi, 0x18(%esp)

	/*
	 * Let %edi be buf + size - 4 while running the main loop, that is,
	 * the address of the last aligned four-byte word. The loop runs
	 * while %esi is below %edi (unsigned) and exits when they are equal.
	 */
	addl	%esi, %edi
	subl	$4, %edi

	/* Read in the first four aligned bytes. */
	movl	(%esi), %ecx

.L_loop:
	/*
	 * Fold four input bytes into the CRC. Each of the four tables is
	 * 256 * 8 = 0x800 bytes; the lowest byte of (data ^ crc LSB) indexes
	 * the table at offset 0x1800 and the highest byte the table at
	 * offset 0. The old crc MSB becomes part of the new crc LSB.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/* Check for end of aligned input. */
	cmpl	%edi, %esi

	/*
	 * Copy the next four input bytes to %ecx. It is slightly faster to
	 * read them here than at the top of the loop. movl doesn't modify
	 * the flags, so the result of the cmpl above is still intact.
	 */
	movl	(%esi), %ecx

	/*
	 * %esi and %edi are addresses, so the comparison must be unsigned.
	 * A signed comparison would leave the loop too early when the
	 * buffer crosses the 0x80000000 boundary and give a wrong CRC.
	 */
	jb	.L_loop

	/*
	 * Process the remaining four bytes, which we have already
	 * copied to %ecx.
	 */
	xorl	%eax, %ecx
	movzbl	%cl, %ebp
	movl	0x1800(%ebx, %ebp, 8), %eax
	xorl	%edx, %eax
	movl	0x1804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	xorl	0x1000(%ebx, %ebp, 8), %eax
	xorl	0x1004(%ebx, %ebp, 8), %edx
	shrl	$16, %ecx
	movzbl	%cl, %ebp
	xorl	0x0800(%ebx, %ebp, 8), %eax
	xorl	0x0804(%ebx, %ebp, 8), %edx
	movzbl	%ch, %ebp
	addl	$4, %esi
	xorl	(%ebx, %ebp, 8), %eax
	xorl	4(%ebx, %ebp, 8), %edx

	/*
	 * Copy the number of remaining bytes (0-3, stored by .L_slice)
	 * to %edi.
	 */
	movl	0x18(%esp), %edi

.L_rest:
	/* Check for end of input. */
	testl	%edi, %edi
	jz	.L_return

	/* Calculate CRC of the next input byte (same step as in .L_align). */
	movzbl	(%esi), %ebp
	incl	%esi
	movzbl	%al, %ecx
	xorl	%ecx, %ebp
	shrdl	$8, %edx, %eax
	xorl	(%ebx, %ebp, 8), %eax
	shrl	$8, %edx
	xorl	4(%ebx, %ebp, 8), %edx
	decl	%edi
	jmp	.L_rest

.L_return:
	/* Complement the final value. */
	notl	%eax
	notl	%edx

	/* Restore the saved registers in reverse order of the pushes. */
	popl	%ebp
	popl	%edi
	popl	%esi
	popl	%ebx
	ret

	.size	lzma_crc64, .-lzma_crc64

	/*
	 * This code doesn't need an executable stack. Say so explicitly,
	 * because GNU linkers may otherwise mark the stack executable for
	 * everything linked with this object.
	 */
	.section	.note.GNU-stack,"",@progbits