1 ########################################################################
2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
4 # Copyright (c) 2013, Intel Corporation
7 # Erdinc Ozturk <erdinc.ozturk@intel.com>
8 # Vinodh Gopal <vinodh.gopal@intel.com>
9 # James Guilford <james.guilford@intel.com>
10 # Tim Chen <tim.c.chen@linux.intel.com>
12 # This software is available to you under a choice of one of two
13 # licenses. You may choose to be licensed under the terms of the GNU
14 # General Public License (GPL) Version 2, available from the file
15 # COPYING in the main directory of this source tree, or the
16 # OpenIB.org BSD license below:
18 # Redistribution and use in source and binary forms, with or without
19 # modification, are permitted provided that the following conditions are
22 # * Redistributions of source code must retain the above copyright
23 # notice, this list of conditions and the following disclaimer.
25 # * Redistributions in binary form must reproduce the above copyright
26 # notice, this list of conditions and the following disclaimer in the
27 # documentation and/or other materials provided with the
30 # * Neither the name of the Intel Corporation nor the names of its
31 # contributors may be used to endorse or promote products derived from
32 # this software without specific prior written permission.
35 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
36 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
39 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
45 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46 ########################################################################
48 # UINT16 crc_t10dif_pcl(
49 # UINT16 init_crc, //initial CRC value, 16 bits
50 # const unsigned char *buf, //buffer pointer to calculate CRC on
51 # UINT64 len //buffer length in bytes (64-bit data)
54 # Reference paper titled "Fast CRC Computation for Generic
55 # Polynomials Using PCLMULQDQ Instruction"
56 # URL: http://www.intel.com/content/dam/www/public/us/en/documents
57 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
61 #include <linux/linkage.h>
69 #define arg1_low32 %edi
74 # adjust the 16-bit initial_crc value, scale it to 32 bits
77 # Allocate Stack Space
80 # align stack to 16 byte boundary
81 and $~(0x10 - 1), %rsp
83 # check if smaller than 256
86 # for sizes less than 256, we can't fold 128B at a time...
90 # load the initial crc value
91 movd arg1_low32, %xmm10 # initial crc
93 # crc value does not need to be byte-reflected, but it needs
94 # to be moved to the high part of the register.
95 # because data will be byte-reflected and will align with
96 # initial crc at correct place.
99 movdqa SHUF_MASK(%rip), %xmm11
100 # receive the initial 128B of data, xor the initial crc value
101 movdqu 16*0(arg2), %xmm0
102 movdqu 16*1(arg2), %xmm1
103 movdqu 16*2(arg2), %xmm2
104 movdqu 16*3(arg2), %xmm3
105 movdqu 16*4(arg2), %xmm4
106 movdqu 16*5(arg2), %xmm5
107 movdqu 16*6(arg2), %xmm6
108 movdqu 16*7(arg2), %xmm7
111 # XOR the initial_crc value
121 movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
122 #imm value of pclmulqdq instruction
123 #will determine which constant to use
125 #################################################################
126 # we subtract 256 instead of 128 to save one instruction from the loop
129 # at this section of the code, there is 128*x+y (0<=y<128) bytes of
130 # buffer. The _fold_64_B_loop (despite its name) will fold 128B at a
131 # time until we have 128+y Bytes of buffer
134 # fold 128B at a time. This section of the code folds 8 xmm
135 # registers (xmm0-xmm7) in parallel, two registers per load pair
138 # update the buffer pointer
139 add $128, arg2 # buf += 128
141 movdqu 16*0(arg2), %xmm9 # next data for the xmm0 lane (unaligned load)
142 movdqu 16*1(arg2), %xmm12 # next data for the xmm1 lane
144 pshufb %xmm11, %xmm12 # byte-reflect using SHUF_MASK held in xmm11
147 pclmulqdq $0x0 , %xmm10, %xmm0 # low qword of xmm10 * low qword of xmm0
148 pclmulqdq $0x11, %xmm10, %xmm8 # high qword of xmm10 * high qword of xmm8
149 pclmulqdq $0x0 , %xmm10, %xmm1
150 pclmulqdq $0x11, %xmm10, %xmm13
156 movdqu 16*2(arg2), %xmm9 # next data for the xmm2 lane
157 movdqu 16*3(arg2), %xmm12 # next data for the xmm3 lane
159 pshufb %xmm11, %xmm12
162 pclmulqdq $0x0, %xmm10, %xmm2
163 pclmulqdq $0x11, %xmm10, %xmm8
164 pclmulqdq $0x0, %xmm10, %xmm3
165 pclmulqdq $0x11, %xmm10, %xmm13
171 movdqu 16*4(arg2), %xmm9 # next data for the xmm4 lane
172 movdqu 16*5(arg2), %xmm12 # next data for the xmm5 lane
174 pshufb %xmm11, %xmm12
177 pclmulqdq $0x0, %xmm10, %xmm4
178 pclmulqdq $0x11, %xmm10, %xmm8
179 pclmulqdq $0x0, %xmm10, %xmm5
180 pclmulqdq $0x11, %xmm10, %xmm13
186 movdqu 16*6(arg2), %xmm9 # next data for the xmm6 lane
187 movdqu 16*7(arg2), %xmm12 # next data for the xmm7 lane
189 pshufb %xmm11, %xmm12
191 movdqa %xmm7 , %xmm13 # keep a copy of xmm7 for the high-qword multiply
192 pclmulqdq $0x0 , %xmm10, %xmm6
193 pclmulqdq $0x11, %xmm10, %xmm8
194 pclmulqdq $0x0 , %xmm10, %xmm7
195 pclmulqdq $0x11, %xmm10, %xmm13
203 # check if there is another 128B in the buffer to be able to fold
205 ##################################################################
209 # at this point, the buffer pointer is pointing at the last y Bytes
210 # of the buffer. The 128B of folded data is in 8 of the xmm
211 # registers: xmm0 through xmm7
214 # fold the 8 xmm registers to 1 xmm register with different constants
216 movdqa rk9(%rip), %xmm10
218 pclmulqdq $0x11, %xmm10, %xmm0
219 pclmulqdq $0x0 , %xmm10, %xmm8
223 movdqa rk11(%rip), %xmm10
225 pclmulqdq $0x11, %xmm10, %xmm1
226 pclmulqdq $0x0 , %xmm10, %xmm8
230 movdqa rk13(%rip), %xmm10
232 pclmulqdq $0x11, %xmm10, %xmm2
233 pclmulqdq $0x0 , %xmm10, %xmm8
237 movdqa rk15(%rip), %xmm10
239 pclmulqdq $0x11, %xmm10, %xmm3
240 pclmulqdq $0x0 , %xmm10, %xmm8
244 movdqa rk17(%rip), %xmm10
246 pclmulqdq $0x11, %xmm10, %xmm4
247 pclmulqdq $0x0 , %xmm10, %xmm8
251 movdqa rk19(%rip), %xmm10
253 pclmulqdq $0x11, %xmm10, %xmm5
254 pclmulqdq $0x0 , %xmm10, %xmm8
258 movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
259 #imm value of pclmulqdq instruction
260 #will determine which constant to use
262 pclmulqdq $0x11, %xmm10, %xmm6
263 pclmulqdq $0x0 , %xmm10, %xmm8
268 # instead of 64, we add 48 to the loop counter to save 1 instruction
269 # from the loop instead of a cmp instruction, we use the negative
270 # flag with the jl instruction
272 jl _final_reduction_for_128
274 # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
275 # and the rest is in memory. We can fold 16 bytes at a time if y>=16
276 # continue folding 16B at a time
280 pclmulqdq $0x11, %xmm10, %xmm7
281 pclmulqdq $0x0 , %xmm10, %xmm8
288 # instead of a cmp instruction, we utilize the flags with the
289 # jge instruction equivalent of: cmp arg3, 16-16
290 # check if there is any more 16B in the buffer to be able to fold
291 jge _16B_reduction_loop
293 #now we have 16+z bytes left to reduce, where 0<= z < 16.
294 #first, we reduce the data in the xmm7 register
297 _final_reduction_for_128:
298 # check if any more data to fold. If not, compute the CRC of the final 128 bits
303 # here we are getting data that is less than 16 bytes.
304 # since we know that there was data before the pointer, we can
305 # offset the input pointer before the actual point, to receive
306 # exactly 16 bytes. after that the registers need to be adjusted.
310 movdqu -16(arg2, arg3), %xmm1
313 # get rid of the extra data that was loaded before
314 # load the shift constant
315 lea pshufb_shf_table+16(%rip), %rax
319 # shift xmm2 to the left by arg3 bytes
322 # shift xmm7 to the right by 16-arg3 bytes
323 pxor mask1(%rip), %xmm0
325 pblendvb %xmm2, %xmm1 #xmm0 is implicit
330 pclmulqdq $0x11, %xmm10, %xmm7
331 pclmulqdq $0x0 , %xmm10, %xmm8
336 # compute crc of a 128-bit value
337 movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
341 pclmulqdq $0x1, %xmm10, %xmm7
348 pand mask2(%rip), %xmm0
351 pclmulqdq $0x10, %xmm10, %xmm7
356 movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
358 pclmulqdq $0x01, %xmm10, %xmm7
360 pclmulqdq $0x11, %xmm10, %xmm7
364 pextrd $1, %xmm7, %eax
367 # scale the result back to 16 bits
372 ########################################################################
377 # check if there is enough buffer to be able to fold 16B at a time
380 movdqa SHUF_MASK(%rip), %xmm11
382 # now if there is, load the constants
383 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
385 movd arg1_low32, %xmm0 # get the initial crc value
386 pslldq $12, %xmm0 # align it to its correct place
387 movdqu (arg2), %xmm7 # load the plaintext
388 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
392 # update the buffer pointer
395 # update the counter. subtract 32 instead of 16 to save one
396 # instruction from the loop
399 jmp _16B_reduction_loop
404 # mov initial crc to the return value. this is necessary for
405 # zero-length buffers.
410 movdqa SHUF_MASK(%rip), %xmm11
412 movd arg1_low32, %xmm0 # get the initial crc value
413 pslldq $12, %xmm0 # align it to its correct place
417 jl _less_than_16_left
419 movdqu (arg2), %xmm7 # load the plaintext
420 pshufb %xmm11, %xmm7 # byte-reflect the plaintext
421 pxor %xmm0 , %xmm7 # xor the initial crc value
424 movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
425 jmp _get_last_two_xmms
430 # use stack space to load data less than 16 bytes, zero-out
431 # the 16B in memory first.
440 # backup the counter value
483 pxor %xmm0 , %xmm7 # xor the initial crc value
486 lea pshufb_shf_table+16(%rip), %rax
489 pxor mask1(%rip), %xmm0
498 pxor %xmm0 , %xmm7 # xor the initial crc value
518 pxor %xmm0 , %xmm7 # xor the initial crc value
536 pxor %xmm0 , %xmm7 # xor the initial crc value
549 pxor %xmm0 , %xmm7 # xor the initial crc value
555 ENDPROC(crc_t10dif_pcl)
557 .section .rodata, "a", @progbits
559 # precomputed constants
560 # these constants are precomputed from the poly:
561 # 0x8bb70000 (0x8bb7 scaled to 32 bits)
563 # rk1 = 2^(32*3) mod Q << 32
564 # rk2 = 2^(32*5) mod Q << 32
565 # rk3 = 2^(32*15) mod Q << 32
566 # rk4 = 2^(32*17) mod Q << 32
567 # rk5 = 2^(32*3) mod Q << 32
568 # rk6 = 2^(32*2) mod Q << 32
569 # rk7 = floor(2^64/Q)
572 .quad 0x2d56000000000000
574 .quad 0x06df000000000000
576 .quad 0x9d9d000000000000
578 .quad 0x7cf5000000000000
580 .quad 0x2d56000000000000
582 .quad 0x1368000000000000
584 .quad 0x00000001f65a57f8
586 .quad 0x000000018bb70000
589 .quad 0xceae000000000000
591 .quad 0xbfd6000000000000
593 .quad 0x1e16000000000000
595 .quad 0x713c000000000000
597 .quad 0xf7f9000000000000
599 .quad 0x80a6000000000000
601 .quad 0x044c000000000000
603 .quad 0xe658000000000000
605 .quad 0xad18000000000000
607 .quad 0xa497000000000000
609 .quad 0x6ee3000000000000
611 .quad 0xe7b5000000000000
615 .section .rodata.cst16.mask1, "aM", @progbits, 16
618 .octa 0x80808080808080808080808080808080
620 .section .rodata.cst16.mask2, "aM", @progbits, 16
623 .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
625 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
628 .octa 0x000102030405060708090A0B0C0D0E0F
630 .section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
633 # use these values for shift constants for the pshufb instruction
634 # different alignments result in values as shown:
635 # DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
636 # DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
637 # DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
638 # DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
639 # DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
640 # DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
641 # DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
642 # DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
643 # DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
644 # DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
645 # DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
646 # DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
647 # DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
648 # DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
649 # DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
650 .octa 0x8f8e8d8c8b8a89888786858483828100
651 .octa 0x000e0d0c0b0a09080706050403020100