2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/nospec-branch.h>
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register. This can done for either FP or integer values, for FP use
39 * movaps (move aligned packed single) or integer use movdqa (move double quad
40 * aligned). It doesn't make a performance difference which instruction is used
41 * since Nehalem (original Core i7) was released. However, the movaps is a byte
42 * shorter, so that is the one we'll use for now. (same for unaligned).
# Constant used by gf128mul_x_ble (multiply by x in GF(2^128), little-endian
# "ble" convention); 0x87 is the low byte of the reduction polynomial.
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
# GHASH reduction polynomial, bit-reflected:
# x^128 + x^127 + x^126 + x^121 + 1
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
# byte-reversal mask for PSHUFB (big-endian <-> little-endian 16-byte swap)
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
# MASK1/MASK2 select the low/high 64 bits of an XMM register respectively
MASK1: .octa 0x0000000000000000ffffffffffffffff
MASK2: .octa 0xffffffffffffffff0000000000000000
# identity byte-shuffle; indexed off SHIFT_MASK/ALL_F to mask partial blocks
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
ZERO: .octa 0x00000000000000000000000000000000
ONE: .octa 0x00000000000000000000000000000001
# NOTE(review): this literal has an odd number of hex digits (31) — verify
# against the upstream source; a digit may have been lost in extraction.
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
// Scratch-frame layout: 16-byte slots below the 64-byte-aligned %rsp holding
// the precomputed hash-key powers (HashKey^i <<1 mod poly) and their
// Karatsuba helper values, consumed by the GHASH macros below.
#define STACK_OFFSET 8*3
#define HashKey 16*0 // store HashKey <<1 mod poly here
#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
// bits of HashKey <<1 mod poly here
//(for Karatsuba purposes)
#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
// bits of HashKey^2 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
// bits of HashKey^3 <<1 mod poly here
// (for Karatsuba purposes)
#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
// bits of HashKey^4 <<1 mod poly here
// (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8
// Stack-passed arguments 7..10, addressed relative to %r14 — presumably the
// stack pointer saved on entry; confirm in the prologue (not visible here).
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
// key-length field of the AES context — NOTE(review): offset assumes an
// expanded-key layout of 2*15 round keys of 16 bytes each; confirm against
// the C-side struct definition.
#define keysize 2*15*16(%arg1)
#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm10
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
* Karatsuba multiply (3 PCLMULQDQs instead of 4) followed by the standard
* two-phase Montgomery-style reduction by the GHASH polynomial.
* Clobbers: GH (result), TMP1-TMP5.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\TMP1, \TMP2		# fold a1*b1 out of the middle term
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# TMP2:GH holds the result of GH*HK
	# first phase of the reduction
	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	# second phase of the reduction
	movdqa	\GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
	psrld	$1,\TMP2		# packed right shift >>1
	psrld	$2,\TMP3		# packed right shift >>2
	psrld	$7,\TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP1, \GH		# result is in GH
/*
* if a = number of total plaintext bytes
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Reads the AAD (arg7/arg8), runs AES-CTR on the first 1-3 blocks and
* GHASHes them, then precomputes HashKey^2..^4 (and the Karatsuba helper
* values) overlapped with the encryption of the first 4 counter blocks.
* NOTE(review): several interior lines are elided in this excerpt.
*/
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	arg7, %r10		# %r10 = AAD
	mov	arg8, %r12		# %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
	jne	_get_AAD_loop\num_initial_blocks\operation
	je	_get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
	jne	_get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	xor	%r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks
	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), \XMM0		# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	movdqa	\XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
aes_loop_initial_dec\num_initial_blocks:
	AESENC	\TMP1, %xmm\index
	jnz	aes_loop_initial_dec\num_initial_blocks
	AESENCLAST \TMP1, %xmm\index	# Last Round
	movdqu	(%arg3 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	movdqa	\TMP1, %xmm\index	# decrypt: GHASH the *ciphertext*, so keep it
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%rip), \TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
	MOVADQ	0(%arg1),\TMP1
	pshufd	$78, \TMP3, \TMP1
	movdqa	\TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%rsp)
	# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%rsp)
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_pre_dec_done\num_initial_blocks
aes_loop_pre_dec\num_initial_blocks:
	AESENC	\TMP2, %xmm\index
	jnz	aes_loop_pre_dec\num_initial_blocks
aes_loop_pre_dec_done\num_initial_blocks:
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	16*1(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	16*2(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	16*3(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
/*
* if a = number of total plaintext bytes
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
*
* Encrypt-side twin of INITIAL_BLOCKS_DEC: the GHASH input is the freshly
* produced ciphertext rather than the input buffer.
* NOTE(review): several interior lines are elided in this excerpt.
*/
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	arg7, %r10		# %r10 = AAD
	mov	arg8, %r12		# %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
	jne	_get_AAD_loop\num_initial_blocks\operation
	je	_get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
	jne	_get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data
	xor	%r11, %r11		# initialise the data pointer offset as zero
	# start AES for num_initial_blocks blocks
	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), \XMM0		# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	0(%arg1),\TMP2
	paddd	\TMP1, \XMM0		# INCR Y0
	MOVADQ	\XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
aes_loop_initial_enc\num_initial_blocks:
	AESENC	\TMP1, %xmm\index
	jnz	aes_loop_initial_enc\num_initial_blocks
	AESENCLAST \TMP1, %xmm\index	# Last Round
	movdqu	(%arg3 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
	GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	jl	_initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	paddd	\TMP1, \XMM0		# INCR Y0
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
	MOVADQ	0(%arg1),\TMP1
	pshufd	$78, \TMP3, \TMP1
	movdqa	\TMP1, HashKey_k(%rsp)
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	\TMP5, HashKey_2(%rsp)
	# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	\TMP5, HashKey_3(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	\TMP5, HashKey_4(%rsp)
	pshufd	$78, \TMP5, \TMP1
	movdqa	\TMP1, HashKey_4_k(%rsp)
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_pre_enc_done\num_initial_blocks
aes_loop_pre_enc\num_initial_blocks:
	AESENC	\TMP2, %xmm\index
	jnz	aes_loop_pre_enc\num_initial_blocks
aes_loop_pre_enc_done\num_initial_blocks:
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	16*0(%arg3 , %r11 , 1), \TMP1
	movdqu	16*1(%arg3 , %r11 , 1), \TMP1
	movdqu	16*2(%arg3 , %r11 , 1), \TMP1
	movdqu	16*3(%arg3 , %r11 , 1), \TMP1
	movdqu	\XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg2 , %r11 , 1)
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* AES rounds for the 4 counter blocks are interleaved with the Karatsuba
* GHASH of the previous 4 ciphertext blocks to hide instruction latency.
* NOTE(review): several interior lines are elided in this excerpt.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba
	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 1
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	$78, \XMM7, \TMP2
	movdqa	HashKey_2(%rsp ), \TMP5
	# Multiply TMP5 * HashKey using karatsuba
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 7
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands
	pshufd	$78, \XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_par_enc_done
	AESENC	\TMP3, %xmm\index
aes_loop_par_enc_done:
	AESENCLAST \TMP3, \XMM1		# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5
	# first phase of reduction
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs
	# second phase of reduction
	movdqa	\XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP1, \XMM5		# result is in XMM5
/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*
* Decrypt-side twin of GHASH_4_ENCRYPT_4_PARALLEL_ENC; after each CTR XOR
* the *input* ciphertext block is kept (in XMM1..XMM4) for GHASH.
* NOTE(review): several interior lines are elided in this excerpt.
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba
	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0	# INCR CNT
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4	# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	paddd	ONE(%rip), \XMM0	# INCR CNT
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5	# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	movdqa	HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6	# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 1
	movaps	0x20(%arg1), \TMP1
	AESENC	\TMP1, \XMM1		# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 3
	PCLMULQDQ 0x00, \TMP5, \XMM6	# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 4
	movdqa	HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 5
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pshufd	$78, \XMM7, \TMP2
	movdqa	HashKey_2(%rsp ), \TMP5
	# Multiply TMP5 * HashKey using karatsuba
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 6
	PCLMULQDQ 0x00, \TMP5, \XMM7	# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 7
	movdqa	HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 8
	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands
	pshufd	$78, \XMM8, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	AESENC	\TMP3, \XMM1		# Round 9
	PCLMULQDQ 0x00, \TMP5, \XMM8	# XMM8 = a0*b0
	shr	$2,%eax			# 128->4, 192->6, 256->8
	sub	$4,%eax			# 128->0, 192->2, 256->4
	jz	aes_loop_par_dec_done
	AESENC	\TMP3, %xmm\index
	jnz	aes_loop_par_dec
aes_loop_par_dec_done:
	MOVADQ	(%r10), \TMP3
	AESENCLAST \TMP3, \XMM1		# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM1		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	16(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM2		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	32(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM3		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqu	48(%arg3,%r11,1), \TMP3
	pxor	\TMP3, \XMM4		# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	PSHUFB_XMM %xmm15, \XMM1	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4	# perform a 16 byte swap
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# accumulate the results in TMP1:XMM5
	# first phase of reduction
	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2		# packed left shift << 31
	pslld	$30, \TMP3		# packed left shift << 30
	pslld	$25, \TMP4		# packed left shift << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift T5 1 DW
	pslldq	$12, \TMP2		# left shift T2 3 DWs
	# second phase of reduction
	movdqa	\XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2		# packed right shift >>1
	psrld	$2, \TMP3		# packed right shift >>2
	psrld	$7, \TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP1, \XMM5		# result is in XMM5
/* GHASH the last 4 ciphertext blocks: four Karatsuba multiplies by
 * HashKey^4..HashKey^1 accumulated, then the two-phase reduction.
 * Result (the updated GHASH state) lands in XMMDst. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
	# Multiply TMP6 * HashKey (using Karatsuba)
	pshufd	$78, \XMM1, \TMP2
	movdqa	HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqa	HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1
	# Multiply TMP1 * HashKey (using Karatsuba)
	pshufd	$78, \XMM2, \TMP2
	movdqa	HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqa	HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# results accumulated in TMP6, XMMDst, XMM1
	# Multiply TMP1 * HashKey (using Karatsuba)
	pshufd	$78, \XMM3, \TMP2
	movdqa	HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqa	HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1
	# Multiply TMP1 * HashKey (using Karatsuba)
	pshufd	$78, \XMM4, \TMP2
	movdqa	HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqa	HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# middle section of the temp results combined as in karatsuba algorithm
	pslldq	$8, \TMP4		# left shift TMP4 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2		# packed left shifting << 31
	pslld	$30, \TMP3		# packed left shifting << 30
	pslld	$25, \TMP4		# packed left shifting << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP7		# right shift TMP7 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	# second phase of the reduction
	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
/* Encryption of a single block: run all AES rounds on XMM0 in place.
 * Round count is derived from the key size read via %eax (the loop body
 * itself is elided in this excerpt); TMP1 is scratch for round keys. */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address
	AESENCLAST \TMP1,\XMM0
1197 /*****************************************************************************
1198 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1199 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1200 * const u8 *in, // Ciphertext input
1201 * u64 plaintext_len, // Length of data in bytes for decryption.
1202 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1203 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1204 * // concatenated with 0x00000001. 16-byte aligned pointer.
1205 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1206 * const u8 *aad, // Additional Authentication Data (AAD)
1207 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1208 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1209 * // given authentication tag and only return the plaintext if they match.
1210 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1211 * // (most likely), 12 or 8.
1216 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1217 * set of 11 keys in the data structure void *aes_ctx
1221 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1222 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1223 * | Salt (From the SA) |
1224 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1225 * | Initialization Vector |
1226 * | (This is the sequence number from IPSec header) |
1227 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1229 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234 * AAD padded to 128 bits with 0
1235 * for example, assume AAD is a u32 vector
1237 * if AAD is 8 bytes:
1238 * AAD[3] = {A0, A1};
1239 * padded AAD in xmm register = {A1 A0 0 0}
1242 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246 * | 32-bit Sequence Number (A0) |
1247 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251 * AAD Format with 32-bit Sequence Number
1253 * if AAD is 12 bytes:
1254 * AAD[3] = {A0, A1, A2};
1255 * padded AAD in xmm register = {A2 A1 A0 0}
1258 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1259 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1260 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1261 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1263 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1264 * | 64-bit Extended Sequence Number {A1,A0} |
1266 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1268 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1270 * AAD Format with 64-bit Extended Sequence Number
* from the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code additionally supports an aadLen of 16 bytes; for any other
* size the code will fail.
1277 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1278 * For other sizes, the code will fail.
1280 * poly = x^128 + x^127 + x^126 + x^121 + 1
1282 *****************************************************************************/
########################################################################
# aesni_gcm_dec: AES-GCM decryption (prototype and AAD layout in the
# comment block above).  Visible flow: load and byte-swap the hash
# subkey, derive HashKey<<1 (mod poly) for GHASH, decrypt full 16-byte
# blocks (up to 3 initial blocks, then 4 at a time in parallel), handle
# the final partial (<16 byte) block with a shifted/masked counter
# block, then fold len(A)||len(C) into GHASH and encrypt Y0 to form the
# authentication tag.  Clobbers all %xmm registers (see note below).
# NOTE(review): this listing is an elided excerpt - some interior
# instructions are not shown.
########################################################################
1283 ENTRY(aesni_gcm_dec)
1289 * states of %xmm registers %xmm6:%xmm15 not saved
1290 * all %xmm registers are clobbered
1292 sub $VARIABLE_OFFSET, %rsp
1293 and $~63, %rsp # align rsp to 64 bytes
1295 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1296 movdqa SHUF_MASK(%rip), %xmm2
1297 PSHUFB_XMM %xmm2, %xmm13
1300 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1302 movdqa %xmm13, %xmm2
1312 pshufd $0x24, %xmm1, %xmm2
1313 pcmpeqd TWOONE(%rip), %xmm2
1314 pand POLY(%rip), %xmm2
1315 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1318 # Decrypt first few blocks
1320 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1321 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1322 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1325 jz _initial_num_blocks_is_0_decrypt
1327 jb _initial_num_blocks_is_1_decrypt
1328 je _initial_num_blocks_is_2_decrypt
1329 _initial_num_blocks_is_3_decrypt:
1330 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1331 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1333 jmp _initial_blocks_decrypted
1334 _initial_num_blocks_is_2_decrypt:
1335 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1336 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1338 jmp _initial_blocks_decrypted
1339 _initial_num_blocks_is_1_decrypt:
1340 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1341 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1343 jmp _initial_blocks_decrypted
1344 _initial_num_blocks_is_0_decrypt:
1345 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1346 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1347 _initial_blocks_decrypted:
1349 je _zero_cipher_left_decrypt
1351 je _four_cipher_left_decrypt
# Bulk loop: decrypt + GHASH four blocks per iteration
1353 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1354 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1358 _four_cipher_left_decrypt:
1359 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1360 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1361 _zero_cipher_left_decrypt:
1363 and $15, %r13 # %r13 = arg4 (mod 16)
1364 je _multiple_of_16_bytes_decrypt
1366 # Handle the last <16 byte block separately
1368 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1369 movdqa SHUF_MASK(%rip), %xmm10
1370 PSHUFB_XMM %xmm10, %xmm0
1372 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1375 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1376 lea SHIFT_MASK+16(%rip), %r12
1378 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1379 # (%r13 is the number of bytes in plaintext mod 16)
1380 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1381 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1384 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1385 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1386 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1387 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1389 movdqa SHUF_MASK(%rip), %xmm10
1390 PSHUFB_XMM %xmm10 ,%xmm2
1393 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1394 # GHASH computation for the last <16 byte block
# Store the partial plaintext block: 8 bytes at a time, then byte by byte
1399 MOVQ_R64_XMM %xmm0, %rax
1401 jle _less_than_8_bytes_left_decrypt
1402 mov %rax, (%arg2 , %r11, 1)
1405 MOVQ_R64_XMM %xmm0, %rax
1407 _less_than_8_bytes_left_decrypt:
1408 mov %al, (%arg2, %r11, 1)
1412 jne _less_than_8_bytes_left_decrypt
1413 _multiple_of_16_bytes_decrypt:
1414 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1415 shl $3, %r12 # convert into number of bits
1416 movd %r12d, %xmm15 # len(A) in %xmm15
1417 shl $3, %arg4 # len(C) in bits (*128)
1418 MOVQ_R64_XMM %arg4, %xmm1
1419 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1420 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1422 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1423 # final GHASH computation
1424 movdqa SHUF_MASK(%rip), %xmm10
1425 PSHUFB_XMM %xmm10, %xmm8
1427 mov %arg5, %rax # %rax = *Y0
1428 movdqu (%rax), %xmm0 # %xmm0 = Y0
1429 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
# Copy the tag out; the branches below select 8/12/16-byte tag widths
1432 mov arg9, %r10 # %r10 = authTag
1433 mov arg10, %r11 # %r11 = auth_tag_len
1439 MOVQ_R64_XMM %xmm0, %rax
1441 jmp _return_T_done_decrypt
1443 MOVQ_R64_XMM %xmm0, %rax
1448 jmp _return_T_done_decrypt
1450 movdqu %xmm0, (%r10)
1451 _return_T_done_decrypt:
1457 ENDPROC(aesni_gcm_dec)
1460 /*****************************************************************************
1461 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1462 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1463 * const u8 *in, // Plaintext input
1464 * u64 plaintext_len, // Length of data in bytes for encryption.
1465 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1466 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1467 * // concatenated with 0x00000001. 16-byte aligned pointer.
1468 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1469 * const u8 *aad, // Additional Authentication Data (AAD)
1470 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1471 * u8 *auth_tag, // Authenticated Tag output.
1472 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1478 * keys are pre-expanded and aligned to 16 bytes. we are using the
1479 * first set of 11 keys in the data structure void *aes_ctx
1484 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1485 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1486 * | Salt (From the SA) |
1487 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1488 * | Initialization Vector |
1489 * | (This is the sequence number from IPSec header) |
1490 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1492 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497 * AAD padded to 128 bits with 0
1498 * for example, assume AAD is a u32 vector
1500 * if AAD is 8 bytes:
1501 * AAD[3] = {A0, A1};
1502 * padded AAD in xmm register = {A1 A0 0 0}
1505 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1508 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509 * | 32-bit Sequence Number (A0) |
1510 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1514 * AAD Format with 32-bit Sequence Number
1516 * if AAD is 12 bytes:
1517 * AAD[3] = {A0, A1, A2};
1518 * padded AAD in xmm register = {A2 A1 A0 0}
1521 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1522 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1525 * | 64-bit Extended Sequence Number {A1,A0} |
1527 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1529 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1531 * AAD Format with 64-bit Extended Sequence Number
1534 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1535 * The code supports 16 too but for other sizes, the code will fail.
1538 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1539 * For other sizes, the code will fail.
1541 * poly = x^128 + x^127 + x^126 + x^121 + 1
1542 ***************************************************************************/
########################################################################
# aesni_gcm_enc: AES-GCM encryption, structural mirror of
# aesni_gcm_dec above (prototype and AAD layout in the comment block
# above).  Visible flow: derive HashKey<<1 (mod poly), encrypt full
# blocks (up to 3 initial, then 4 at a time), handle the final partial
# block, fold len(A)||len(C) into GHASH, and emit the auth tag from
# E(K, Y0).  Clobbers all %xmm registers (see note below).
# NOTE(review): this listing is an elided excerpt - some interior
# instructions are not shown.
########################################################################
1543 ENTRY(aesni_gcm_enc)
1549 # states of %xmm registers %xmm6:%xmm15 not saved
1550 # all %xmm registers are clobbered
1552 sub $VARIABLE_OFFSET, %rsp
1555 movdqu (%r12), %xmm13
1556 movdqa SHUF_MASK(%rip), %xmm2
1557 PSHUFB_XMM %xmm2, %xmm13
1560 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1562 movdqa %xmm13, %xmm2
1572 pshufd $0x24, %xmm1, %xmm2
1573 pcmpeqd TWOONE(%rip), %xmm2
1574 pand POLY(%rip), %xmm2
1576 movdqa %xmm13, HashKey(%rsp)
1577 mov %arg4, %r13 # %r13 = plaintext length in bytes
1581 # Encrypt first few blocks
1584 jz _initial_num_blocks_is_0_encrypt
1586 jb _initial_num_blocks_is_1_encrypt
1587 je _initial_num_blocks_is_2_encrypt
1588 _initial_num_blocks_is_3_encrypt:
1589 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1590 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1592 jmp _initial_blocks_encrypted
1593 _initial_num_blocks_is_2_encrypt:
1594 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1595 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1597 jmp _initial_blocks_encrypted
1598 _initial_num_blocks_is_1_encrypt:
1599 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1600 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1602 jmp _initial_blocks_encrypted
1603 _initial_num_blocks_is_0_encrypt:
1604 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1605 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1606 _initial_blocks_encrypted:
1608 # Main loop - Encrypt remaining blocks
1611 je _zero_cipher_left_encrypt
1613 je _four_cipher_left_encrypt
1614 _encrypt_by_4_encrypt:
1615 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1616 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1619 jne _encrypt_by_4_encrypt
1620 _four_cipher_left_encrypt:
1621 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1622 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1623 _zero_cipher_left_encrypt:
1625 and $15, %r13 # %r13 = arg4 (mod 16)
1626 je _multiple_of_16_bytes_encrypt
1628 # Handle the last <16 Byte block separately
1629 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10, %xmm0
1634 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1637 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1638 lea SHIFT_MASK+16(%rip), %r12
1640 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1641 # (%r13 is the number of bytes in plaintext mod 16)
1642 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1643 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1644 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1645 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1646 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1647 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1648 movdqa SHUF_MASK(%rip), %xmm10
1649 PSHUFB_XMM %xmm10,%xmm0
1652 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1653 # GHASH computation for the last <16 byte block
1657 movdqa SHUF_MASK(%rip), %xmm10
1658 PSHUFB_XMM %xmm10, %xmm0
1660 # shuffle xmm0 back to output as ciphertext
# Store the partial ciphertext block: 8 bytes at a time, then byte by byte
1663 MOVQ_R64_XMM %xmm0, %rax
1665 jle _less_than_8_bytes_left_encrypt
1666 mov %rax, (%arg2 , %r11, 1)
1669 MOVQ_R64_XMM %xmm0, %rax
1671 _less_than_8_bytes_left_encrypt:
1672 mov %al, (%arg2, %r11, 1)
1676 jne _less_than_8_bytes_left_encrypt
1677 _multiple_of_16_bytes_encrypt:
1678 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1680 movd %r12d, %xmm15 # len(A) in %xmm15
1681 shl $3, %arg4 # len(C) in bits (*128)
1682 MOVQ_R64_XMM %arg4, %xmm1
1683 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1684 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1686 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1687 # final GHASH computation
1688 movdqa SHUF_MASK(%rip), %xmm10
1689 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1691 mov %arg5, %rax # %rax = *Y0
1692 movdqu (%rax), %xmm0 # %xmm0 = Y0
1693 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
# Copy the tag out; the branches below select 8/12/16-byte tag widths
1696 mov arg9, %r10 # %r10 = authTag
1697 mov arg10, %r11 # %r11 = auth_tag_len
1703 MOVQ_R64_XMM %xmm0, %rax
1705 jmp _return_T_done_encrypt
1707 MOVQ_R64_XMM %xmm0, %rax
1712 jmp _return_T_done_encrypt
1714 movdqu %xmm0, (%r10)
1715 _return_T_done_encrypt:
1721 ENDPROC(aesni_gcm_enc)
# Key-expansion helper shared by AES-128 and the "a" (even) half of
# AES-256 setup - note the two ENDPROC lines below cover both entry
# labels (the _key_expansion_128 label itself is not shown in this
# excerpt).  %xmm1 = AESKEYGENASSIST result, %xmm0 = previous round
# key; the new round key is written at (TKEYP).  %xmm4 is assumed to
# be zero on entry (see aesni_set_key).
1728 _key_expansion_256a:
1729 pshufd $0b11111111, %xmm1, %xmm1
1730 shufps $0b00010000, %xmm0, %xmm4
1732 shufps $0b10001100, %xmm0, %xmm4
1735 movaps %xmm0, (TKEYP)
1738 ENDPROC(_key_expansion_128)
1739 ENDPROC(_key_expansion_256a)
# AES-192 key-expansion helper ("a" variant): derives new key material
# from %xmm1 (AESKEYGENASSIST result) and %xmm0/%xmm2 (previous key
# words), then stores two 16-byte round-key slots at (TKEYP) and
# 0x10(TKEYP).  %xmm4 is assumed zero on entry.
1742 _key_expansion_192a:
1743 pshufd $0b01010101, %xmm1, %xmm1
1744 shufps $0b00010000, %xmm0, %xmm4
1746 shufps $0b10001100, %xmm0, %xmm4
1753 pshufd $0b11111111, %xmm0, %xmm3
1758 shufps $0b01000100, %xmm0, %xmm6
1759 movaps %xmm6, (TKEYP)
1760 shufps $0b01001110, %xmm2, %xmm1
1761 movaps %xmm1, 0x10(TKEYP)
1764 ENDPROC(_key_expansion_192a)
# AES-192 key-expansion helper ("b" variant): same derivation as the
# "a" variant but stores only one 16-byte round-key slot at (TKEYP).
# %xmm4 is assumed zero on entry.
1767 _key_expansion_192b:
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1771 shufps $0b10001100, %xmm0, %xmm4
1777 pshufd $0b11111111, %xmm0, %xmm3
1781 movaps %xmm0, (TKEYP)
1784 ENDPROC(_key_expansion_192b)
# AES-256 key-expansion helper ("b", odd half): operates on %xmm2 (the
# second 16 bytes of the user key) with %xmm1 = AESKEYGENASSIST result,
# and stores the new round key at (TKEYP).  %xmm4 is assumed zero on
# entry.
1787 _key_expansion_256b:
1788 pshufd $0b10101010, %xmm1, %xmm1
1789 shufps $0b00010000, %xmm2, %xmm4
1791 shufps $0b10001100, %xmm2, %xmm4
1794 movaps %xmm2, (TKEYP)
1797 ENDPROC(_key_expansion_256b)
1800 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1801 * unsigned int key_len)
# Expands the user key into the per-round key schedule inside ctx using
# AESKEYGENASSIST + the _key_expansion_* helpers above, dispatching on
# key_len (256/192/128-bit paths in that order below).  The decryption
# schedule is then derived by mirroring the encryption round keys
# (KEYP<->TKEYP swap loop at the end).
# NOTE(review): the movl N(%esp) argument loads are the 32-bit (i386)
# entry path - this file was ported from x86_64 to x86 (see header);
# the 64-bit path is elided from this excerpt.
1803 ENTRY(aesni_set_key)
1806 movl 8(%esp), KEYP # ctx
1807 movl 12(%esp), UKEYP # in_key
1808 movl 16(%esp), %edx # key_len
1810 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1811 movaps %xmm0, (KEYP)
1812 lea 0x10(KEYP), TKEYP # key addr
1813 movl %edx, 480(KEYP)
1814 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# AES-256: 16 more user-key bytes, then rounds alternate 256a/256b
1818 movups 0x10(UKEYP), %xmm2 # other user key
1819 movaps %xmm2, (TKEYP)
1821 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1822 call _key_expansion_256a
1823 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1824 call _key_expansion_256b
1825 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1826 call _key_expansion_256a
1827 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1828 call _key_expansion_256b
1829 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1830 call _key_expansion_256a
1831 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1832 call _key_expansion_256b
1833 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1834 call _key_expansion_256a
1835 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1836 call _key_expansion_256b
1837 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1838 call _key_expansion_256a
1839 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1840 call _key_expansion_256b
1841 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1842 call _key_expansion_256a
1843 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1844 call _key_expansion_256b
1845 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1846 call _key_expansion_256a
# AES-192: 8 more user-key bytes, rounds alternate 192a/192b
1849 movq 0x10(UKEYP), %xmm2 # other user key
1850 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1851 call _key_expansion_192a
1852 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1853 call _key_expansion_192b
1854 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1855 call _key_expansion_192a
1856 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1857 call _key_expansion_192b
1858 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1859 call _key_expansion_192a
1860 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1861 call _key_expansion_192b
1862 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1863 call _key_expansion_192a
1864 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1865 call _key_expansion_192b
# AES-128: 10 rounds with the standard rcon sequence 01..36
1868 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1869 call _key_expansion_128
1870 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1871 call _key_expansion_128
1872 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1873 call _key_expansion_128
1874 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1875 call _key_expansion_128
1876 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1877 call _key_expansion_128
1878 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1879 call _key_expansion_128
1880 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1881 call _key_expansion_128
1882 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1883 call _key_expansion_128
1884 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1885 call _key_expansion_128
1886 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1887 call _key_expansion_128
# Build the decryption key schedule by swapping first/last round keys
# and mirroring the rest (loop body partially elided in this excerpt)
1890 movaps (KEYP), %xmm0
1891 movaps (TKEYP), %xmm1
1892 movaps %xmm0, 240(TKEYP)
1893 movaps %xmm1, 240(KEYP)
1895 lea 240-16(TKEYP), UKEYP
1898 movaps (KEYP), %xmm0
1900 movaps %xmm1, (UKEYP)
1910 ENDPROC(aesni_set_key)
1913 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# Single-block encrypt: load key length and input state, run one block
# through the cipher, store the result.  (ENTRY/ENDPROC and the call
# into _aesni_enc1 are elided from this excerpt.)
1923 movl 480(KEYP), KLEN # key length
1924 movups (INP), STATE # input
1926 movups STATE, (OUTP) # output
1935 * _aesni_enc1: internal ABI
1936 * Encrypt one AES block: round-0 key whitening, then one AESENC per
1937 * KEYP: key struct pointer
1938 * round via TKEYP-relative round-key loads, AESENCLAST for the final
1939 * STATE: initial state (input)
1940 * round.  KLEN selects where the round sequence starts (256/192/128).
1941 * STATE: final state (output)
1948 movaps (KEYP), KEY # key
1950 pxor KEY, STATE # round 0
# 256/192-bit keys enter earlier in the round sequence (extra rounds)
1954 lea 0x20(TKEYP), TKEYP
1957 movaps -0x60(TKEYP), KEY
1959 movaps -0x50(TKEYP), KEY
1963 movaps -0x40(TKEYP), KEY
1965 movaps -0x30(TKEYP), KEY
1969 movaps -0x20(TKEYP), KEY
1971 movaps -0x10(TKEYP), KEY
1975 movaps 0x10(TKEYP), KEY
1977 movaps 0x20(TKEYP), KEY
1979 movaps 0x30(TKEYP), KEY
1981 movaps 0x40(TKEYP), KEY
1983 movaps 0x50(TKEYP), KEY
1985 movaps 0x60(TKEYP), KEY
1987 movaps 0x70(TKEYP), KEY
1988 AESENCLAST KEY STATE
1990 ENDPROC(_aesni_enc1)
1993 * _aesni_enc4: internal ABI
1994 * Encrypt four AES blocks in parallel (STATE1..STATE4) to hide the
1995 * KEYP: key struct pointer
1996 * AESENC latency; round structure mirrors _aesni_enc1.
1997 * STATE1: initial state (input)
2002 * STATE1: final state (output)
2012 movaps (KEYP), KEY # key
2014 pxor KEY, STATE1 # round 0
# 256/192-bit keys enter earlier in the round sequence (extra rounds)
2021 lea 0x20(TKEYP), TKEYP
2024 movaps -0x60(TKEYP), KEY
2029 movaps -0x50(TKEYP), KEY
2036 movaps -0x40(TKEYP), KEY
2041 movaps -0x30(TKEYP), KEY
2048 movaps -0x20(TKEYP), KEY
2053 movaps -0x10(TKEYP), KEY
2063 movaps 0x10(TKEYP), KEY
2068 movaps 0x20(TKEYP), KEY
2073 movaps 0x30(TKEYP), KEY
2078 movaps 0x40(TKEYP), KEY
2083 movaps 0x50(TKEYP), KEY
2088 movaps 0x60(TKEYP), KEY
2093 movaps 0x70(TKEYP), KEY
2094 AESENCLAST KEY STATE1 # last round
2095 AESENCLAST KEY STATE2
2096 AESENCLAST KEY STATE3
2097 AESENCLAST KEY STATE4
2099 ENDPROC(_aesni_enc4)
2102 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# Single-block decrypt counterpart of aesni_enc.  (ENTRY/ENDPROC and
# the call into _aesni_dec1 are elided from this excerpt.)
2112 mov 480(KEYP), KLEN # key length
2114 movups (INP), STATE # input
2116 movups STATE, (OUTP) #output
2125 * _aesni_dec1: internal ABI
2126 * Decrypt one AES block: round-0 key whitening, then AESDEC rounds
2127 * KEYP: key struct pointer
2128 * via TKEYP-relative round-key loads, AESDECLAST for the final round.
2129 * STATE: initial state (input)
2131 * STATE: final state (output)
2138 movaps (KEYP), KEY # key
2140 pxor KEY, STATE # round 0
# 256/192-bit keys enter earlier in the round sequence (extra rounds)
2144 lea 0x20(TKEYP), TKEYP
2147 movaps -0x60(TKEYP), KEY
2149 movaps -0x50(TKEYP), KEY
2153 movaps -0x40(TKEYP), KEY
2155 movaps -0x30(TKEYP), KEY
2159 movaps -0x20(TKEYP), KEY
2161 movaps -0x10(TKEYP), KEY
2165 movaps 0x10(TKEYP), KEY
2167 movaps 0x20(TKEYP), KEY
2169 movaps 0x30(TKEYP), KEY
2171 movaps 0x40(TKEYP), KEY
2173 movaps 0x50(TKEYP), KEY
2175 movaps 0x60(TKEYP), KEY
2177 movaps 0x70(TKEYP), KEY
2178 AESDECLAST KEY STATE
2180 ENDPROC(_aesni_dec1)
2183 * _aesni_dec4: internal ABI
2184 * Decrypt four AES blocks in parallel (STATE1..STATE4); round
2185 * KEYP: key struct pointer
2186 * structure mirrors _aesni_dec1.
2187 * STATE1: initial state (input)
2192 * STATE1: final state (output)
2202 movaps (KEYP), KEY # key
2204 pxor KEY, STATE1 # round 0
# 256/192-bit keys enter earlier in the round sequence (extra rounds)
2211 lea 0x20(TKEYP), TKEYP
2214 movaps -0x60(TKEYP), KEY
2219 movaps -0x50(TKEYP), KEY
2226 movaps -0x40(TKEYP), KEY
2231 movaps -0x30(TKEYP), KEY
2238 movaps -0x20(TKEYP), KEY
2243 movaps -0x10(TKEYP), KEY
2253 movaps 0x10(TKEYP), KEY
2258 movaps 0x20(TKEYP), KEY
2263 movaps 0x30(TKEYP), KEY
2268 movaps 0x40(TKEYP), KEY
2273 movaps 0x50(TKEYP), KEY
2278 movaps 0x60(TKEYP), KEY
2283 movaps 0x70(TKEYP), KEY
2284 AESDECLAST KEY STATE1 # last round
2285 AESDECLAST KEY STATE2
2286 AESDECLAST KEY STATE3
2287 AESDECLAST KEY STATE4
2289 ENDPROC(_aesni_dec4)
2292 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# ECB encrypt: bulk path processes 4 blocks per iteration via
# _aesni_enc4 (loads/stores below), with a 1-block tail loop.
# (Loop branches and calls are elided from this excerpt.)
2295 ENTRY(aesni_ecb_enc)
2305 test LEN, LEN # check length
2314 movups (INP), STATE1
2315 movups 0x10(INP), STATE2
2316 movups 0x20(INP), STATE3
2317 movups 0x30(INP), STATE4
2319 movups STATE1, (OUTP)
2320 movups STATE2, 0x10(OUTP)
2321 movups STATE3, 0x20(OUTP)
2322 movups STATE4, 0x30(OUTP)
2332 movups (INP), STATE1
2334 movups STATE1, (OUTP)
2347 ENDPROC(aesni_ecb_enc)
2350 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# ECB decrypt: same 4-block bulk / 1-block tail structure as
# aesni_ecb_enc, using the decryption helpers.
# (Loop branches and calls are elided from this excerpt.)
2353 ENTRY(aesni_ecb_dec)
2373 movups (INP), STATE1
2374 movups 0x10(INP), STATE2
2375 movups 0x20(INP), STATE3
2376 movups 0x30(INP), STATE4
2378 movups STATE1, (OUTP)
2379 movups STATE2, 0x10(OUTP)
2380 movups STATE3, 0x20(OUTP)
2381 movups STATE4, 0x30(OUTP)
2391 movups (INP), STATE1
2393 movups STATE1, (OUTP)
2406 ENDPROC(aesni_ecb_dec)
2409 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2410 * size_t len, u8 *iv)
# CBC encrypt: inherently serial - IV seeds STATE, each ciphertext
# block becomes the chaining value for the next.
# (Loop body is partially elided from this excerpt.)
2412 ENTRY(aesni_cbc_enc)
2427 movups (IVP), STATE # load iv as initial state
2430 movups (INP), IN # load input
2433 movups STATE, (OUTP) # store output
2448 ENDPROC(aesni_cbc_enc)
2451 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2452 * size_t len, u8 *iv)
# CBC decrypt: can run 4 blocks in parallel since chaining only XORs
# the previous ciphertext after decryption; 1-block tail path below.
# (Loop branches and the chaining XORs are elided from this excerpt.)
2454 ENTRY(aesni_cbc_dec)
2467 jb .Lcbc_dec_just_ret
2477 movups 0x10(INP), IN2
2480 movups 0x20(INP), IN3
2482 movups 0x30(INP), IN4
2485 movups 0x20(INP), IN1
2487 movups 0x30(INP), IN2
2502 movups 0x10(INP), IN2
2505 movups STATE1, (OUTP)
2506 movups STATE2, 0x10(OUTP)
2507 movups STATE3, 0x20(OUTP)
2508 movups STATE4, 0x30(OUTP)
2522 movups STATE, (OUTP)
2539 ENDPROC(aesni_cbc_dec)
# Byte-reversal table (15..0) used as the PSHUFB endianness-swap mask
# for the CTR counter; the .Lbswap_mask label is elided in this excerpt.
2544 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2547 * _aesni_inc_init: internal ABI
2548 * setup registers used by _aesni_inc
2549 * Loads the bswap mask, converts the big-endian IV into CTR, and
2550 * materialises the constant 1 in INC for counter increments.
2552 * CTR: == IV, in little endian
2553 * TCTR_LOW: == lower qword of CTR
2554 * INC: == 1, in little endian
2555 * BSWAP_MASK == endian swapping mask
2559 movaps .Lbswap_mask, BSWAP_MASK
2561 PSHUFB_XMM BSWAP_MASK CTR
2563 MOVQ_R64_XMM TCTR_LOW INC
2564 MOVQ_R64_XMM CTR TCTR_LOW
2566 ENDPROC(_aesni_inc_init)
2569 * _aesni_inc: internal ABI
2570 * Increase IV by 1, IV is in big endian
2571 * (Increment instructions are elided from this excerpt; only the
2572 * final byte-swap back to big-endian IV is shown.)
2573 * CTR: == IV, in little endian
2574 * TCTR_LOW: == lower qword of CTR
2575 * INC: == 1, in little endian
2576 * BSWAP_MASK == endian swapping mask
2580 * CTR: == output IV, in little endian
2581 * TCTR_LOW: == lower qword of CTR
2593 PSHUFB_XMM BSWAP_MASK IV
2598 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2599 * size_t len, u8 *iv)
# CTR mode: encrypt successive counter blocks (via _aesni_inc) and XOR
# them with the input; 4-blocks-at-a-time bulk path with 1-block tail.
# (Counter increments, XORs and loop branches are elided from this
# excerpt.)
2601 ENTRY(aesni_ctr_enc)
2603 jb .Lctr_enc_just_ret
2606 call _aesni_inc_init
2616 movups 0x10(INP), IN2
2619 movups 0x20(INP), IN3
2622 movups 0x30(INP), IN4
2625 movups STATE1, (OUTP)
2627 movups STATE2, 0x10(OUTP)
2629 movups STATE3, 0x20(OUTP)
2631 movups STATE4, 0x30(OUTP)
2646 movups STATE, (OUTP)
2656 ENDPROC(aesni_ctr_enc)
2659 * _aesni_gf128mul_x_ble: internal ABI
2660 * Multiply in GF(2^128) for XTS IVs
2661 * Multiplies the tweak IV by x (little-endian block order), reducing
2662 * with GF128MUL_MASK (0x87/0x01 pattern, cf. .Lgf128mul_x_ble_mask).
2663 * GF128MUL_MASK == mask with 0x87 and 0x01
2667 * CTR: == temporary value
2669 #define _aesni_gf128mul_x_ble() \
2670 pshufd $0x13, IV, CTR; \
2673 pand GF128MUL_MASK, CTR; \
2677 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
# XTS: process 8 blocks - compute tweaks with _aesni_gf128mul_x_ble(),
# stash each tweak at the output location, encrypt/decrypt (enc4/dec4
# helper chosen via the leaq pair below), then XOR the stashed tweaks
# back into the results (XOR instructions elided from this excerpt).
2680 ENTRY(aesni_xts_crypt8)
2684 leaq _aesni_enc4, %r11
2685 leaq _aesni_dec4, %rax
2689 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2696 movdqu 0x00(INP), INC
2698 movdqu IV, 0x00(OUTP)
2700 _aesni_gf128mul_x_ble()
2702 movdqu 0x10(INP), INC
2704 movdqu IV, 0x10(OUTP)
2706 _aesni_gf128mul_x_ble()
2708 movdqu 0x20(INP), INC
2710 movdqu IV, 0x20(OUTP)
2712 _aesni_gf128mul_x_ble()
2714 movdqu 0x30(INP), INC
2716 movdqu IV, 0x30(OUTP)
2720 movdqu 0x00(OUTP), INC
2722 movdqu STATE1, 0x00(OUTP)
2724 _aesni_gf128mul_x_ble()
2726 movdqu 0x40(INP), INC
2728 movdqu IV, 0x40(OUTP)
2730 movdqu 0x10(OUTP), INC
2732 movdqu STATE2, 0x10(OUTP)
2734 _aesni_gf128mul_x_ble()
2736 movdqu 0x50(INP), INC
2738 movdqu IV, 0x50(OUTP)
2740 movdqu 0x20(OUTP), INC
2742 movdqu STATE3, 0x20(OUTP)
2744 _aesni_gf128mul_x_ble()
2746 movdqu 0x60(INP), INC
2748 movdqu IV, 0x60(OUTP)
2750 movdqu 0x30(OUTP), INC
2752 movdqu STATE4, 0x30(OUTP)
2754 _aesni_gf128mul_x_ble()
2756 movdqu 0x70(INP), INC
2758 movdqu IV, 0x70(OUTP)
2760 _aesni_gf128mul_x_ble()
2765 movdqu 0x40(OUTP), INC
2767 movdqu STATE1, 0x40(OUTP)
2769 movdqu 0x50(OUTP), INC
2771 movdqu STATE2, 0x50(OUTP)
2773 movdqu 0x60(OUTP), INC
2775 movdqu STATE3, 0x60(OUTP)
2777 movdqu 0x70(OUTP), INC
2779 movdqu STATE4, 0x70(OUTP)
2782 ENDPROC(aesni_xts_crypt8)