* Implement AES algorithm in Intel AES-NI instructions.
*
* The white paper of AES-NI instructions can be downloaded from:
* http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
*
* Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
* interface for 64-bit kernels.
* Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
* Aidan O'Mahony (aidan.o.mahony@intel.com)
* Adrian Hoban <adrian.hoban@intel.com>
* James Guilford (james.guilford@intel.com)
* Gabriele Paoloni <gabriele.paoloni@intel.com>
* Tadeusz Struk (tadeusz.struk@intel.com)
* Wajdi Feghali (wajdi.k.feghali@intel.com)
* Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
* The following macros are used to move an (un)aligned 16 byte value to/from
* an XMM register. This can be done for either FP or integer values: for FP,
* use movaps (move aligned packed single); for integer, use movdqa (move
* double quad aligned). It makes no performance difference which instruction
* is used on Nehalem (the original Core i7) and later. However, movaps is a
* byte shorter, so that is the one we'll use for now (same for the unaligned
* variants).
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F
SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
MASK1: .octa 0x0000000000000000ffffffffffffffff
MASK2: .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
ZERO: .octa 0x00000000000000000000000000000000
ONE: .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
#define STACK_OFFSET 8*3
#define HashKey 16*0 // store HashKey <<1 mod poly here
#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
 // bits of HashKey <<1 mod poly here
 // (for Karatsuba purposes)
#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
 // bits of HashKey^2 <<1 mod poly here
 // (for Karatsuba purposes)
#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
 // bits of HashKey^3 <<1 mod poly here
 // (for Karatsuba purposes)
#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
 // bits of HashKey^4 <<1 mod poly here
 // (for Karatsuba purposes)
#define VARIABLE_OFFSET 16*8

#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)

#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm10
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
* Input: A and B (128 bits each, bit-reflected)
* Output: C = A*B*x mod poly (i.e. >>1)
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
* GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
pshufd $78, \GH, \TMP2
pshufd $78, \HK, \TMP3
pxor \GH, \TMP2 # TMP2 = a1+a0
pxor \HK, \TMP3 # TMP3 = b1+b0
PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle term)
pslldq $8, \TMP3 # left shift TMP3 2 DWs
psrldq $8, \TMP2 # right shift TMP2 2 DWs
pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
# first phase of the reduction
movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
# in order to perform three shifts independently
pslld $31, \TMP2 # packed left shift <<31
pslld $30, \TMP3 # packed left shift <<30
pslld $25, \TMP4 # packed left shift <<25
pxor \TMP3, \TMP2 # xor the shifted versions
psrldq $4, \TMP5 # right shift TMP5 1 DW
pslldq $12, \TMP2 # left shift TMP2 3 DWs
# second phase of the reduction
movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
# in order to perform three shifts independently
psrld $1,\TMP2 # packed right shift >>1
psrld $2,\TMP3 # packed right shift >>2
psrld $7,\TMP4 # packed right shift >>7
pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \GH # result is in GH
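/*
* For reference, the Karatsuba trick used above, in C (editor's sketch, not
* kernel code): one 128x128-bit carry-less multiply is built from three
* 64x64 PCLMULQDQ-style products, with XOR standing in for addition.
*
*	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
*	{
*		uint64_t h = 0, l = 0;
*		int i;
*
*		for (i = 0; i < 64; i++) {
*			if ((b >> i) & 1) {
*				l ^= a << i;
*				if (i)
*					h ^= a >> (64 - i);
*			}
*		}
*		*hi = h;
*		*lo = l;
*	}
*
*	// r[3]:r[2]:r[1]:r[0] = a * b (carry-less, before reduction)
*	static void clmul128(const uint64_t a[2], const uint64_t b[2],
*			     uint64_t r[4])
*	{
*		uint64_t hh, hl, lh, ll, mh, ml;
*
*		clmul64(a[1], b[1], &hh, &hl);		// a1*b1
*		clmul64(a[0], b[0], &lh, &ll);		// a0*b0
*		clmul64(a[1] ^ a[0], b[1] ^ b[0], &mh, &ml);
*		mh ^= hh ^ lh;				// middle term:
*		ml ^= hl ^ ll;				// a1*b0 + a0*b1
*		r[0] = ll;
*		r[1] = lh ^ ml;				// fold middle in
*		r[2] = hl ^ mh;				// at bit 64
*		r[3] = hh;
*	}
*/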
* if a = number of total plaintext bytes
* b = floor(a/16) (the number of complete 16 byte blocks)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r12 # %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
jne _get_AAD_loop\num_initial_blocks\operation
je _get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
xor %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
MOVADQ ONE(%rip),\TMP1
paddd \TMP1, \XMM0 # INCR Y0
movdqa \XMM0, %xmm\index
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
pxor \TMP2, %xmm\index
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
aes_loop_initial_dec\num_initial_blocks:
AESENC \TMP1, %xmm\index
jnz aes_loop_initial_dec\num_initial_blocks
AESENCLAST \TMP1, %xmm\index # Last Round
movdqu (%arg3 , %r11, 1), \TMP1
pxor \TMP1, %xmm\index
movdqu %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
movdqa \TMP1, %xmm\index
PSHUFB_XMM %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks
GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
jl _initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
MOVADQ ONE(%rip), \TMP1
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
MOVADQ 0(%arg1),\TMP1
pshufd $78, \TMP3, \TMP1
movdqa \TMP1, HashKey_k(%rsp)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa \TMP5, HashKey_3(%rsp)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
movdqa \TMP5, HashKey_4(%rsp)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
jz aes_loop_pre_dec_done\num_initial_blocks
aes_loop_pre_dec\num_initial_blocks:
AESENC \TMP2, %xmm\index
jnz aes_loop_pre_dec\num_initial_blocks
aes_loop_pre_dec_done\num_initial_blocks:
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
AESENCLAST \TMP2, \XMM4
movdqu 16*0(%arg3 , %r11 , 1), \TMP1
movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
movdqu 16*1(%arg3 , %r11 , 1), \TMP1
movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
movdqu 16*2(%arg3 , %r11 , 1), \TMP1
movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
movdqu 16*3(%arg3 , %r11 , 1), \TMP1
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
# combine GHASHed value with the corresponding ciphertext
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
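/*
* The dispatch contract this macro is instantiated for, as a C sketch
* (editor's illustration only; names are hypothetical):
*
*	full_blocks = plaintext_len / 16;
*	num_initial_blocks = full_blocks % 4;	// 0, 1, 2 or 3
*	// num_initial_blocks are handled here; the rest go through the
*	// 4-blocks-at-a-time GHASH_4_ENCRYPT_4_PARALLEL loop
*/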
* if a = number of total plaintext bytes
* b = floor(a/16) (the number of complete 16 byte blocks)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r12 # %r12 = aadLen
_get_AAD_loop\num_initial_blocks\operation:
jne _get_AAD_loop\num_initial_blocks\operation
je _get_AAD_loop2_done\num_initial_blocks\operation
_get_AAD_loop2\num_initial_blocks\operation:
jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
xor %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
MOVADQ ONE(%rip),\TMP1
MOVADQ 0(%arg1),\TMP2
paddd \TMP1, \XMM0 # INCR Y0
MOVADQ \XMM0, %xmm\index
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
pxor \TMP2, %xmm\index
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
aes_loop_initial_enc\num_initial_blocks:
AESENC \TMP1, %xmm\index
jnz aes_loop_initial_enc\num_initial_blocks
AESENCLAST \TMP1, %xmm\index # Last Round
movdqu (%arg3 , %r11, 1), \TMP1
pxor \TMP1, %xmm\index
movdqu %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
PSHUFB_XMM %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks
GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
jl _initial_blocks_done\num_initial_blocks\operation
# no need for precomputed values
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of HashKey_i
MOVADQ ONE(%rip),\TMP1
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
paddd \TMP1, \XMM0 # INCR Y0
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
MOVADQ 0(%arg1),\TMP1
pshufd $78, \TMP3, \TMP1
movdqa \TMP1, HashKey_k(%rsp)
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
movdqa \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
movaps 0x10*\index(%arg1), \TMP1
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
movdqa \TMP5, HashKey_3(%rsp)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
movaps 0x10*\index(%arg1), \TMP1
GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
movdqa \TMP5, HashKey_4(%rsp)
pshufd $78, \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
jz aes_loop_pre_enc_done\num_initial_blocks
aes_loop_pre_enc\num_initial_blocks:
AESENC \TMP2, %xmm\index
jnz aes_loop_pre_enc\num_initial_blocks
aes_loop_pre_enc_done\num_initial_blocks:
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
AESENCLAST \TMP2, \XMM4
movdqu 16*0(%arg3 , %r11 , 1), \TMP1
movdqu 16*1(%arg3 , %r11 , 1), \TMP1
movdqu 16*2(%arg3 , %r11 , 1), \TMP1
movdqu 16*3(%arg3 , %r11 , 1), \TMP1
movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
# combine GHASHed value with the corresponding ciphertext
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
pshufd $78, \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
paddd ONE(%rip), \XMM0 # INCR CNT
paddd ONE(%rip), \XMM0 # INCR CNT
paddd ONE(%rip), \XMM0 # INCR CNT
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
movdqa HashKey_4_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1
movaps 0x20(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 2
pshufd $78, \XMM6, \TMP2
movdqa HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3
PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 4
movdqa HashKey_3_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pshufd $78, \XMM7, \TMP2
movdqa HashKey_2(%rsp), \TMP5
# multiply TMP5 * HashKey using Karatsuba
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 6
PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 7
movdqa HashKey_2_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands
pshufd $78, \XMM8, \TMP2
movdqa HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
jz aes_loop_par_enc_done
AESENC \TMP3, %xmm\index
aes_loop_par_enc_done:
AESENCLAST \TMP3, \XMM1 # Round 10
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
movdqa HashKey_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu 16(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
movdqu 32(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
movdqu 48(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
pslldq $8, \TMP3 # left shift TMP3 2 DWs
psrldq $8, \TMP2 # right shift TMP2 2 DWs
pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
# first phase of the reduction
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
pxor \TMP3, \TMP2 # xor the shifted versions
psrldq $4, \TMP5 # right shift TMP5 1 DW
pslldq $12, \TMP2 # left shift TMP2 3 DWs
# second phase of the reduction
movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \XMM5 # result is in XMM5
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
pshufd $78, \XMM5, \TMP6
paddd ONE(%rip), \XMM0 # INCR CNT
movdqa HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
paddd ONE(%rip), \XMM0 # INCR CNT
paddd ONE(%rip), \XMM0 # INCR CNT
paddd ONE(%rip), \XMM0 # INCR CNT
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
movdqa HashKey_4_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
movaps 0x10(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 1
movaps 0x20(%arg1), \TMP1
AESENC \TMP1, \XMM1 # Round 2
pshufd $78, \XMM6, \TMP2
movdqa HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
movaps 0x30(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 3
PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
movaps 0x40(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 4
movdqa HashKey_3_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x50(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 5
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
pshufd $78, \XMM7, \TMP2
movdqa HashKey_2(%rsp), \TMP5
# multiply TMP5 * HashKey using Karatsuba
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x60(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 6
PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
movaps 0x70(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 7
movdqa HashKey_2_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movaps 0x80(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 8
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# multiply XMM8 * HashKey
# XMM8 and TMP5 hold the values for the two operands
pshufd $78, \XMM8, \TMP2
movdqa HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
movaps 0x90(%arg1), \TMP3
AESENC \TMP3, \XMM1 # Round 9
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
jz aes_loop_par_dec_done
AESENC \TMP3, %xmm\index
jnz aes_loop_par_dec
aes_loop_par_dec_done:
MOVADQ (%r10), \TMP3
AESENCLAST \TMP3, \XMM1 # last round
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
movdqa HashKey_k(%rsp), \TMP5
PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqu (%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
movdqu 16(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
movdqu 32(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
movdqu 48(%arg3,%r11,1), \TMP3
pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
pslldq $8, \TMP3 # left shift TMP3 2 DWs
psrldq $8, \TMP2 # right shift TMP2 2 DWs
pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
# first phase of the reduction
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
pxor \TMP3, \TMP2 # xor the shifted versions
psrldq $4, \TMP5 # right shift TMP5 1 DW
pslldq $12, \TMP2 # left shift TMP2 3 DWs
# second phase of the reduction
movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \XMM5 # result is in XMM5
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# multiply XMM1 * HashKey^4 (using Karatsuba)
pshufd $78, \XMM1, \TMP2
movdqa HashKey_4(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
movdqa HashKey_4_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
movdqa \XMM1, \XMMDst
movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
# multiply XMM2 * HashKey^3 (using Karatsuba)
pshufd $78, \XMM2, \TMP2
movdqa HashKey_3(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
movdqa HashKey_3_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
# results accumulated in TMP6, XMMDst, XMM1
# multiply XMM3 * HashKey^2 (using Karatsuba)
pshufd $78, \XMM3, \TMP2
movdqa HashKey_2(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
movdqa HashKey_2_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
# multiply XMM4 * HashKey (using Karatsuba)
pshufd $78, \XMM4, \TMP2
movdqa HashKey(%rsp), \TMP5
PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
movdqa HashKey_k(%rsp), \TMP4
PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
# middle section of the temp results combined as in the Karatsuba algorithm
pslldq $8, \TMP4 # left shift TMP4 2 DWs
psrldq $8, \TMP2 # right shift TMP2 2 DWs
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
# first phase of the reduction
movdqa \XMMDst, \TMP2
movdqa \XMMDst, \TMP3
movdqa \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
pxor \TMP3, \TMP2 # xor the shifted versions
psrldq $4, \TMP7 # right shift TMP7 1 DW
pslldq $12, \TMP2 # left shift TMP2 3 DWs
# second phase of the reduction
movdqa \XMMDst, \TMP2
# make 3 copies of XMMDst for doing 3 shift operations
movdqa \XMMDst, \TMP3
movdqa \XMMDst, \TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
pxor \TMP3, \TMP2 # xor the shifted versions
pxor \TMP6, \XMMDst # reduced result is in XMMDst
/* Encryption of a single block
*/
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
lea 16(%arg1), %r10 # get first expanded key address
AESENCLAST \TMP1,\XMM0
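/*
* The round-count arithmetic above, in C (editor's sketch; key_length is
* the byte length that aesni_set_key() stores at offset 480 of the key
* schedule, i.e. the keysize define):
*
*	// 16 -> 9, 24 -> 11, 32 -> 13 AESENC rounds; the final round is
*	// always performed separately with AESENCLAST
*	static int aesenc_rounds(unsigned int key_length)
*	{
*		return (key_length >> 2) + 5;
*	}
*/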
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES key schedule. Starts on a 16 byte boundary.
* u8 *out, // Plaintext output. Decrypt in-place is allowed.
* const u8 *in, // Ciphertext input
* u64 plaintext_len, // Length of data in bytes for decryption.
* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
* // concatenated with 0x00000001. 16-byte aligned pointer.
* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, // Additional Authentication Data (AAD)
* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
* u8 *auth_tag, // Authentication Tag output. The driver will compare this to the
* // given authentication tag and only return the plaintext if they match.
* u64 auth_tag_len); // Authentication Tag Length in bytes. Valid values are 16
* // (most likely), 12 or 8.
*
* Keys are pre-expanded and aligned to 16 bytes. We are using the first
* set of 11 keys in the data structure void *aes_ctx.
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Salt (From the SA) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Initialization Vector |
* | (This is the sequence number from IPSec header) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD is padded to 128 bits with 0.
* For example, assume AAD is a u32 vector:
*
* if AAD is 8 bytes:
* AAD[3] = {A0, A1};
* padded AAD in xmm register = {A1 A0 0 0}
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 32-bit Sequence Number (A0) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 32-bit Sequence Number
*
* if AAD is 12 bytes:
* AAD[3] = {A0, A1, A2};
* padded AAD in xmm register = {A2 A1 A0 0}
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 64-bit Extended Sequence Number {A1,A0} |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 64-bit Extended Sequence Number
*
* From the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports 16; for other sizes, the code will fail.
*
* From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
* For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
movdqu (%r12), %xmm13 # %xmm13 = HashKey
movdqa SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
movdqa %xmm13, %xmm2
pshufd $0x24, %xmm1, %xmm2
pcmpeqd TWOONE(%rip), %xmm2
pand POLY(%rip), %xmm2
pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
# Decrypt first few blocks
movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
jz _initial_num_blocks_is_0_decrypt
jb _initial_num_blocks_is_1_decrypt
je _initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
jmp _initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
jmp _initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
jmp _initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
je _zero_cipher_left_decrypt
je _four_cipher_left_decrypt
GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
_four_cipher_left_decrypt:
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
and $15, %r13 # %r13 = arg4 (mod 16)
je _multiple_of_16_bytes_decrypt
# Handle the last <16 byte block separately
paddd ONE(%rip), %xmm0 # increment CNT to get Yn
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
lea SHIFT_MASK+16(%rip), %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm2
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
MOVQ_R64_XMM %xmm0, %rax
jle _less_than_8_bytes_left_decrypt
mov %rax, (%arg2 , %r11, 1)
MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_decrypt:
mov %al, (%arg2, %r11, 1)
jne _less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
MOVQ_R64_XMM %arg4, %xmm1
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
mov arg9, %r10 # %r10 = authTag
mov arg10, %r11 # %r11 = auth_tag_len
MOVQ_R64_XMM %xmm0, %rax
jmp _return_T_done_decrypt
MOVQ_R64_XMM %xmm0, %rax
jmp _return_T_done_decrypt
movdqu %xmm0, (%r10)
_return_T_done_decrypt:
ENDPROC(aesni_gcm_dec)
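/*
* Usage sketch in C (editor's illustration only; the real caller is the
* aesni-intel glue code, which wraps this in kernel_fpu_begin()/
* kernel_fpu_end(), and a production tag check must be constant-time,
* e.g. crypto_memneq()):
*
*	u8 computed_tag[16];
*
*	aesni_gcm_dec(aes_ctx, plaintext_out, ciphertext_in, len,
*		      iv_j0,		// salt || IV || 0x00000001
*		      hash_subkey,	// H = E(K, 0^128)
*		      aad, aad_len,	// 8 or 12 bytes for RFC4106
*		      computed_tag, sizeof(computed_tag));
*	if (memcmp(computed_tag, received_tag, 16) != 0)
*		return -EBADMSG;	// authentication failed
*/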
/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx, // AES key schedule. Starts on a 16 byte boundary.
* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
* const u8 *in, // Plaintext input
* u64 plaintext_len, // Length of data in bytes for encryption.
* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
* // concatenated with 0x00000001. 16-byte aligned pointer.
* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
* const u8 *aad, // Additional Authentication Data (AAD)
* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
* u8 *auth_tag, // Authentication Tag output.
* u64 auth_tag_len); // Authentication Tag Length in bytes. Valid values are 16 (most likely),
* // 12 or 8.
*
* Keys are pre-expanded and aligned to 16 bytes. We are using the
* first set of 11 keys in the data structure void *aes_ctx.
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Salt (From the SA) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | Initialization Vector |
* | (This is the sequence number from IPSec header) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD is padded to 128 bits with 0.
* For example, assume AAD is a u32 vector:
*
* if AAD is 8 bytes:
* AAD[3] = {A0, A1};
* padded AAD in xmm register = {A1 A0 0 0}
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 32-bit Sequence Number (A0) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 32-bit Sequence Number
*
* if AAD is 12 bytes:
* AAD[3] = {A0, A1, A2};
* padded AAD in xmm register = {A2 A1 A0 0}
*
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 64-bit Extended Sequence Number {A1,A0} |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD Format with 64-bit Extended Sequence Number
*
* From the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports 16; for other sizes, the code will fail.
*
* From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
* For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
sub $VARIABLE_OFFSET, %rsp
movdqu (%r12), %xmm13
movdqa SHUF_MASK(%rip), %xmm2
PSHUFB_XMM %xmm2, %xmm13
# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
movdqa %xmm13, %xmm2
pshufd $0x24, %xmm1, %xmm2
pcmpeqd TWOONE(%rip), %xmm2
pand POLY(%rip), %xmm2
movdqa %xmm13, HashKey(%rsp) # %xmm13 holds HashKey<<1 (mod poly)
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
# Encrypt first few blocks
jz _initial_num_blocks_is_0_encrypt
jb _initial_num_blocks_is_1_encrypt
je _initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
jmp _initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
jmp _initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
jmp _initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:
# Main loop - Encrypt remaining blocks
je _zero_cipher_left_encrypt
je _four_cipher_left_encrypt
_encrypt_by_4_encrypt:
GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
jne _encrypt_by_4_encrypt
_four_cipher_left_encrypt:
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
and $15, %r13 # %r13 = arg4 (mod 16)
je _multiple_of_16_bytes_encrypt
# Handle the last <16 byte block separately
paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
lea SHIFT_MASK+16(%rip), %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10,%xmm0
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0
# shuffle xmm0 back to output as ciphertext
MOVQ_R64_XMM %xmm0, %rax
jle _less_than_8_bytes_left_encrypt
mov %rax, (%arg2 , %r11, 1)
MOVQ_R64_XMM %xmm0, %rax
_less_than_8_bytes_left_encrypt:
mov %al, (%arg2, %r11, 1)
jne _less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
MOVQ_R64_XMM %arg4, %xmm1
pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
mov arg9, %r10 # %r10 = authTag
mov arg10, %r11 # %r11 = auth_tag_len
MOVQ_R64_XMM %xmm0, %rax
jmp _return_T_done_encrypt
MOVQ_R64_XMM %xmm0, %rax
jmp _return_T_done_encrypt
movdqu %xmm0, (%r10)
_return_T_done_encrypt:
ENDPROC(aesni_gcm_enc)
_key_expansion_256a:
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
shufps $0b10001100, %xmm0, %xmm4
movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
shufps $0b10001100, %xmm0, %xmm4
pshufd $0b11111111, %xmm0, %xmm3
shufps $0b01000100, %xmm0, %xmm6
movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
ENDPROC(_key_expansion_192a)

_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
shufps $0b10001100, %xmm0, %xmm4
pshufd $0b11111111, %xmm0, %xmm3
movaps %xmm0, (TKEYP)
ENDPROC(_key_expansion_192b)

_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
shufps $0b10001100, %xmm2, %xmm4
movaps %xmm2, (TKEYP)
ENDPROC(_key_expansion_256b)
* int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
* unsigned int key_len)
ENTRY(aesni_set_key)
movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
movl (FRAME_OFFSET+16)(%esp), %edx # key_len
movups (UKEYP), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (KEYP)
lea 0x10(KEYP), TKEYP # key addr
movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
movups 0x10(UKEYP), %xmm2 # other user key
movaps %xmm2, (TKEYP)
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_256a
AESKEYGENASSIST 0x2 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_256a
AESKEYGENASSIST 0x4 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_256a
AESKEYGENASSIST 0x8 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_256a
AESKEYGENASSIST 0x10 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_256a
AESKEYGENASSIST 0x20 %xmm0 %xmm1
call _key_expansion_256b
AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_256a
movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
call _key_expansion_192b
AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
call _key_expansion_192a
AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
call _key_expansion_192b
AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
call _key_expansion_192a
AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
call _key_expansion_192b
AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
call _key_expansion_192a
AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
call _key_expansion_192b
AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
call _key_expansion_128
AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
call _key_expansion_128
AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
call _key_expansion_128
AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
call _key_expansion_128
AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
call _key_expansion_128
AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
call _key_expansion_128
AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
call _key_expansion_128
AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
call _key_expansion_128
AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
call _key_expansion_128
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
movaps (KEYP), %xmm0
movaps (TKEYP), %xmm1
movaps %xmm0, 240(TKEYP)
movaps %xmm1, 240(KEYP)
lea 240-16(TKEYP), UKEYP
movaps (KEYP), %xmm0
movaps %xmm1, (UKEYP)
ENDPROC(aesni_set_key)
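/*
* Minimal C usage sketch for aesni_set_key() above and aesni_enc() below
* (editor's illustration; in the kernel these only run between
* kernel_fpu_begin() and kernel_fpu_end(), via the aesni-intel glue code):
*
*	struct crypto_aes_ctx ctx;	// 16 byte aligned in practice
*	u8 out[16];
*
*	aesni_set_key(&ctx, key, AES_KEYSIZE_128);	// expand 128-bit key
*	aesni_enc(&ctx, out, in);			// one 16 byte block
*/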
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
movl (FRAME_OFFSET+16)(%esp), OUTP # dst
movl (FRAME_OFFSET+20)(%esp), INP # src
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
movups STATE, (OUTP) # output
* _aesni_enc1: internal ABI
* KEYP: key struct pointer
* STATE: initial state (input)
* STATE: final state (output)
movaps (KEYP), KEY # key
pxor KEY, STATE # round 0
lea 0x20(TKEYP), TKEYP
movaps -0x60(TKEYP), KEY
movaps -0x50(TKEYP), KEY
movaps -0x40(TKEYP), KEY
movaps -0x30(TKEYP), KEY
movaps -0x20(TKEYP), KEY
movaps -0x10(TKEYP), KEY
movaps 0x10(TKEYP), KEY
movaps 0x20(TKEYP), KEY
movaps 0x30(TKEYP), KEY
movaps 0x40(TKEYP), KEY
movaps 0x50(TKEYP), KEY
movaps 0x60(TKEYP), KEY
movaps 0x70(TKEYP), KEY
AESENCLAST KEY STATE
ENDPROC(_aesni_enc1)
* _aesni_enc4: internal ABI
* KEYP: key struct pointer
* STATE1: initial state (input)
* STATE1: final state (output)
movaps (KEYP), KEY # key
pxor KEY, STATE1 # round 0
lea 0x20(TKEYP), TKEYP
movaps -0x60(TKEYP), KEY
movaps -0x50(TKEYP), KEY
movaps -0x40(TKEYP), KEY
movaps -0x30(TKEYP), KEY
movaps -0x20(TKEYP), KEY
movaps -0x10(TKEYP), KEY
movaps 0x10(TKEYP), KEY
movaps 0x20(TKEYP), KEY
movaps 0x30(TKEYP), KEY
movaps 0x40(TKEYP), KEY
movaps 0x50(TKEYP), KEY
movaps 0x60(TKEYP), KEY
movaps 0x70(TKEYP), KEY
AESENCLAST KEY STATE1 # last round
AESENCLAST KEY STATE2
AESENCLAST KEY STATE3
AESENCLAST KEY STATE4
ENDPROC(_aesni_enc4)
* void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
movl (FRAME_OFFSET+16)(%esp), OUTP # dst
movl (FRAME_OFFSET+20)(%esp), INP # src
mov 480(KEYP), KLEN # key length
movups (INP), STATE # input
movups STATE, (OUTP) # output
* _aesni_dec1: internal ABI
* KEYP: key struct pointer
* STATE: initial state (input)
* STATE: final state (output)
movaps (KEYP), KEY # key
pxor KEY, STATE # round 0
lea 0x20(TKEYP), TKEYP
movaps -0x60(TKEYP), KEY
movaps -0x50(TKEYP), KEY
movaps -0x40(TKEYP), KEY
movaps -0x30(TKEYP), KEY
movaps -0x20(TKEYP), KEY
movaps -0x10(TKEYP), KEY
movaps 0x10(TKEYP), KEY
movaps 0x20(TKEYP), KEY
movaps 0x30(TKEYP), KEY
movaps 0x40(TKEYP), KEY
movaps 0x50(TKEYP), KEY
movaps 0x60(TKEYP), KEY
movaps 0x70(TKEYP), KEY
AESDECLAST KEY STATE
ENDPROC(_aesni_dec1)
* _aesni_dec4: internal ABI
* KEYP: key struct pointer
* STATE1: initial state (input)
* STATE1: final state (output)
movaps (KEYP), KEY # key
pxor KEY, STATE1 # round 0
lea 0x20(TKEYP), TKEYP
movaps -0x60(TKEYP), KEY
movaps -0x50(TKEYP), KEY
movaps -0x40(TKEYP), KEY
movaps -0x30(TKEYP), KEY
movaps -0x20(TKEYP), KEY
movaps -0x10(TKEYP), KEY
movaps 0x10(TKEYP), KEY
movaps 0x20(TKEYP), KEY
movaps 0x30(TKEYP), KEY
movaps 0x40(TKEYP), KEY
movaps 0x50(TKEYP), KEY
movaps 0x60(TKEYP), KEY
movaps 0x70(TKEYP), KEY
AESDECLAST KEY STATE1 # last round
AESDECLAST KEY STATE2
AESDECLAST KEY STATE3
AESDECLAST KEY STATE4
ENDPROC(_aesni_dec4)
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len)
ENTRY(aesni_ecb_enc)
movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
movl (FRAME_OFFSET+20)(%esp), OUTP # dst
movl (FRAME_OFFSET+24)(%esp), INP # src
movl (FRAME_OFFSET+28)(%esp), LEN # len
test LEN, LEN # check length
movups (INP), STATE1
movups 0x10(INP), STATE2
movups 0x20(INP), STATE3
movups 0x30(INP), STATE4
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
movups (INP), STATE1
movups STATE1, (OUTP)
ENDPROC(aesni_ecb_enc)
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len)
ENTRY(aesni_ecb_dec)
movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
movl (FRAME_OFFSET+20)(%esp), OUTP # dst
movl (FRAME_OFFSET+24)(%esp), INP # src
movl (FRAME_OFFSET+28)(%esp), LEN # len
movups (INP), STATE1
movups 0x10(INP), STATE2
movups 0x20(INP), STATE3
movups 0x30(INP), STATE4
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
movups (INP), STATE1
movups STATE1, (OUTP)
ENDPROC(aesni_ecb_dec)
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
ENTRY(aesni_cbc_enc)
movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
movl (FRAME_OFFSET+24)(%esp), OUTP # dst
movl (FRAME_OFFSET+28)(%esp), INP # src
movl (FRAME_OFFSET+32)(%esp), LEN # len
movl (FRAME_OFFSET+36)(%esp), IVP # iv
movups (IVP), STATE # load iv as initial state
movups (INP), IN # load input
movups STATE, (OUTP) # store output
ENDPROC(aesni_cbc_enc)
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
ENTRY(aesni_cbc_dec)
movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
movl (FRAME_OFFSET+24)(%esp), OUTP # dst
movl (FRAME_OFFSET+28)(%esp), INP # src
movl (FRAME_OFFSET+32)(%esp), LEN # len
movl (FRAME_OFFSET+36)(%esp), IVP # iv
jb .Lcbc_dec_just_ret
movups 0x10(INP), IN2
movups 0x20(INP), IN3
movups 0x30(INP), IN4
movups 0x20(INP), IN1
movups 0x30(INP), IN2
movups 0x10(INP), IN2
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
movups STATE, (OUTP)
ENDPROC(aesni_cbc_dec)
.pushsection .rodata
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

* _aesni_inc_init: internal ABI
* setup registers used by _aesni_inc
* CTR: == IV, in little endian
* TCTR_LOW: == lower qword of CTR
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
movaps .Lbswap_mask, BSWAP_MASK
PSHUFB_XMM BSWAP_MASK CTR
MOVQ_R64_XMM TCTR_LOW INC
MOVQ_R64_XMM CTR TCTR_LOW
ENDPROC(_aesni_inc_init)
* _aesni_inc: internal ABI
* Increase IV by 1, IV is in big endian
* CTR: == IV, in little endian
* TCTR_LOW: == lower qword of CTR
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
* output:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
PSHUFB_XMM BSWAP_MASK IV
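/*
* What _aesni_inc computes, in C (editor's sketch): the IV is kept
* byte-swapped (little endian) in CTR so the low qword can be bumped
* with a plain add, and is only byte-swapped back when used as IV.
*
*	// increment a 128-bit counter held as two little-endian qwords
*	static void ctr_inc(uint64_t *ctr_hi, uint64_t *ctr_lo)
*	{
*		if (++(*ctr_lo) == 0)	// carry out of the low qword
*			++(*ctr_hi);
*	}
*/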
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
* size_t len, u8 *iv)
ENTRY(aesni_ctr_enc)
jb .Lctr_enc_just_ret
call _aesni_inc_init
movups 0x10(INP), IN2
movups 0x20(INP), IN3
movups 0x30(INP), IN4
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
movups STATE4, 0x30(OUTP)
movups STATE, (OUTP)
ENDPROC(aesni_ctr_enc)
* _aesni_gf128mul_x_ble: internal ABI
* Multiply in GF(2^128) for XTS IVs
* GF128MUL_MASK == mask with 0x87 and 0x01
* CTR: == temporary value
#define _aesni_gf128mul_x_ble() \
pshufd $0x13, IV, CTR; \
pand GF128MUL_MASK, CTR; \
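/*
* C equivalent of _aesni_gf128mul_x_ble (editor's sketch): multiply the
* 128-bit tweak by x in GF(2^128), little-endian block convention, folding
* the carry back in with the 0x87 reduction constant:
*
*	static void gf128mul_x_ble(uint64_t t[2])
*	{
*		uint64_t carry = t[1] >> 63;
*
*		t[1] = (t[1] << 1) | (t[0] >> 63);
*		t[0] = (t[0] << 1) ^ (carry * 0x87);
*	}
*/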
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
ENTRY(aesni_xts_crypt8)
leaq _aesni_enc4, %r11
leaq _aesni_dec4, %rax
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movdqu 0x00(INP), INC
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x10(INP), INC
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x20(INP), INC
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x30(INP), INC
movdqu IV, 0x30(OUTP)
movdqu 0x00(OUTP), INC
movdqu STATE1, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x40(INP), INC
movdqu IV, 0x40(OUTP)
movdqu 0x10(OUTP), INC
movdqu STATE2, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x50(INP), INC
movdqu IV, 0x50(OUTP)
movdqu 0x20(OUTP), INC
movdqu STATE3, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x60(INP), INC
movdqu IV, 0x60(OUTP)
movdqu 0x30(OUTP), INC
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x70(INP), INC
movdqu IV, 0x70(OUTP)
_aesni_gf128mul_x_ble()
movdqu 0x40(OUTP), INC
movdqu STATE1, 0x40(OUTP)
movdqu 0x50(OUTP), INC
movdqu STATE2, 0x50(OUTP)
movdqu 0x60(OUTP), INC
movdqu STATE3, 0x60(OUTP)
movdqu 0x70(OUTP), INC
movdqu STATE4, 0x70(OUTP)
ENDPROC(aesni_xts_crypt8)