2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
35 #include <asm/nospec-branch.h>
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can be done for either FP or integer values; for FP use
40 * movaps (move aligned packed single), for integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
50 # constants in mergeable sections, linker can reorder and merge
51 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
53 .Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
55 .section .rodata.cst16.POLY, "aM", @progbits, 16
57 POLY: .octa 0xC2000000000000000000000000000001
58 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
60 TWOONE: .octa 0x00000001000000000000000000000001
62 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
64 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65 .section .rodata.cst16.MASK1, "aM", @progbits, 16
67 MASK1: .octa 0x0000000000000000ffffffffffffffff
68 .section .rodata.cst16.MASK2, "aM", @progbits, 16
70 MASK2: .octa 0xffffffffffffffff0000000000000000
71 .section .rodata.cst16.ONE, "aM", @progbits, 16
73 ONE: .octa 0x00000000000000000000000000000001
74 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 .section .rodata.cst16.dec, "aM", @progbits, 16
80 .section .rodata.cst16.enc, "aM", @progbits, 16
84 # order of these constants should not change.
85 # more specifically, ALL_F should follow SHIFT_MASK,
86 # and zero should follow ALL_F
87 .section .rodata, "a", @progbits
89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
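#
# Illustrative note (a sketch, not part of the build): the partial-block
# handling below effectively loads its byte mask from ALL_F+16-%r13
# ("lea ALL_F+16(%rip), %r12" followed by subtracting %r13). For example,
# with %r13 = 5 trailing bytes the 16-byte load starts at ALL_F+11 and
# yields 0xff x 5 || 0x00 x 11, so the pand keeps only the low 5 bytes of
# %xmm0. This only works because ALL_F is immediately followed by the
# all-zero constant, hence the ordering constraint stated above.
#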
96 #define STACK_OFFSET 8*3
97 #define HashKey 16*0 // store HashKey <<1 mod poly here
98 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
99 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
100 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
101 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
102 // bits of HashKey <<1 mod poly here
103 //(for Karatsuba purposes)
104 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
105 // bits of HashKey^2 <<1 mod poly here
106 // (for Karatsuba purposes)
107 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^3 <<1 mod poly here
109 // (for Karatsuba purposes)
110 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^4 <<1 mod poly here
112 // (for Karatsuba purposes)
113 #define VARIABLE_OFFSET 16*8
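#
# A rough C view of the %rsp scratch area laid out by the defines above
# (illustrative only; the struct name is hypothetical, u8 stands for
# unsigned char, offsets match the 16*N values used here):
#
#	struct gcm_precompute {		/* VARIABLE_OFFSET = 16*8 = 128 bytes  */
#		u8 hash_key[16];	/* HashKey   = H   <<1 mod poly        */
#		u8 hash_key_2[16];	/* HashKey_2 = H^2 <<1 mod poly        */
#		u8 hash_key_3[16];	/* HashKey_3 = H^3 <<1 mod poly        */
#		u8 hash_key_4[16];	/* HashKey_4 = H^4 <<1 mod poly        */
#		u8 hash_key_k[16];	/* hi64 ^ lo64 of HashKey  (Karatsuba) */
#		u8 hash_key_2_k[16];	/* hi64 ^ lo64 of HashKey^2            */
#		u8 hash_key_3_k[16];	/* hi64 ^ lo64 of HashKey^3            */
#		u8 hash_key_4_k[16];	/* hi64 ^ lo64 of HashKey^4            */
#	};
#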
121 #define arg7 STACK_OFFSET+8(%r14)
122 #define arg8 STACK_OFFSET+16(%r14)
123 #define arg9 STACK_OFFSET+24(%r14)
124 #define arg10 STACK_OFFSET+32(%r14)
125 #define keysize 2*15*16(%arg1)
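#
# Note (informal): 2*15*16 = 480, i.e. keysize points past the two expanded
# key schedules (15 round keys of 16 bytes each, encryption then decryption)
# at the key_length member of struct crypto_aes_ctx; it is the same offset
# written by "movl %edx, 480(KEYP)" in aesni_set_key below.
#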
142 #define BSWAP_MASK %xmm10
146 #define GF128MUL_MASK %xmm10
176 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
179 * Input: A and B (128-bits each, bit-reflected)
180 * Output: C = A*B*x mod poly, (i.e. >>1 )
181 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
182 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
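#
# Rough C model of the Karatsuba split used below (illustrative sketch;
# clmul64() is a stand-in for one PCLMULQDQ, a carry-less 64x64->128
# multiply, u64/u128 for unsigned 64/128-bit integers):
#
#	void ghash_mul_model(u128 *gh, u128 hk)
#	{
#		u64 a0 = (u64)*gh, a1 = *gh >> 64;
#		u64 b0 = (u64)hk,  b1 = hk >> 64;
#		u128 hi  = clmul64(a1, b1);		/* PCLMULQDQ 0x11 */
#		u128 lo  = clmul64(a0, b0);		/* PCLMULQDQ 0x00 */
#		u128 mid = clmul64(a0 ^ a1, b0 ^ b1) ^ hi ^ lo;
#		/* 256-bit product = hi:lo ^ (mid << 64); the macro then
#		 * folds it back to 128 bits with the two-phase reduction. */
#	}
#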
185 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
187 pshufd $78, \GH, \TMP2
188 pshufd $78, \HK, \TMP3
189 pxor \GH, \TMP2 # TMP2 = a1+a0
190 pxor \HK, \TMP3 # TMP3 = b1+b0
191 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
192 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
193 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
195 pxor \TMP1, \TMP2 # TMP2 = (a1+a0)*(b1+b0) ^ a1*b1
197 pslldq $8, \TMP3 # left shift TMP3 2 DWs
198 psrldq $8, \TMP2 # right shift TMP2 2 DWs
200 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
202 # first phase of the reduction
206 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
207 # in order to perform
209 pslld $31, \TMP2 # packed left shift <<31
210 pslld $30, \TMP3 # packed left shift <<30
211 pslld $25, \TMP4 # packed left shift <<25
212 pxor \TMP3, \TMP2 # xor the shifted versions
215 psrldq $4, \TMP5 # right shift TMP5 1 DW
216 pslldq $12, \TMP2 # left shift TMP2 3 DWs
219 # second phase of the reduction
221 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
222 # in order to perform
226 psrld $1,\TMP2 # packed right shift >>1
227 psrld $2,\TMP3 # packed right shift >>2
228 psrld $7,\TMP4 # packed right shift >>7
229 pxor \TMP3,\TMP2 # xor the shifted versions
233 pxor \TMP1, \GH # result is in GH
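#
# Reduction note (informal sketch): the two phases above fold the upper
# 128 bits of the 256-bit product back into the lower 128 bits. With
# poly = x^128 + x^127 + x^126 + x^121 + 1, the bit-reflected reduction
# only involves the exponents 1, 2 and 7 (plus the constant term), which
# is why copies shifted by 1, 2 and 7 bit positions are XORed together;
# the 31/30/25 shifts of the first phase are just 32 - 1/2/7 expressed as
# 32-bit lane shifts in the opposite direction.
#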
236 # Reads DLEN bytes starting at DPTR and stores in XMMDst
237 # where 0 < DLEN < 16
238 # Clobbers %rax, DLEN and XMM1
239 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
243 MOVQ_R64_XMM %rax, \XMMDst
245 jz _done_read_partial_block_\@
249 mov 7(\DPTR, \DLEN, 1), %al
251 jnz _read_next_byte_\@
252 MOVQ_R64_XMM %rax, \XMM1
255 jmp _done_read_partial_block_\@
258 _read_next_byte_lt8_\@:
260 mov -1(\DPTR, \DLEN, 1), %al
262 jnz _read_next_byte_lt8_\@
263 MOVQ_R64_XMM %rax, \XMMDst
264 _done_read_partial_block_\@:
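#
# Rough C model of READ_PARTIAL_BLOCK (illustrative only; u8/u128 stand
# for unsigned char and unsigned __int128): for len in 1..15 it returns
# the len bytes at src in the low end of a 16-byte value, zero padded,
# without ever reading past src+len:
#
#	static u128 read_partial_block(const u8 *src, int len)
#	{
#		u128 dst = 0;
#		int i;
#
#		for (i = len - 1; i >= 0; i--)	/* highest offset first,  */
#			dst = (dst << 8) | src[i];	/* as the asm above does */
#		return dst;
#	}
#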
268 * if a = number of total plaintext bytes
270 * num_initial_blocks = b mod 4 (where b = number of complete 16-byte blocks)
271 * encrypt the initial num_initial_blocks blocks and apply ghash on
273 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
275 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
279 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
280 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
281 MOVADQ SHUF_MASK(%rip), %xmm14
282 mov arg7, %r10 # %r10 = AAD
283 mov arg8, %r11 # %r11 = aadLen
288 jl _get_AAD_rest\num_initial_blocks\operation
289 _get_AAD_blocks\num_initial_blocks\operation:
290 movdqu (%r10), %xmm\i
291 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
293 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
297 jge _get_AAD_blocks\num_initial_blocks\operation
301 /* read the last <16B of AAD */
302 _get_AAD_rest\num_initial_blocks\operation:
304 je _get_AAD_done\num_initial_blocks\operation
306 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
307 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
309 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
311 _get_AAD_done\num_initial_blocks\operation:
312 xor %r11, %r11 # initialise the data pointer offset as zero
313 # start AES for num_initial_blocks blocks
315 mov %arg5, %rax # %rax = *Y0
316 movdqu (%rax), \XMM0 # XMM0 = Y0
317 PSHUFB_XMM %xmm14, \XMM0
319 .if (\i == 5) || (\i == 6) || (\i == 7)
320 MOVADQ ONE(%RIP),\TMP1
323 paddd \TMP1, \XMM0 # INCR Y0
324 movdqa \XMM0, %xmm\index
325 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
326 pxor \TMP2, %xmm\index
330 shr $2,%eax # 128->4, 192->6, 256->8
331 add $5,%eax # 128->9, 192->11, 256->13
333 aes_loop_initial_dec\num_initial_blocks:
336 AESENC \TMP1, %xmm\index
340 jnz aes_loop_initial_dec\num_initial_blocks
344 AESENCLAST \TMP1, %xmm\index # Last Round
347 movdqu (%arg3 , %r11, 1), \TMP1
348 pxor \TMP1, %xmm\index
349 movdqu %xmm\index, (%arg2 , %r11, 1)
350 # write back plaintext/ciphertext for num_initial_blocks
353 movdqa \TMP1, %xmm\index
354 PSHUFB_XMM %xmm14, %xmm\index
355 # prepare plaintext/ciphertext for GHASH computation
359 # apply GHASH on num_initial_blocks blocks
363 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
365 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
367 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
370 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
372 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
375 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
378 jl _initial_blocks_done\num_initial_blocks\operation
379 # no need for precomputed values
382 * Precomputations for HashKey parallel with encryption of first 4 blocks.
383 * HashKey_i_k holds the XOR of the low and high parts of HashKey_i
385 MOVADQ ONE(%rip), \TMP1
386 paddd \TMP1, \XMM0 # INCR Y0
388 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
390 paddd \TMP1, \XMM0 # INCR Y0
392 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
394 paddd \TMP1, \XMM0 # INCR Y0
396 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
398 paddd \TMP1, \XMM0 # INCR Y0
400 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
402 MOVADQ 0(%arg1),\TMP1
408 pshufd $78, \TMP3, \TMP1
410 movdqa \TMP1, HashKey_k(%rsp)
411 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
412 # TMP5 = HashKey^2<<1 (mod poly)
413 movdqa \TMP5, HashKey_2(%rsp)
414 # HashKey_2 = HashKey^2<<1 (mod poly)
415 pshufd $78, \TMP5, \TMP1
417 movdqa \TMP1, HashKey_2_k(%rsp)
418 .irpc index, 1234 # do 4 rounds
419 movaps 0x10*\index(%arg1), \TMP1
425 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
426 # TMP5 = HashKey^3<<1 (mod poly)
427 movdqa \TMP5, HashKey_3(%rsp)
428 pshufd $78, \TMP5, \TMP1
430 movdqa \TMP1, HashKey_3_k(%rsp)
431 .irpc index, 56789 # do next 5 rounds
432 movaps 0x10*\index(%arg1), \TMP1
438 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
439 # TMP5 = HashKey^4<<1 (mod poly)
440 movdqa \TMP5, HashKey_4(%rsp)
441 pshufd $78, \TMP5, \TMP1
443 movdqa \TMP1, HashKey_4_k(%rsp)
446 shr $2,%eax # 128->4, 192->6, 256->8
447 sub $4,%eax # 128->0, 192->2, 256->4
448 jz aes_loop_pre_dec_done\num_initial_blocks
450 aes_loop_pre_dec\num_initial_blocks:
453 AESENC \TMP2, %xmm\index
457 jnz aes_loop_pre_dec\num_initial_blocks
459 aes_loop_pre_dec_done\num_initial_blocks:
461 AESENCLAST \TMP2, \XMM1
462 AESENCLAST \TMP2, \XMM2
463 AESENCLAST \TMP2, \XMM3
464 AESENCLAST \TMP2, \XMM4
465 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
467 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
469 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
471 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
473 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
475 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
477 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
479 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
482 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
484 # combine GHASHed value with the corresponding ciphertext
485 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
486 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
487 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
489 _initial_blocks_done\num_initial_blocks\operation:
495 * if a = number of total plaintext bytes
497 * num_initial_blocks = b mod 4 (where b = number of complete 16-byte blocks)
498 * encrypt the initial num_initial_blocks blocks and apply ghash on
500 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
502 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
506 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
507 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
508 MOVADQ SHUF_MASK(%rip), %xmm14
509 mov arg7, %r10 # %r10 = AAD
510 mov arg8, %r11 # %r11 = aadLen
515 jl _get_AAD_rest\num_initial_blocks\operation
516 _get_AAD_blocks\num_initial_blocks\operation:
517 movdqu (%r10), %xmm\i
518 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
520 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
524 jge _get_AAD_blocks\num_initial_blocks\operation
528 /* read the last <16B of AAD */
529 _get_AAD_rest\num_initial_blocks\operation:
531 je _get_AAD_done\num_initial_blocks\operation
533 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
534 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
536 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
538 _get_AAD_done\num_initial_blocks\operation:
539 xor %r11, %r11 # initialise the data pointer offset as zero
540 # start AES for num_initial_blocks blocks
542 mov %arg5, %rax # %rax = *Y0
543 movdqu (%rax), \XMM0 # XMM0 = Y0
544 PSHUFB_XMM %xmm14, \XMM0
546 .if (\i == 5) || (\i == 6) || (\i == 7)
548 MOVADQ ONE(%RIP),\TMP1
549 MOVADQ 0(%arg1),\TMP2
551 paddd \TMP1, \XMM0 # INCR Y0
552 MOVADQ \XMM0, %xmm\index
553 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
554 pxor \TMP2, %xmm\index
558 shr $2,%eax # 128->4, 192->6, 256->8
559 add $5,%eax # 128->9, 192->11, 256->13
561 aes_loop_initial_enc\num_initial_blocks:
564 AESENC \TMP1, %xmm\index
568 jnz aes_loop_initial_enc\num_initial_blocks
572 AESENCLAST \TMP1, %xmm\index # Last Round
575 movdqu (%arg3 , %r11, 1), \TMP1
576 pxor \TMP1, %xmm\index
577 movdqu %xmm\index, (%arg2 , %r11, 1)
578 # write back plaintext/ciphertext for num_initial_blocks
580 PSHUFB_XMM %xmm14, %xmm\index
582 # prepare plaintext/ciphertext for GHASH computation
586 # apply GHASH on num_initial_blocks blocks
590 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
592 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
594 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
597 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
599 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
602 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
605 jl _initial_blocks_done\num_initial_blocks\operation
606 # no need for precomputed values
609 * Precomputations for HashKey parallel with encryption of first 4 blocks.
610 * HashKey_i_k holds the XOR of the low and high parts of HashKey_i
612 MOVADQ ONE(%RIP),\TMP1
613 paddd \TMP1, \XMM0 # INCR Y0
615 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
617 paddd \TMP1, \XMM0 # INCR Y0
619 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
621 paddd \TMP1, \XMM0 # INCR Y0
623 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
625 paddd \TMP1, \XMM0 # INCR Y0
627 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
629 MOVADQ 0(%arg1),\TMP1
635 pshufd $78, \TMP3, \TMP1
637 movdqa \TMP1, HashKey_k(%rsp)
638 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
639 # TMP5 = HashKey^2<<1 (mod poly)
640 movdqa \TMP5, HashKey_2(%rsp)
641 # HashKey_2 = HashKey^2<<1 (mod poly)
642 pshufd $78, \TMP5, \TMP1
644 movdqa \TMP1, HashKey_2_k(%rsp)
645 .irpc index, 1234 # do 4 rounds
646 movaps 0x10*\index(%arg1), \TMP1
652 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
653 # TMP5 = HashKey^3<<1 (mod poly)
654 movdqa \TMP5, HashKey_3(%rsp)
655 pshufd $78, \TMP5, \TMP1
657 movdqa \TMP1, HashKey_3_k(%rsp)
658 .irpc index, 56789 # do next 5 rounds
659 movaps 0x10*\index(%arg1), \TMP1
665 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
666 # TMP5 = HashKey^4<<1 (mod poly)
667 movdqa \TMP5, HashKey_4(%rsp)
668 pshufd $78, \TMP5, \TMP1
670 movdqa \TMP1, HashKey_4_k(%rsp)
673 shr $2,%eax # 128->4, 192->6, 256->8
674 sub $4,%eax # 128->0, 192->2, 256->4
675 jz aes_loop_pre_enc_done\num_initial_blocks
677 aes_loop_pre_enc\num_initial_blocks:
680 AESENC \TMP2, %xmm\index
684 jnz aes_loop_pre_enc\num_initial_blocks
686 aes_loop_pre_enc_done\num_initial_blocks:
688 AESENCLAST \TMP2, \XMM1
689 AESENCLAST \TMP2, \XMM2
690 AESENCLAST \TMP2, \XMM3
691 AESENCLAST \TMP2, \XMM4
692 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
694 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
696 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
698 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
700 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
701 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
702 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
703 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
706 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
708 # combine GHASHed value with the corresponding ciphertext
709 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
710 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
711 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
713 _initial_blocks_done\num_initial_blocks\operation:
718 * encrypt 4 blocks at a time
719 * ghash the 4 previously encrypted ciphertext blocks
720 * arg1, %arg2, %arg3 are used as pointers only, not modified
721 * %r11 is the data offset value
723 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
724 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
731 movdqa SHUF_MASK(%rip), %xmm15
732 # multiply TMP5 * HashKey using karatsuba
735 pshufd $78, \XMM5, \TMP6
737 paddd ONE(%rip), \XMM0 # INCR CNT
738 movdqa HashKey_4(%rsp), \TMP5
739 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
741 paddd ONE(%rip), \XMM0 # INCR CNT
743 paddd ONE(%rip), \XMM0 # INCR CNT
745 paddd ONE(%rip), \XMM0 # INCR CNT
747 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
748 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
749 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
750 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
751 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
757 movdqa HashKey_4_k(%rsp), \TMP5
758 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
759 movaps 0x10(%arg1), \TMP1
760 AESENC \TMP1, \XMM1 # Round 1
764 movaps 0x20(%arg1), \TMP1
765 AESENC \TMP1, \XMM1 # Round 2
770 pshufd $78, \XMM6, \TMP2
772 movdqa HashKey_3(%rsp), \TMP5
773 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
774 movaps 0x30(%arg1), \TMP3
775 AESENC \TMP3, \XMM1 # Round 3
779 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
780 movaps 0x40(%arg1), \TMP3
781 AESENC \TMP3, \XMM1 # Round 4
785 movdqa HashKey_3_k(%rsp), \TMP5
786 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
787 movaps 0x50(%arg1), \TMP3
788 AESENC \TMP3, \XMM1 # Round 5
793 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
797 pshufd $78, \XMM7, \TMP2
799 movdqa HashKey_2(%rsp ), \TMP5
801 # Multiply TMP5 * HashKey using karatsuba
803 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
804 movaps 0x60(%arg1), \TMP3
805 AESENC \TMP3, \XMM1 # Round 6
809 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
810 movaps 0x70(%arg1), \TMP3
811 AESENC \TMP3, \XMM1 # Round 7
815 movdqa HashKey_2_k(%rsp), \TMP5
816 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
817 movaps 0x80(%arg1), \TMP3
818 AESENC \TMP3, \XMM1 # Round 8
823 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
827 # Multiply XMM8 * HashKey
828 # XMM8 and TMP5 hold the values for the two operands
831 pshufd $78, \XMM8, \TMP2
833 movdqa HashKey(%rsp), \TMP5
834 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
835 movaps 0x90(%arg1), \TMP3
836 AESENC \TMP3, \XMM1 # Round 9
840 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
843 shr $2,%eax # 128->4, 192->6, 256->8
844 sub $4,%eax # 128->0, 192->2, 256->4
845 jz aes_loop_par_enc_done
850 AESENC \TMP3, %xmm\index
856 aes_loop_par_enc_done:
858 AESENCLAST \TMP3, \XMM1 # Round 10
859 AESENCLAST \TMP3, \XMM2
860 AESENCLAST \TMP3, \XMM3
861 AESENCLAST \TMP3, \XMM4
862 movdqa HashKey_k(%rsp), \TMP5
863 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
864 movdqu (%arg3,%r11,1), \TMP3
865 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
866 movdqu 16(%arg3,%r11,1), \TMP3
867 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
868 movdqu 32(%arg3,%r11,1), \TMP3
869 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
870 movdqu 48(%arg3,%r11,1), \TMP3
871 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
872 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
873 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
874 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
875 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
876 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
877 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
878 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
879 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
887 pslldq $8, \TMP3 # left shift TMP3 2 DWs
888 psrldq $8, \TMP2 # right shift TMP2 2 DWs
890 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
892 # first phase of reduction
897 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
898 pslld $31, \TMP2 # packed left shift << 31
899 pslld $30, \TMP3 # packed left shift << 30
900 pslld $25, \TMP4 # packed left shift << 25
901 pxor \TMP3, \TMP2 # xor the shifted versions
904 psrldq $4, \TMP5 # right shift T5 1 DW
905 pslldq $12, \TMP2 # left shift T2 3 DWs
908 # second phase of reduction
910 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
913 psrld $1, \TMP2 # packed right shift >>1
914 psrld $2, \TMP3 # packed right shift >>2
915 psrld $7, \TMP4 # packed right shift >>7
916 pxor \TMP3,\TMP2 # xor the shifted versions
920 pxor \TMP1, \XMM5 # result is in XMM5
926 * decrypt 4 blocks at a time
927 * ghash the 4 previously decrypted ciphertext blocks
928 * arg1, %arg2, %arg3 are used as pointers only, not modified
929 * %r11 is the data offset value
931 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
932 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
939 movdqa SHUF_MASK(%rip), %xmm15
940 # multiply TMP5 * HashKey using karatsuba
943 pshufd $78, \XMM5, \TMP6
945 paddd ONE(%rip), \XMM0 # INCR CNT
946 movdqa HashKey_4(%rsp), \TMP5
947 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
949 paddd ONE(%rip), \XMM0 # INCR CNT
951 paddd ONE(%rip), \XMM0 # INCR CNT
953 paddd ONE(%rip), \XMM0 # INCR CNT
955 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
956 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
957 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
958 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
959 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
965 movdqa HashKey_4_k(%rsp), \TMP5
966 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
967 movaps 0x10(%arg1), \TMP1
968 AESENC \TMP1, \XMM1 # Round 1
972 movaps 0x20(%arg1), \TMP1
973 AESENC \TMP1, \XMM1 # Round 2
978 pshufd $78, \XMM6, \TMP2
980 movdqa HashKey_3(%rsp), \TMP5
981 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
982 movaps 0x30(%arg1), \TMP3
983 AESENC \TMP3, \XMM1 # Round 3
987 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
988 movaps 0x40(%arg1), \TMP3
989 AESENC \TMP3, \XMM1 # Round 4
993 movdqa HashKey_3_k(%rsp), \TMP5
994 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
995 movaps 0x50(%arg1), \TMP3
996 AESENC \TMP3, \XMM1 # Round 5
1001 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1005 pshufd $78, \XMM7, \TMP2
1007 movdqa HashKey_2(%rsp ), \TMP5
1009 # Multiply TMP5 * HashKey using karatsuba
1011 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1012 movaps 0x60(%arg1), \TMP3
1013 AESENC \TMP3, \XMM1 # Round 6
1017 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1018 movaps 0x70(%arg1), \TMP3
1019 AESENC \TMP3, \XMM1 # Round 7
1023 movdqa HashKey_2_k(%rsp), \TMP5
1024 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1025 movaps 0x80(%arg1), \TMP3
1026 AESENC \TMP3, \XMM1 # Round 8
1031 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1035 # Multiply XMM8 * HashKey
1036 # XMM8 and TMP5 hold the values for the two operands
1039 pshufd $78, \XMM8, \TMP2
1041 movdqa HashKey(%rsp), \TMP5
1042 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1043 movaps 0x90(%arg1), \TMP3
1044 AESENC \TMP3, \XMM1 # Round 9
1048 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1049 lea 0xa0(%arg1),%r10
1051 shr $2,%eax # 128->4, 192->6, 256->8
1052 sub $4,%eax # 128->0, 192->2, 256->4
1053 jz aes_loop_par_dec_done
1058 AESENC \TMP3, %xmm\index
1062 jnz aes_loop_par_dec
1064 aes_loop_par_dec_done:
1065 MOVADQ (%r10), \TMP3
1066 AESENCLAST \TMP3, \XMM1 # last round
1067 AESENCLAST \TMP3, \XMM2
1068 AESENCLAST \TMP3, \XMM3
1069 AESENCLAST \TMP3, \XMM4
1070 movdqa HashKey_k(%rsp), \TMP5
1071 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1072 movdqu (%arg3,%r11,1), \TMP3
1073 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1074 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1076 movdqu 16(%arg3,%r11,1), \TMP3
1077 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1078 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1080 movdqu 32(%arg3,%r11,1), \TMP3
1081 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1082 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1084 movdqu 48(%arg3,%r11,1), \TMP3
1085 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1086 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1088 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1089 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1090 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1091 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1099 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1100 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1102 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1104 # first phase of reduction
1109 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1110 pslld $31, \TMP2 # packed left shift << 31
1111 pslld $30, \TMP3 # packed left shift << 30
1112 pslld $25, \TMP4 # packed left shift << 25
1113 pxor \TMP3, \TMP2 # xor the shifted versions
1116 psrldq $4, \TMP5 # right shift T5 1 DW
1117 pslldq $12, \TMP2 # left shift T2 3 DWs
1120 # second phase of reduction
1122 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1125 psrld $1, \TMP2 # packed right shift >>1
1126 psrld $2, \TMP3 # packed right shift >>2
1127 psrld $7, \TMP4 # packed right shift >>7
1128 pxor \TMP3,\TMP2 # xor the shifted versions
1132 pxor \TMP1, \XMM5 # result is in XMM5
1137 /* GHASH the last 4 ciphertext blocks. */
1138 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1139 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1141 # Multiply XMM1 * HashKey^4 (using Karatsuba)
1144 pshufd $78, \XMM1, \TMP2
1146 movdqa HashKey_4(%rsp), \TMP5
1147 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1148 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1149 movdqa HashKey_4_k(%rsp), \TMP4
1150 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1151 movdqa \XMM1, \XMMDst
1152 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1154 # Multiply XMM2 * HashKey^3 (using Karatsuba)
1157 pshufd $78, \XMM2, \TMP2
1159 movdqa HashKey_3(%rsp), \TMP5
1160 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1161 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1162 movdqa HashKey_3_k(%rsp), \TMP4
1163 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1167 # results accumulated in TMP6, XMMDst, XMM1
1169 # Multiply XMM3 * HashKey^2 (using Karatsuba)
1172 pshufd $78, \XMM3, \TMP2
1174 movdqa HashKey_2(%rsp), \TMP5
1175 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1176 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1177 movdqa HashKey_2_k(%rsp), \TMP4
1178 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1181 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1183 # Multiply XMM4 * HashKey (using Karatsuba)
1185 pshufd $78, \XMM4, \TMP2
1187 movdqa HashKey(%rsp), \TMP5
1188 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1189 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1190 movdqa HashKey_k(%rsp), \TMP4
1191 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1197 # middle section of the temp results combined as in karatsuba algorithm
1199 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1200 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1203 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1204 # first phase of the reduction
1205 movdqa \XMMDst, \TMP2
1206 movdqa \XMMDst, \TMP3
1207 movdqa \XMMDst, \TMP4
1208 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1209 pslld $31, \TMP2 # packed left shifting << 31
1210 pslld $30, \TMP3 # packed left shifting << 30
1211 pslld $25, \TMP4 # packed left shifting << 25
1212 pxor \TMP3, \TMP2 # xor the shifted versions
1215 psrldq $4, \TMP7 # right shift TMP7 1 DW
1216 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1219 # second phase of the reduction
1220 movdqa \XMMDst, \TMP2
1221 # make 3 copies of XMMDst for doing 3 shift operations
1222 movdqa \XMMDst, \TMP3
1223 movdqa \XMMDst, \TMP4
1224 psrld $1, \TMP2 # packed right shift >> 1
1225 psrld $2, \TMP3 # packed right shift >> 2
1226 psrld $7, \TMP4 # packed right shift >> 7
1227 pxor \TMP3, \TMP2 # xor the shifted versions
1231 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1235 /* Encryption of a single block
1239 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1243 shr $2,%eax # 128->4, 192->6, 256->8
1244 add $5,%eax # 128->9, 192->11, 256->13
1245 lea 16(%arg1), %r10 # get first expanded key address
1255 AESENCLAST \TMP1,\XMM0
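#
# Rough C model of ENCRYPT_SINGLE_BLOCK (illustrative; xor16(), aesenc()
# and aesenclast() are stand-ins for one pxor/AESENC/AESENCLAST on a
# 16-byte block, rk is the expanded key, key_len is 16, 24 or 32):
#
#	void encrypt_single_block(u8 block[16], const u8 *rk, int key_len)
#	{
#		int n = key_len / 4 + 5;	/* 9, 11 or 13 middle rounds */
#		int r;
#
#		xor16(block, rk);		/* round 0 whitening */
#		for (r = 1; r <= n; r++)
#			aesenc(block, rk + 16 * r);
#		aesenclast(block, rk + 16 * (n + 1));
#	}
#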
1257 /*****************************************************************************
1258 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1259 * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1260 * const u8 *in, // Ciphertext input
1261 * u64 plaintext_len, // Length of data in bytes for decryption.
1262 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1263 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1264 * // concatenated with 0x00000001. 16-byte aligned pointer.
1265 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1266 * const u8 *aad, // Additional Authentication Data (AAD)
1267 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1268 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1269 * // given authentication tag and only return the plaintext if they match.
1270 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1271 * // (most likely), 12 or 8.
1276 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1277 * set of 11 keys in the data structure void *aes_ctx
1281 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1282 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1283 * | Salt (From the SA) |
1284 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1285 * | Initialization Vector |
1286 * | (This is the sequence number from IPSec header) |
1287 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1289 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1294 * AAD padded to 128 bits with 0
1295 * for example, assume AAD is a u32 vector
1297 * if AAD is 8 bytes:
1298 * AAD[3] = {A0, A1};
1299 * padded AAD in xmm register = {A1 A0 0 0}
1302 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1303 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1305 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1306 * | 32-bit Sequence Number (A0) |
1307 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1309 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1311 * AAD Format with 32-bit Sequence Number
1313 * if AAD is 12 bytes:
1314 * AAD[3] = {A0, A1, A2};
1315 * padded AAD in xmm register = {A2 A1 A0 0}
1318 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1319 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1320 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1321 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1323 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1324 * | 64-bit Extended Sequence Number {A1,A0} |
1326 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1328 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1330 * AAD Format with 64-bit Extended Sequence Number
1332 * poly = x^128 + x^127 + x^126 + x^121 + 1
1334 *****************************************************************************/
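#
# Illustrative call from C (a sketch only; buffer names are hypothetical,
# the real callers live in arch/x86/crypto/aesni-intel_glue.c):
#
#	aesni_gcm_dec(aes_ctx,			/* expanded AES key schedule  */
#		      plaintext, ciphertext, len,
#		      iv_block,		/* salt || 8-byte IV || 0x00000001    */
#		      hash_subkey,	/* H = E(K, 0^128), set up by caller  */
#		      aad, 12,		/* RFC4106 AAD, 8 or 12 bytes         */
#		      tag, 16);		/* computed tag, compared by caller   */
#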
1335 ENTRY(aesni_gcm_dec)
1341 * states of %xmm registers %xmm6:%xmm15 not saved
1342 * all %xmm registers are clobbered
1344 sub $VARIABLE_OFFSET, %rsp
1345 and $~63, %rsp # align rsp to 64 bytes
1347 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1348 movdqa SHUF_MASK(%rip), %xmm2
1349 PSHUFB_XMM %xmm2, %xmm13
1352 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1354 movdqa %xmm13, %xmm2
1364 pshufd $0x24, %xmm1, %xmm2
1365 pcmpeqd TWOONE(%rip), %xmm2
1366 pand POLY(%rip), %xmm2
1367 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1370 # Decrypt first few blocks
1372 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1373 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1374 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1377 jz _initial_num_blocks_is_0_decrypt
1379 jb _initial_num_blocks_is_1_decrypt
1380 je _initial_num_blocks_is_2_decrypt
1381 _initial_num_blocks_is_3_decrypt:
1382 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1383 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1385 jmp _initial_blocks_decrypted
1386 _initial_num_blocks_is_2_decrypt:
1387 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1388 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1390 jmp _initial_blocks_decrypted
1391 _initial_num_blocks_is_1_decrypt:
1392 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1393 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1395 jmp _initial_blocks_decrypted
1396 _initial_num_blocks_is_0_decrypt:
1397 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1398 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1399 _initial_blocks_decrypted:
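#
# Worked example (informal): for plaintext_len = 100 bytes, %r13 above is
# 96 (six full blocks) and 6 mod 4 = 2, so control goes through
# _initial_num_blocks_is_2_decrypt; the remaining four full blocks are
# handled by one pass of the four-way parallel loop below, and the final
# 4-byte tail is dealt with in _zero_cipher_left_decrypt.
#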
1401 je _zero_cipher_left_decrypt
1403 je _four_cipher_left_decrypt
1405 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1406 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1410 _four_cipher_left_decrypt:
1411 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1412 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1413 _zero_cipher_left_decrypt:
1415 and $15, %r13 # %r13 = arg4 (mod 16)
1416 je _multiple_of_16_bytes_decrypt
1418 # Handle the last <16 byte block separately
1420 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1421 movdqa SHUF_MASK(%rip), %xmm10
1422 PSHUFB_XMM %xmm10, %xmm0
1424 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1426 lea (%arg3,%r11,1), %r10
1428 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1430 lea ALL_F+16(%rip), %r12
1433 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1434 movdqu (%r12), %xmm1
1435 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1436 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1438 movdqa SHUF_MASK(%rip), %xmm10
1439 PSHUFB_XMM %xmm10 ,%xmm2
1442 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1445 MOVQ_R64_XMM %xmm0, %rax
1447 jle _less_than_8_bytes_left_decrypt
1448 mov %rax, (%arg2 , %r11, 1)
1451 MOVQ_R64_XMM %xmm0, %rax
1453 _less_than_8_bytes_left_decrypt:
1454 mov %al, (%arg2, %r11, 1)
1458 jne _less_than_8_bytes_left_decrypt
1459 _multiple_of_16_bytes_decrypt:
1460 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1461 shl $3, %r12 # convert into number of bits
1462 movd %r12d, %xmm15 # len(A) in %xmm15
1463 shl $3, %arg4 # len(C) in bits (*8)
1464 MOVQ_R64_XMM %arg4, %xmm1
1465 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1466 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1468 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1469 # final GHASH computation
1470 movdqa SHUF_MASK(%rip), %xmm10
1471 PSHUFB_XMM %xmm10, %xmm8
1473 mov %arg5, %rax # %rax = *Y0
1474 movdqu (%rax), %xmm0 # %xmm0 = Y0
1475 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1478 mov arg9, %r10 # %r10 = authTag
1479 mov arg10, %r11 # %r11 = auth_tag_len
1485 MOVQ_R64_XMM %xmm0, %rax
1491 je _return_T_done_decrypt
1499 je _return_T_done_decrypt
1506 je _return_T_done_decrypt
1511 jmp _return_T_done_decrypt
1513 movdqu %xmm0, (%r10)
1514 _return_T_done_decrypt:
1520 ENDPROC(aesni_gcm_dec)
1523 /*****************************************************************************
1524 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1525 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1526 * const u8 *in, // Plaintext input
1527 * u64 plaintext_len, // Length of data in bytes for encryption.
1528 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1529 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1530 * // concatenated with 0x00000001. 16-byte aligned pointer.
1531 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1532 * const u8 *aad, // Additional Authentication Data (AAD)
1533 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1534 * u8 *auth_tag, // Authenticated Tag output.
1535 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1541 * keys are pre-expanded and aligned to 16 bytes. we are using the
1542 * first set of 11 keys in the data structure void *aes_ctx
1547 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1548 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 * | Salt (From the SA) |
1550 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1551 * | Initialization Vector |
1552 * | (This is the sequence number from IPSec header) |
1553 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1555 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1560 * AAD padded to 128 bits with 0
1561 * for example, assume AAD is a u32 vector
1563 * if AAD is 8 bytes:
1564 * AAD[3] = {A0, A1};
1565 * padded AAD in xmm register = {A1 A0 0 0}
1568 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1569 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1571 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572 * | 32-bit Sequence Number (A0) |
1573 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577 * AAD Format with 32-bit Sequence Number
1579 * if AAD is 12 bytes:
1580 * AAD[3] = {A0, A1, A2};
1581 * padded AAD in xmm register = {A2 A1 A0 0}
1584 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 * | 64-bit Extended Sequence Number {A1,A0} |
1590 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1592 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1594 * AAD Format with 64-bit Extended Sequence Number
1596 * poly = x^128 + x^127 + x^126 + x^121 + 1
1597 ***************************************************************************/
1598 ENTRY(aesni_gcm_enc)
1604 # states of %xmm registers %xmm6:%xmm15 not saved
1605 # all %xmm registers are clobbered
1607 sub $VARIABLE_OFFSET, %rsp
1610 movdqu (%r12), %xmm13
1611 movdqa SHUF_MASK(%rip), %xmm2
1612 PSHUFB_XMM %xmm2, %xmm13
1615 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1617 movdqa %xmm13, %xmm2
1627 pshufd $0x24, %xmm1, %xmm2
1628 pcmpeqd TWOONE(%rip), %xmm2
1629 pand POLY(%rip), %xmm2
1631 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1632 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1636 # Encrypt first few blocks
1639 jz _initial_num_blocks_is_0_encrypt
1641 jb _initial_num_blocks_is_1_encrypt
1642 je _initial_num_blocks_is_2_encrypt
1643 _initial_num_blocks_is_3_encrypt:
1644 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1645 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1647 jmp _initial_blocks_encrypted
1648 _initial_num_blocks_is_2_encrypt:
1649 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1650 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1652 jmp _initial_blocks_encrypted
1653 _initial_num_blocks_is_1_encrypt:
1654 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1655 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1657 jmp _initial_blocks_encrypted
1658 _initial_num_blocks_is_0_encrypt:
1659 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1660 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1661 _initial_blocks_encrypted:
1663 # Main loop - Encrypt remaining blocks
1666 je _zero_cipher_left_encrypt
1668 je _four_cipher_left_encrypt
1669 _encrypt_by_4_encrypt:
1670 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1671 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1674 jne _encrypt_by_4_encrypt
1675 _four_cipher_left_encrypt:
1676 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1677 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1678 _zero_cipher_left_encrypt:
1680 and $15, %r13 # %r13 = arg4 (mod 16)
1681 je _multiple_of_16_bytes_encrypt
1683 # Handle the last <16 Byte block separately
1684 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1685 movdqa SHUF_MASK(%rip), %xmm10
1686 PSHUFB_XMM %xmm10, %xmm0
1688 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1690 lea (%arg3,%r11,1), %r10
1692 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
1694 lea ALL_F+16(%rip), %r12
1696 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1697 movdqu (%r12), %xmm1
1698 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1699 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1700 movdqa SHUF_MASK(%rip), %xmm10
1701 PSHUFB_XMM %xmm10,%xmm0
1704 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1705 # GHASH computation for the last <16 byte block
1706 movdqa SHUF_MASK(%rip), %xmm10
1707 PSHUFB_XMM %xmm10, %xmm0
1709 # shuffle xmm0 back to output as ciphertext
1712 MOVQ_R64_XMM %xmm0, %rax
1714 jle _less_than_8_bytes_left_encrypt
1715 mov %rax, (%arg2 , %r11, 1)
1718 MOVQ_R64_XMM %xmm0, %rax
1720 _less_than_8_bytes_left_encrypt:
1721 mov %al, (%arg2, %r11, 1)
1725 jne _less_than_8_bytes_left_encrypt
1726 _multiple_of_16_bytes_encrypt:
1727 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1729 movd %r12d, %xmm15 # len(A) in %xmm15
1730 shl $3, %arg4 # len(C) in bits (*8)
1731 MOVQ_R64_XMM %arg4, %xmm1
1732 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1733 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1735 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1736 # final GHASH computation
1737 movdqa SHUF_MASK(%rip), %xmm10
1738 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1740 mov %arg5, %rax # %rax = *Y0
1741 movdqu (%rax), %xmm0 # %xmm0 = Y0
1742 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1745 mov arg9, %r10 # %r10 = authTag
1746 mov arg10, %r11 # %r11 = auth_tag_len
1752 MOVQ_R64_XMM %xmm0, %rax
1758 je _return_T_done_encrypt
1766 je _return_T_done_encrypt
1773 je _return_T_done_encrypt
1778 jmp _return_T_done_encrypt
1780 movdqu %xmm0, (%r10)
1781 _return_T_done_encrypt:
1787 ENDPROC(aesni_gcm_enc)
1794 _key_expansion_256a:
1795 pshufd $0b11111111, %xmm1, %xmm1
1796 shufps $0b00010000, %xmm0, %xmm4
1798 shufps $0b10001100, %xmm0, %xmm4
1801 movaps %xmm0, (TKEYP)
1804 ENDPROC(_key_expansion_128)
1805 ENDPROC(_key_expansion_256a)
1808 _key_expansion_192a:
1809 pshufd $0b01010101, %xmm1, %xmm1
1810 shufps $0b00010000, %xmm0, %xmm4
1812 shufps $0b10001100, %xmm0, %xmm4
1819 pshufd $0b11111111, %xmm0, %xmm3
1824 shufps $0b01000100, %xmm0, %xmm6
1825 movaps %xmm6, (TKEYP)
1826 shufps $0b01001110, %xmm2, %xmm1
1827 movaps %xmm1, 0x10(TKEYP)
1830 ENDPROC(_key_expansion_192a)
1833 _key_expansion_192b:
1834 pshufd $0b01010101, %xmm1, %xmm1
1835 shufps $0b00010000, %xmm0, %xmm4
1837 shufps $0b10001100, %xmm0, %xmm4
1843 pshufd $0b11111111, %xmm0, %xmm3
1847 movaps %xmm0, (TKEYP)
1850 ENDPROC(_key_expansion_192b)
1853 _key_expansion_256b:
1854 pshufd $0b10101010, %xmm1, %xmm1
1855 shufps $0b00010000, %xmm2, %xmm4
1857 shufps $0b10001100, %xmm2, %xmm4
1860 movaps %xmm2, (TKEYP)
1863 ENDPROC(_key_expansion_256b)
1866 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1867 * unsigned int key_len)
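#
# A rough C view of the context this routine fills in (illustrative; it
# mirrors struct crypto_aes_ctx, and the 480 offset matches the
# "movl %edx, 480(KEYP)" store of the key length below):
#
#	struct aes_ctx_view {
#		u8  key_enc[15 * 16];	/* encryption round keys, offset 0   */
#		u8  key_dec[15 * 16];	/* decryption round keys, offset 240 */
#		u32 key_length;		/* 16, 24 or 32,          offset 480 */
#	};
#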
1869 ENTRY(aesni_set_key)
1873 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1874 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1875 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1877 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1878 movaps %xmm0, (KEYP)
1879 lea 0x10(KEYP), TKEYP # key addr
1880 movl %edx, 480(KEYP)
1881 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1885 movups 0x10(UKEYP), %xmm2 # other user key
1886 movaps %xmm2, (TKEYP)
1888 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1889 call _key_expansion_256a
1890 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1891 call _key_expansion_256b
1892 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1893 call _key_expansion_256a
1894 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1895 call _key_expansion_256b
1896 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1897 call _key_expansion_256a
1898 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1899 call _key_expansion_256b
1900 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1901 call _key_expansion_256a
1902 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1903 call _key_expansion_256b
1904 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1905 call _key_expansion_256a
1906 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1907 call _key_expansion_256b
1908 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1909 call _key_expansion_256a
1910 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1911 call _key_expansion_256b
1912 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1913 call _key_expansion_256a
1916 movq 0x10(UKEYP), %xmm2 # other user key
1917 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1918 call _key_expansion_192a
1919 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1920 call _key_expansion_192b
1921 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1922 call _key_expansion_192a
1923 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1924 call _key_expansion_192b
1925 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1926 call _key_expansion_192a
1927 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1928 call _key_expansion_192b
1929 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1930 call _key_expansion_192a
1931 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1932 call _key_expansion_192b
1935 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1936 call _key_expansion_128
1937 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1938 call _key_expansion_128
1939 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1940 call _key_expansion_128
1941 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1942 call _key_expansion_128
1943 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1944 call _key_expansion_128
1945 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1946 call _key_expansion_128
1947 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1948 call _key_expansion_128
1949 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1950 call _key_expansion_128
1951 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1952 call _key_expansion_128
1953 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1954 call _key_expansion_128
1957 movaps (KEYP), %xmm0
1958 movaps (TKEYP), %xmm1
1959 movaps %xmm0, 240(TKEYP)
1960 movaps %xmm1, 240(KEYP)
1962 lea 240-16(TKEYP), UKEYP
1965 movaps (KEYP), %xmm0
1967 movaps %xmm1, (UKEYP)
1978 ENDPROC(aesni_set_key)
1981 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1988 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1989 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1990 movl (FRAME_OFFSET+20)(%esp), INP # src
1992 movl 480(KEYP), KLEN # key length
1993 movups (INP), STATE # input
1995 movups STATE, (OUTP) # output
2005 * _aesni_enc1: internal ABI
2007 * KEYP: key struct pointer
2009 * STATE: initial state (input)
2011 * STATE: final state (output)
2018 movaps (KEYP), KEY # key
2020 pxor KEY, STATE # round 0
2024 lea 0x20(TKEYP), TKEYP
2027 movaps -0x60(TKEYP), KEY
2029 movaps -0x50(TKEYP), KEY
2033 movaps -0x40(TKEYP), KEY
2035 movaps -0x30(TKEYP), KEY
2039 movaps -0x20(TKEYP), KEY
2041 movaps -0x10(TKEYP), KEY
2045 movaps 0x10(TKEYP), KEY
2047 movaps 0x20(TKEYP), KEY
2049 movaps 0x30(TKEYP), KEY
2051 movaps 0x40(TKEYP), KEY
2053 movaps 0x50(TKEYP), KEY
2055 movaps 0x60(TKEYP), KEY
2057 movaps 0x70(TKEYP), KEY
2058 AESENCLAST KEY STATE
2060 ENDPROC(_aesni_enc1)
2063 * _aesni_enc4: internal ABI
2065 * KEYP: key struct pointer
2067 * STATE1: initial state (input)
2072 * STATE1: final state (output)
2082 movaps (KEYP), KEY # key
2084 pxor KEY, STATE1 # round 0
2091 lea 0x20(TKEYP), TKEYP
2094 movaps -0x60(TKEYP), KEY
2099 movaps -0x50(TKEYP), KEY
2106 movaps -0x40(TKEYP), KEY
2111 movaps -0x30(TKEYP), KEY
2118 movaps -0x20(TKEYP), KEY
2123 movaps -0x10(TKEYP), KEY
2133 movaps 0x10(TKEYP), KEY
2138 movaps 0x20(TKEYP), KEY
2143 movaps 0x30(TKEYP), KEY
2148 movaps 0x40(TKEYP), KEY
2153 movaps 0x50(TKEYP), KEY
2158 movaps 0x60(TKEYP), KEY
2163 movaps 0x70(TKEYP), KEY
2164 AESENCLAST KEY STATE1 # last round
2165 AESENCLAST KEY STATE2
2166 AESENCLAST KEY STATE3
2167 AESENCLAST KEY STATE4
2169 ENDPROC(_aesni_enc4)
2172 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2179 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2180 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2181 movl (FRAME_OFFSET+20)(%esp), INP # src
2183 mov 480(KEYP), KLEN # key length
2185 movups (INP), STATE # input
2187 movups STATE, (OUTP) #output
2197 * _aesni_dec1: internal ABI
2199 * KEYP: key struct pointer
2201 * STATE: initial state (input)
2203 * STATE: final state (output)
2210 movaps (KEYP), KEY # key
2212 pxor KEY, STATE # round 0
2216 lea 0x20(TKEYP), TKEYP
2219 movaps -0x60(TKEYP), KEY
2221 movaps -0x50(TKEYP), KEY
2225 movaps -0x40(TKEYP), KEY
2227 movaps -0x30(TKEYP), KEY
2231 movaps -0x20(TKEYP), KEY
2233 movaps -0x10(TKEYP), KEY
2237 movaps 0x10(TKEYP), KEY
2239 movaps 0x20(TKEYP), KEY
2241 movaps 0x30(TKEYP), KEY
2243 movaps 0x40(TKEYP), KEY
2245 movaps 0x50(TKEYP), KEY
2247 movaps 0x60(TKEYP), KEY
2249 movaps 0x70(TKEYP), KEY
2250 AESDECLAST KEY STATE
2252 ENDPROC(_aesni_dec1)
2255 * _aesni_dec4: internal ABI
2257 * KEYP: key struct pointer
2259 * STATE1: initial state (input)
2264 * STATE1: final state (output)
2274 movaps (KEYP), KEY # key
2276 pxor KEY, STATE1 # round 0
2283 lea 0x20(TKEYP), TKEYP
2286 movaps -0x60(TKEYP), KEY
2291 movaps -0x50(TKEYP), KEY
2298 movaps -0x40(TKEYP), KEY
2303 movaps -0x30(TKEYP), KEY
2310 movaps -0x20(TKEYP), KEY
2315 movaps -0x10(TKEYP), KEY
2325 movaps 0x10(TKEYP), KEY
2330 movaps 0x20(TKEYP), KEY
2335 movaps 0x30(TKEYP), KEY
2340 movaps 0x40(TKEYP), KEY
2345 movaps 0x50(TKEYP), KEY
2350 movaps 0x60(TKEYP), KEY
2355 movaps 0x70(TKEYP), KEY
2356 AESDECLAST KEY STATE1 # last round
2357 AESDECLAST KEY STATE2
2358 AESDECLAST KEY STATE3
2359 AESDECLAST KEY STATE4
2361 ENDPROC(_aesni_dec4)
2364 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2367 ENTRY(aesni_ecb_enc)
2373 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2374 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2375 movl (FRAME_OFFSET+24)(%esp), INP # src
2376 movl (FRAME_OFFSET+28)(%esp), LEN # len
2378 test LEN, LEN # check length
2387 movups (INP), STATE1
2388 movups 0x10(INP), STATE2
2389 movups 0x20(INP), STATE3
2390 movups 0x30(INP), STATE4
2392 movups STATE1, (OUTP)
2393 movups STATE2, 0x10(OUTP)
2394 movups STATE3, 0x20(OUTP)
2395 movups STATE4, 0x30(OUTP)
2405 movups (INP), STATE1
2407 movups STATE1, (OUTP)
2421 ENDPROC(aesni_ecb_enc)
2424 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2427 ENTRY(aesni_ecb_dec)
2433 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2434 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2435 movl (FRAME_OFFSET+24)(%esp), INP # src
2436 movl (FRAME_OFFSET+28)(%esp), LEN # len
2448 movups (INP), STATE1
2449 movups 0x10(INP), STATE2
2450 movups 0x20(INP), STATE3
2451 movups 0x30(INP), STATE4
2453 movups STATE1, (OUTP)
2454 movups STATE2, 0x10(OUTP)
2455 movups STATE3, 0x20(OUTP)
2456 movups STATE4, 0x30(OUTP)
2466 movups (INP), STATE1
2468 movups STATE1, (OUTP)
2482 ENDPROC(aesni_ecb_dec)
2485 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2486 * size_t len, u8 *iv)
2488 ENTRY(aesni_cbc_enc)
2495 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2496 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2497 movl (FRAME_OFFSET+28)(%esp), INP # src
2498 movl (FRAME_OFFSET+32)(%esp), LEN # len
2499 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2504 movups (IVP), STATE # load iv as initial state
2507 movups (INP), IN # load input
2510 movups STATE, (OUTP) # store output
2526 ENDPROC(aesni_cbc_enc)
2529 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2530 * size_t len, u8 *iv)
2532 ENTRY(aesni_cbc_dec)
2539 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2540 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2541 movl (FRAME_OFFSET+28)(%esp), INP # src
2542 movl (FRAME_OFFSET+32)(%esp), LEN # len
2543 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2546 jb .Lcbc_dec_just_ret
2556 movups 0x10(INP), IN2
2559 movups 0x20(INP), IN3
2561 movups 0x30(INP), IN4
2564 movups 0x20(INP), IN1
2566 movups 0x30(INP), IN2
2581 movups 0x10(INP), IN2
2584 movups STATE1, (OUTP)
2585 movups STATE2, 0x10(OUTP)
2586 movups STATE3, 0x20(OUTP)
2587 movups STATE4, 0x30(OUTP)
2601 movups STATE, (OUTP)
2619 ENDPROC(aesni_cbc_dec)
2622 .pushsection .rodata
2625 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2629 * _aesni_inc_init: internal ABI
2630 * setup registers used by _aesni_inc
2634 * CTR: == IV, in little endian
2635 * TCTR_LOW: == lower qword of CTR
2636 * INC: == 1, in little endian
2637 * BSWAP_MASK == endian swapping mask
2641 movaps .Lbswap_mask, BSWAP_MASK
2643 PSHUFB_XMM BSWAP_MASK CTR
2645 MOVQ_R64_XMM TCTR_LOW INC
2646 MOVQ_R64_XMM CTR TCTR_LOW
2648 ENDPROC(_aesni_inc_init)
2651 * _aesni_inc: internal ABI
2652 * Increase IV by 1, IV is in big endian
2655 * CTR: == IV, in little endian
2656 * TCTR_LOW: == lower qword of CTR
2657 * INC: == 1, in little endian
2658 * BSWAP_MASK == endian swapping mask
2662 * CTR: == output IV, in little endian
2663 * TCTR_LOW: == lower qword of CTR
2675 PSHUFB_XMM BSWAP_MASK IV
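#
# Rough C model of the increment (illustrative; bswap128() is a stand-in
# for the PSHUFB byte swap above): the counter is kept byte-swapped
# (little endian) in CTR so the low quadword can be bumped with a plain
# add, and only the copy written to IV is swapped back to big endian:
#
#	ctr_lo += 1;
#	if (ctr_lo == 0)
#		ctr_hi += 1;		/* carry into the high quadword */
#	iv = bswap128(ctr);
#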
2680 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2681 * size_t len, u8 *iv)
2683 ENTRY(aesni_ctr_enc)
2686 jb .Lctr_enc_just_ret
2689 call _aesni_inc_init
2699 movups 0x10(INP), IN2
2702 movups 0x20(INP), IN3
2705 movups 0x30(INP), IN4
2708 movups STATE1, (OUTP)
2710 movups STATE2, 0x10(OUTP)
2712 movups STATE3, 0x20(OUTP)
2714 movups STATE4, 0x30(OUTP)
2729 movups STATE, (OUTP)
2740 ENDPROC(aesni_ctr_enc)
2743 * _aesni_gf128mul_x_ble: internal ABI
2744 * Multiply in GF(2^128) for XTS IVs
2747 * GF128MUL_MASK == mask with 0x87 and 0x01
2751 * CTR: == temporary value
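#
# Rough C model of the multiply-by-x step (illustrative; u128 stands for
# unsigned __int128 and the XTS tweak is treated as a little-endian
# 128-bit integer):
#
#	static u128 gf128mul_x_ble_model(u128 t)
#	{
#		u128 carry = t >> 127;		/* bit that falls off the top */
#
#		return (t << 1) ^ (carry * 0x87); /* fold back, x^7+x^2+x+1 */
#	}
#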
2753 #define _aesni_gf128mul_x_ble() \
2754 pshufd $0x13, IV, CTR; \
2757 pand GF128MUL_MASK, CTR; \
2761 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2764 ENTRY(aesni_xts_crypt8)
2769 leaq _aesni_enc4, %r11
2770 leaq _aesni_dec4, %rax
2774 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2781 movdqu 0x00(INP), INC
2783 movdqu IV, 0x00(OUTP)
2785 _aesni_gf128mul_x_ble()
2787 movdqu 0x10(INP), INC
2789 movdqu IV, 0x10(OUTP)
2791 _aesni_gf128mul_x_ble()
2793 movdqu 0x20(INP), INC
2795 movdqu IV, 0x20(OUTP)
2797 _aesni_gf128mul_x_ble()
2799 movdqu 0x30(INP), INC
2801 movdqu IV, 0x30(OUTP)
2805 movdqu 0x00(OUTP), INC
2807 movdqu STATE1, 0x00(OUTP)
2809 _aesni_gf128mul_x_ble()
2811 movdqu 0x40(INP), INC
2813 movdqu IV, 0x40(OUTP)
2815 movdqu 0x10(OUTP), INC
2817 movdqu STATE2, 0x10(OUTP)
2819 _aesni_gf128mul_x_ble()
2821 movdqu 0x50(INP), INC
2823 movdqu IV, 0x50(OUTP)
2825 movdqu 0x20(OUTP), INC
2827 movdqu STATE3, 0x20(OUTP)
2829 _aesni_gf128mul_x_ble()
2831 movdqu 0x60(INP), INC
2833 movdqu IV, 0x60(OUTP)
2835 movdqu 0x30(OUTP), INC
2837 movdqu STATE4, 0x30(OUTP)
2839 _aesni_gf128mul_x_ble()
2841 movdqu 0x70(INP), INC
2843 movdqu IV, 0x70(OUTP)
2845 _aesni_gf128mul_x_ble()
2850 movdqu 0x40(OUTP), INC
2852 movdqu STATE1, 0x40(OUTP)
2854 movdqu 0x50(OUTP), INC
2856 movdqu STATE2, 0x50(OUTP)
2858 movdqu 0x60(OUTP), INC
2860 movdqu STATE3, 0x60(OUTP)
2862 movdqu 0x70(OUTP), INC
2864 movdqu STATE4, 0x70(OUTP)
2868 ENDPROC(aesni_xts_crypt8)