1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Implement AES algorithm in Intel AES-NI instructions.
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
28 #include <linux/linkage.h>
30 #include <asm/frame.h>
31 #include <asm/nospec-branch.h>
34 * The following macros are used to move an (un)aligned 16 byte value to/from
35 * an XMM register. This can be done for either FP or integer values, for FP use
36 * movaps (move aligned packed single) or integer use movdqa (move double quad
37 * aligned). It doesn't make a performance difference which instruction is used
38 * since Nehalem (original Core i7) was released. However, the movaps is a byte
39 * shorter, so that is the one we'll use for now. (same for unaligned).
46 # constants in mergeable sections, linker can reorder and merge
47 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
# XTS tweak multiplication mask (GF(2^128) doubling constant 0x87)
49 .Lgf128mul_x_ble_mask:
50 .octa 0x00000000000000010000000000000087
51 .section .rodata.cst16.POLY, "aM", @progbits, 16
# GHASH reduction polynomial, bit-reflected representation
53 POLY: .octa 0xC2000000000000000000000000000001
54 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
# used with pcmpeqd when deriving HashKey<<1 mod poly (see PRECOMPUTE)
56 TWOONE: .octa 0x00000001000000000000000000000001
58 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
# pshufb mask that byte-reverses a 16-byte block (endianness swap for GHASH)
60 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
61 .section .rodata.cst16.MASK1, "aM", @progbits, 16
# keep low 64 bits / high 64 bits of an xmm register
63 MASK1: .octa 0x0000000000000000ffffffffffffffff
64 .section .rodata.cst16.MASK2, "aM", @progbits, 16
66 MASK2: .octa 0xffffffffffffffff0000000000000000
67 .section .rodata.cst16.ONE, "aM", @progbits, 16
# counter increment for paddd on the big-endian-swapped counter block
69 ONE: .octa 0x00000000000000000000000000000001
70 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
# NOTE(review): literal has 31 hex digits, so the top nibble is implicitly
# zero — confirm this matches the intended 16-byte pattern
72 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
73 .section .rodata.cst16.dec, "aM", @progbits, 16
76 .section .rodata.cst16.enc, "aM", @progbits, 16
80 # order of these constants should not change.
81 # more specifically, ALL_F should follow SHIFT_MASK,
82 # and zero should follow ALL_F
# non-mergeable section: code below indexes relative to SHIFT_MASK
# (e.g. "lea SHIFT_MASK+16" and "ALL_F-SHIFT_MASK"), so layout is ABI here
83 .section .rodata, "a", @progbits
85 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
86 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
87 .octa 0x00000000000000000000000000000000
# Byte offsets into struct gcm_context_data (passed in %arg2); the
# GCM_INIT/GCM_ENC_DEC comments below name the corresponding C fields.
92 #define STACK_OFFSET 8*3
96 #define InLen (16*1)+8
97 #define PBlockEncKey 16*2
100 #define PBlockLen 16*5
101 #define HashKey 16*6 // store HashKey <<1 mod poly here
102 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
103 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
104 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
105 #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
106 // bits of HashKey <<1 mod poly here
107 //(for Karatsuba purposes)
108 #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
109 // bits of HashKey^2 <<1 mod poly here
110 // (for Karatsuba purposes)
111 #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
112 // bits of HashKey^3 <<1 mod poly here
113 // (for Karatsuba purposes)
114 #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
115 // bits of HashKey^4 <<1 mod poly here
116 // (for Karatsuba purposes)
# Stack-passed arguments 7..11, addressed past the pushed registers
# accounted for by STACK_OFFSET.
124 #define arg7 STACK_OFFSET+8(%rsp)
125 #define arg8 STACK_OFFSET+16(%rsp)
126 #define arg9 STACK_OFFSET+24(%rsp)
127 #define arg10 STACK_OFFSET+32(%rsp)
128 #define arg11 STACK_OFFSET+40(%rsp)
# key length field lives after the 15-round enc + 15-round dec schedules
129 #define keysize 2*15*16(%arg1)
146 #define BSWAP_MASK %xmm10
150 #define GF128MUL_MASK %xmm10
183 # states of %xmm registers %xmm6:%xmm15 not saved
184 # all %xmm registers are clobbered
195 # Precompute hashkeys.
196 # Input: Hash subkey.
197 # Output: HashKeys stored in gcm_context_data. Only needs to be called
199 # clobbers r12, and tmp xmm registers.
200 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
# byte-reflect the hash subkey into GHASH bit order
203 movdqa SHUF_MASK(%rip), \TMP2
204 PSHUFB_XMM \TMP2, \TMP3
206 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
# build the conditional reduction mask: broadcast the MSB test result
# (via TWOONE compare) and AND with POLY, then fold into the shifted key
218 pshufd $0x24, \TMP1, \TMP2
219 pcmpeqd TWOONE(%rip), \TMP2
220 pand POLY(%rip), \TMP2
222 movdqu \TMP3, HashKey(%arg2)
# pshufd $78 swaps the two 64-bit halves; used to form the
# high^low Karatsuba term that is stored in HashKey*_k
225 pshufd $78, \TMP3, \TMP1
227 movdqu \TMP1, HashKey_k(%arg2)
229 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
230 # TMP5 = HashKey^2<<1 (mod poly)
231 movdqu \TMP5, HashKey_2(%arg2)
232 # HashKey_2 = HashKey^2<<1 (mod poly)
233 pshufd $78, \TMP5, \TMP1
235 movdqu \TMP1, HashKey_2_k(%arg2)
237 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
238 # TMP5 = HashKey^3<<1 (mod poly)
239 movdqu \TMP5, HashKey_3(%arg2)
240 pshufd $78, \TMP5, \TMP1
242 movdqu \TMP1, HashKey_3_k(%arg2)
244 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
245 # TMP5 = HashKey^4<<1 (mod poly)
246 movdqu \TMP5, HashKey_4(%arg2)
247 pshufd $78, \TMP5, \TMP1
249 movdqu \TMP1, HashKey_4_k(%arg2)
252 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
253 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
# Resets the running lengths/partial-block state, records the original IV,
# stores the byte-swapped counter, precomputes the hash key powers, and
# hashes the AAD into AadHash.
254 .macro GCM_INIT Iv SUBKEY AAD AADLEN
256 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
258 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
259 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
260 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
263 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
# counter is kept byte-reflected so paddd/ONE can increment it
265 movdqa SHUF_MASK(%rip), %xmm2
266 PSHUFB_XMM %xmm2, %xmm0
267 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
269 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
270 movdqu HashKey(%arg2), %xmm13
272 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
276 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
277 # struct has been initialized by GCM_INIT.
278 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
279 # Clobbers rax, r10-r13, and xmm0-xmm15
# Flow: consume any leftover partial block, process (len mod 4) leading
# blocks, run the 4-blocks-at-a-time main loop, then handle a trailing
# sub-16-byte block with a masked counter-mode encryption.
280 .macro GCM_ENC_DEC operation
281 movdqu AadHash(%arg2), %xmm8
282 movdqu HashKey(%arg2), %xmm13
283 add %arg5, InLen(%arg2)
285 xor %r11d, %r11d # initialise the data pointer offset as zero
286 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
288 sub %r11, %arg5 # sub partial block data used
289 mov %arg5, %r13 # save the number of bytes
291 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
293 # Encrypt/Decrypt first few blocks
# dispatch on (block count mod 4); the i/i_seq macro arguments
# (5,678 / 6,78 / 7,8 / 8,0) select which xmm registers hold blocks
296 jz _initial_num_blocks_is_0_\@
298 jb _initial_num_blocks_is_1_\@
299 je _initial_num_blocks_is_2_\@
300 _initial_num_blocks_is_3_\@:
301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
304 jmp _initial_blocks_\@
305 _initial_num_blocks_is_2_\@:
306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
309 jmp _initial_blocks_\@
310 _initial_num_blocks_is_1_\@:
311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
314 jmp _initial_blocks_\@
315 _initial_num_blocks_is_0_\@:
316 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
317 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
320 # Main loop - Encrypt/Decrypt remaining blocks
323 je _zero_cipher_left_\@
325 je _four_cipher_left_\@
327 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
328 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
333 _four_cipher_left_\@:
334 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
335 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
336 _zero_cipher_left_\@:
# persist running hash and counter so a later update/finalize can resume
337 movdqu %xmm8, AadHash(%arg2)
338 movdqu %xmm0, CurCount(%arg2)
341 and $15, %r13 # %r13 = arg5 (mod 16)
342 je _multiple_of_16_bytes_\@
344 mov %r13, PBlockLen(%arg2)
346 # Handle the last <16 Byte block separately
347 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
348 movdqu %xmm0, CurCount(%arg2)
349 movdqa SHUF_MASK(%rip), %xmm10
350 PSHUFB_XMM %xmm10, %xmm0
352 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
# save keystream block so PARTIAL_BLOCK can finish this block next call
353 movdqu %xmm0, PBlockEncKey(%arg2)
356 jge _large_enough_update_\@
# tail of the buffer: read byte-by-byte to avoid over-reading
358 lea (%arg4,%r11,1), %r10
360 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
363 _large_enough_update_\@:
367 # receive the last <16 Byte block
368 movdqu (%arg4, %r11, 1), %xmm1
373 lea SHIFT_MASK+16(%rip), %r12
374 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
375 # (r13 is the number of bytes in plaintext mod 16)
377 # get the appropriate shuffle mask
379 # shift right 16-r13 bytes
380 PSHUFB_XMM %xmm2, %xmm1
# ALL_F-relative load gives a mask of r13 low 0xff bytes (layout of
# SHIFT_MASK/ALL_F/zero in .rodata is relied on here)
383 lea ALL_F+16(%rip), %r12
389 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
391 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
392 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
395 movdqa SHUF_MASK(%rip), %xmm10
396 PSHUFB_XMM %xmm10 ,%xmm2
400 movdqa SHUF_MASK(%rip), %xmm10
401 PSHUFB_XMM %xmm10,%xmm0
406 movdqu %xmm8, AadHash(%arg2)
408 # GHASH computation for the last <16 byte block
409 movdqa SHUF_MASK(%rip), %xmm10
410 # shuffle xmm0 back to output as ciphertext
411 PSHUFB_XMM %xmm10, %xmm0
# write the partial block out: 8 bytes at a time, then single bytes
415 MOVQ_R64_XMM %xmm0, %rax
417 jle _less_than_8_bytes_left_\@
418 mov %rax, (%arg3 , %r11, 1)
421 MOVQ_R64_XMM %xmm0, %rax
423 _less_than_8_bytes_left_\@:
424 mov %al, (%arg3, %r11, 1)
428 jne _less_than_8_bytes_left_\@
429 _multiple_of_16_bytes_\@:
432 # GCM_COMPLETE Finishes update of tag of last partial block
433 # Output: Authorization Tag (AUTH_TAG)
434 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
# Folds any pending partial block into the hash, appends the
# len(A)||len(C) block, and XORs with E(K, Y0) to produce the tag.
435 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
436 movdqu AadHash(%arg2), %xmm8
437 movdqu HashKey(%arg2), %xmm13
439 mov PBlockLen(%arg2), %r12
444 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
447 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
448 shl $3, %r12 # convert into number of bits
449 movd %r12d, %xmm15 # len(A) in %xmm15
450 mov InLen(%arg2), %r12
451 shl $3, %r12 # len(C) in bits (*128)
452 MOVQ_R64_XMM %r12, %xmm1
454 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
455 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
457 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
458 # final GHASH computation
459 movdqa SHUF_MASK(%rip), %xmm10
460 PSHUFB_XMM %xmm10, %xmm8
462 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
463 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
466 mov \AUTHTAG, %r10 # %r10 = authTag
467 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
473 MOVQ_R64_XMM %xmm0, %rax
499 jmp _return_T_done_\@
506 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
509 * Input: A and B (128-bits each, bit-reflected)
510 * Output: C = A*B*x mod poly, (i.e. >>1 )
511 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
512 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
515 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
# Karatsuba: split each operand into 64-bit halves and do 3 clmuls
517 pshufd $78, \GH, \TMP2
518 pshufd $78, \HK, \TMP3
519 pxor \GH, \TMP2 # TMP2 = a1+a0
520 pxor \HK, \TMP3 # TMP3 = b1+b0
521 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
522 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
523 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
525 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
# distribute the middle 128-bit product across the high/low halves
527 pslldq $8, \TMP3 # left shift TMP3 2 DWs
528 psrldq $8, \TMP2 # right shift TMP2 2 DWs
530 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
532 # first phase of the reduction
536 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
537 # in order to perform
539 pslld $31, \TMP2 # packed left shift <<31
540 pslld $30, \TMP3 # packed left shift <<30
541 pslld $25, \TMP4 # packed left shift <<25
542 pxor \TMP3, \TMP2 # xor the shifted versions
545 psrldq $4, \TMP5 # right shift TMP5 1 DW
546 pslldq $12, \TMP2 # left shift TMP2 3 DWs
549 # second phase of the reduction
551 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
552 # in order to perform
556 psrld $1,\TMP2 # packed right shift >>1
557 psrld $2,\TMP3 # packed right shift >>2
558 psrld $7,\TMP4 # packed right shift >>7
559 pxor \TMP3,\TMP2 # xor the shifted versions
563 pxor \TMP1, \GH # result is in GH
566 # Reads DLEN bytes starting at DPTR and stores in XMMDst
567 # where 0 < DLEN < 16
568 # Clobbers %rax, DLEN and XMM1
# Builds the partial block byte-by-byte (high byte first) so it never
# touches memory past DPTR+DLEN; >=8-byte and <8-byte tails are handled
# by separate loops.
569 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
573 MOVQ_R64_XMM %rax, \XMMDst
575 jz _done_read_partial_block_\@
# bytes 8..DLEN-1 (the part above the low qword)
579 mov 7(\DPTR, \DLEN, 1), %al
581 jnz _read_next_byte_\@
582 MOVQ_R64_XMM %rax, \XMM1
585 jmp _done_read_partial_block_\@
588 _read_next_byte_lt8_\@:
590 mov -1(\DPTR, \DLEN, 1), %al
592 jnz _read_next_byte_lt8_\@
593 MOVQ_R64_XMM %rax, \XMMDst
594 _done_read_partial_block_\@:
597 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
598 # clobbers r10-11, xmm14
# Hashes full 16-byte AAD blocks, then any sub-16-byte remainder via
# READ_PARTIAL_BLOCK; the running hash is stored to AadHash in the context.
599 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
601 MOVADQ SHUF_MASK(%rip), %xmm14
602 mov \AAD, %r10 # %r10 = AAD
603 mov \AADLEN, %r11 # %r11 = aadLen
611 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
613 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
617 jge _get_AAD_blocks\@
621 /* read the last <16B of AAD */
626 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
627 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
629 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
633 movdqu \TMP6, AadHash(%arg2)
636 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
637 # between update calls.
638 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
639 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
640 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
# Resumes a block left unfinished by a previous update call: new input
# bytes are XORed against the saved keystream (PBlockEncKey) and folded
# into the running GHASH once the block completes.
641 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
643 mov PBlockLen(%arg2), %r13
645 je _partial_block_done_\@ # Leave Macro if no partial blocks
646 # Read in input data without over reading
647 cmp $16, \PLAIN_CYPH_LEN
648 jl _fewer_than_16_bytes_\@
649 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
652 _fewer_than_16_bytes_\@:
653 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
654 mov \PLAIN_CYPH_LEN, %r12
655 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
657 mov PBlockLen(%arg2), %r13
659 _data_read_\@: # Finished reading in data
661 movdqu PBlockEncKey(%arg2), %xmm9
662 movdqu HashKey(%arg2), %xmm13
664 lea SHIFT_MASK(%rip), %r12
666 # adjust the shuffle mask pointer to be able to shift r13 bytes
667 # (16-r13 is the number of bytes in plaintext mod 16)
669 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
670 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
# decrypt path: keep the ciphertext for the hash, XOR to get plaintext
674 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
676 mov \PLAIN_CYPH_LEN, %r10
678 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
680 # Determine if partial block is not being filled and
681 # shift mask accordingly
682 jge _no_extra_mask_1_\@
686 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
687 # get the appropriate mask to mask out bottom r13 bytes of xmm9
688 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
689 movdqa SHUF_MASK(%rip), %xmm10
692 PSHUFB_XMM %xmm10, %xmm3
693 PSHUFB_XMM %xmm2, %xmm3
694 pxor %xmm3, \AAD_HASH
697 jl _partial_incomplete_1_\@
699 # GHASH computation for the last <16 Byte block
700 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
703 mov %rax, PBlockLen(%arg2)
705 _partial_incomplete_1_\@:
# block still not full: just bump the stored partial length
706 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
708 movdqu \AAD_HASH, AadHash(%arg2)
# encrypt path: produce ciphertext first, then hash it
710 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
712 mov \PLAIN_CYPH_LEN, %r10
714 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
716 # Determine if partial block is not being filled and
717 # shift mask accordingly
718 jge _no_extra_mask_2_\@
722 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
723 # get the appropriate mask to mask out bottom r13 bytes of xmm9
726 movdqa SHUF_MASK(%rip), %xmm1
727 PSHUFB_XMM %xmm1, %xmm9
728 PSHUFB_XMM %xmm2, %xmm9
729 pxor %xmm9, \AAD_HASH
732 jl _partial_incomplete_2_\@
734 # GHASH computation for the last <16 Byte block
735 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
738 mov %rax, PBlockLen(%arg2)
740 _partial_incomplete_2_\@:
741 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
743 movdqu \AAD_HASH, AadHash(%arg2)
745 movdqa SHUF_MASK(%rip), %xmm10
746 # shuffle xmm9 back to output as ciphertext
747 PSHUFB_XMM %xmm10, %xmm9
748 PSHUFB_XMM %xmm2, %xmm9
750 # output encrypted Bytes
755 # Set r13 to be the number of bytes to write out
759 mov \PLAIN_CYPH_LEN, %r13
762 MOVQ_R64_XMM %xmm0, %rax
764 jle _less_than_8_bytes_left_\@
766 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
769 MOVQ_R64_XMM %xmm0, %rax
771 _less_than_8_bytes_left_\@:
772 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
776 jne _less_than_8_bytes_left_\@
777 _partial_block_done_\@:
778 .endm # PARTIAL_BLOCK
781 * if a = number of total plaintext bytes
783 * num_initial_blocks = b mod 4
784 * encrypt the initial num_initial_blocks blocks and apply ghash on
786 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
788 * arg1, %arg2, %arg3 are used as a pointer only, not modified
# i/i_seq encode which xmm registers hold the 1-3 leading blocks
# (i=5..8, i_seq lists the registers to process; i=8/i_seq=0 means none).
792 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
793 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
794 MOVADQ SHUF_MASK(%rip), %xmm14
796 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
798 # start AES for num_initial_blocks blocks
800 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
802 .if (\i == 5) || (\i == 6) || (\i == 7)
# counter setup + round-0 AddRoundKey for each leading block
804 MOVADQ ONE(%RIP),\TMP1
805 MOVADQ 0(%arg1),\TMP2
807 paddd \TMP1, \XMM0 # INCR Y0
809 movdqa \XMM0, %xmm\index
811 MOVADQ \XMM0, %xmm\index
813 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
814 pxor \TMP2, %xmm\index
# derive round count from keysize field
818 shr $2,%eax # 128->4, 192->6, 256->8
819 add $5,%eax # 128->9, 192->11, 256->13
824 AESENC \TMP1, %xmm\index
828 jnz aes_loop_initial_\@
832 AESENCLAST \TMP1, %xmm\index # Last Round
835 movdqu (%arg4 , %r11, 1), \TMP1
836 pxor \TMP1, %xmm\index
837 movdqu %xmm\index, (%arg3 , %r11, 1)
838 # write back plaintext/ciphertext for num_initial_blocks
842 movdqa \TMP1, %xmm\index
844 PSHUFB_XMM %xmm14, %xmm\index
846 # prepare plaintext/ciphertext for GHASH computation
850 # apply GHASH on num_initial_blocks blocks
# the three GHASH_MUL chains below correspond to 3, 2 or 1 leading
# blocks respectively (selected by i_seq at macro expansion)
854 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
869 jl _initial_blocks_done\@
870 # no need for precomputed values
873 * Precomputations for HashKey parallel with encryption of first 4 blocks.
874 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
# prepare the first 4 main-loop counter blocks
876 MOVADQ ONE(%RIP),\TMP1
877 paddd \TMP1, \XMM0 # INCR Y0
879 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
881 paddd \TMP1, \XMM0 # INCR Y0
883 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
885 paddd \TMP1, \XMM0 # INCR Y0
887 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
889 paddd \TMP1, \XMM0 # INCR Y0
891 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
893 MOVADQ 0(%arg1),\TMP1
898 .irpc index, 1234 # do 4 rounds
899 movaps 0x10*\index(%arg1), \TMP1
905 .irpc index, 56789 # do next 5 rounds
906 movaps 0x10*\index(%arg1), \TMP1
914 shr $2,%eax # 128->4, 192->6, 256->8
915 sub $4,%eax # 128->0, 192->2, 256->4
916 jz aes_loop_pre_done\@
921 AESENC \TMP2, %xmm\index
929 AESENCLAST \TMP2, \XMM1
930 AESENCLAST \TMP2, \XMM2
931 AESENCLAST \TMP2, \XMM3
932 AESENCLAST \TMP2, \XMM4
933 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
936 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
939 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
942 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
945 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
948 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
951 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
954 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
957 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
958 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
959 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
960 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
964 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
966 # combine GHASHed value with the corresponding ciphertext
967 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
968 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
969 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
971 _initial_blocks_done\@:
976 * encrypt 4 blocks at a time
977 * ghash the 4 previously encrypted ciphertext blocks
978 * arg1, %arg3, %arg4 are used as pointers only, not modified
979 * %r11 is the data offset value
# AES rounds for the 4 new counter blocks are interleaved with the
# Karatsuba GHASH of the 4 blocks produced on the previous iteration,
# hiding the pclmulqdq latency behind aesenc.
981 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
982 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
989 movdqa SHUF_MASK(%rip), %xmm15
990 # multiply TMP5 * HashKey using karatsuba
993 pshufd $78, \XMM5, \TMP6
995 paddd ONE(%rip), \XMM0 # INCR CNT
996 movdqu HashKey_4(%arg2), \TMP5
997 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
999 paddd ONE(%rip), \XMM0 # INCR CNT
1001 paddd ONE(%rip), \XMM0 # INCR CNT
1003 paddd ONE(%rip), \XMM0 # INCR CNT
1005 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1006 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1015 movdqu HashKey_4_k(%arg2), \TMP5
1016 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1017 movaps 0x10(%arg1), \TMP1
1018 AESENC \TMP1, \XMM1 # Round 1
1022 movaps 0x20(%arg1), \TMP1
1023 AESENC \TMP1, \XMM1 # Round 2
# second previous block: multiply by HashKey^3
1028 pshufd $78, \XMM6, \TMP2
1030 movdqu HashKey_3(%arg2), \TMP5
1031 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1032 movaps 0x30(%arg1), \TMP3
1033 AESENC \TMP3, \XMM1 # Round 3
1037 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1038 movaps 0x40(%arg1), \TMP3
1039 AESENC \TMP3, \XMM1 # Round 4
1043 movdqu HashKey_3_k(%arg2), \TMP5
1044 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1045 movaps 0x50(%arg1), \TMP3
1046 AESENC \TMP3, \XMM1 # Round 5
1051 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# third previous block: multiply by HashKey^2
1055 pshufd $78, \XMM7, \TMP2
1057 movdqu HashKey_2(%arg2), \TMP5
1059 # Multiply TMP5 * HashKey using karatsuba
1061 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1062 movaps 0x60(%arg1), \TMP3
1063 AESENC \TMP3, \XMM1 # Round 6
1067 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1068 movaps 0x70(%arg1), \TMP3
1069 AESENC \TMP3, \XMM1 # Round 7
1073 movdqu HashKey_2_k(%arg2), \TMP5
1074 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1075 movaps 0x80(%arg1), \TMP3
1076 AESENC \TMP3, \XMM1 # Round 8
1081 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1085 # Multiply XMM8 * HashKey
1086 # XMM8 and TMP5 hold the values for the two operands
1089 pshufd $78, \XMM8, \TMP2
1091 movdqu HashKey(%arg2), \TMP5
1092 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1093 movaps 0x90(%arg1), \TMP3
1094 AESENC \TMP3, \XMM1 # Round 9
1098 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
# extra rounds for AES-192/256, driven by keysize
1099 lea 0xa0(%arg1),%r10
1101 shr $2,%eax # 128->4, 192->6, 256->8
1102 sub $4,%eax # 128->0, 192->2, 256->4
1103 jz aes_loop_par_enc_done\@
1108 AESENC \TMP3, %xmm\index
1112 jnz aes_loop_par_enc\@
1114 aes_loop_par_enc_done\@:
1115 MOVADQ (%r10), \TMP3
1116 AESENCLAST \TMP3, \XMM1 # Round 10
1117 AESENCLAST \TMP3, \XMM2
1118 AESENCLAST \TMP3, \XMM3
1119 AESENCLAST \TMP3, \XMM4
1120 movdqu HashKey_k(%arg2), \TMP5
1121 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1122 movdqu (%arg4,%r11,1), \TMP3
1123 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1124 movdqu 16(%arg4,%r11,1), \TMP3
1125 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1126 movdqu 32(%arg4,%r11,1), \TMP3
1127 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1128 movdqu 48(%arg4,%r11,1), \TMP3
1129 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1130 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1133 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1134 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1135 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1136 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1137 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1145 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1146 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1148 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1150 # first phase of reduction
1155 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156 pslld $31, \TMP2 # packed left shift << 31
1157 pslld $30, \TMP3 # packed left shift << 30
1158 pslld $25, \TMP4 # packed left shift << 25
1159 pxor \TMP3, \TMP2 # xor the shifted versions
1162 psrldq $4, \TMP5 # right shift T5 1 DW
1163 pslldq $12, \TMP2 # left shift T2 3 DWs
1166 # second phase of reduction
1168 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1171 psrld $1, \TMP2 # packed right shift >>1
1172 psrld $2, \TMP3 # packed right shift >>2
1173 psrld $7, \TMP4 # packed right shift >>7
1174 pxor \TMP3,\TMP2 # xor the shifted versions
1178 pxor \TMP1, \XMM5 # result is in XMM5
1184 * decrypt 4 blocks at a time
1185 * ghash the 4 previously decrypted ciphertext blocks
1186 * arg1, %arg3, %arg4 are used as pointers only, not modified
1187 * %r11 is the data offset value
# Decrypt twin of GHASH_4_ENCRYPT_4_PARALLEL_enc; the only structural
# difference is that the input ciphertext (saved in TMP3 loads) is what
# gets GHASHed, and the XOR result is written out as plaintext.
1189 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1190 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1197 movdqa SHUF_MASK(%rip), %xmm15
1198 # multiply TMP5 * HashKey using karatsuba
1201 pshufd $78, \XMM5, \TMP6
1203 paddd ONE(%rip), \XMM0 # INCR CNT
1204 movdqu HashKey_4(%arg2), \TMP5
1205 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1207 paddd ONE(%rip), \XMM0 # INCR CNT
1209 paddd ONE(%rip), \XMM0 # INCR CNT
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1213 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1214 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1215 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1216 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1217 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1223 movdqu HashKey_4_k(%arg2), \TMP5
1224 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1225 movaps 0x10(%arg1), \TMP1
1226 AESENC \TMP1, \XMM1 # Round 1
1230 movaps 0x20(%arg1), \TMP1
1231 AESENC \TMP1, \XMM1 # Round 2
# second previous block: multiply by HashKey^3
1236 pshufd $78, \XMM6, \TMP2
1238 movdqu HashKey_3(%arg2), \TMP5
1239 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1240 movaps 0x30(%arg1), \TMP3
1241 AESENC \TMP3, \XMM1 # Round 3
1245 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1246 movaps 0x40(%arg1), \TMP3
1247 AESENC \TMP3, \XMM1 # Round 4
1251 movdqu HashKey_3_k(%arg2), \TMP5
1252 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1253 movaps 0x50(%arg1), \TMP3
1254 AESENC \TMP3, \XMM1 # Round 5
1259 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
# third previous block: multiply by HashKey^2
1263 pshufd $78, \XMM7, \TMP2
1265 movdqu HashKey_2(%arg2), \TMP5
1267 # Multiply TMP5 * HashKey using karatsuba
1269 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1270 movaps 0x60(%arg1), \TMP3
1271 AESENC \TMP3, \XMM1 # Round 6
1275 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1276 movaps 0x70(%arg1), \TMP3
1277 AESENC \TMP3, \XMM1 # Round 7
1281 movdqu HashKey_2_k(%arg2), \TMP5
1282 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1283 movaps 0x80(%arg1), \TMP3
1284 AESENC \TMP3, \XMM1 # Round 8
1289 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1293 # Multiply XMM8 * HashKey
1294 # XMM8 and TMP5 hold the values for the two operands
1297 pshufd $78, \XMM8, \TMP2
1299 movdqu HashKey(%arg2), \TMP5
1300 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1301 movaps 0x90(%arg1), \TMP3
1302 AESENC \TMP3, \XMM1 # Round 9
1306 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
# extra rounds for AES-192/256, driven by keysize
1307 lea 0xa0(%arg1),%r10
1309 shr $2,%eax # 128->4, 192->6, 256->8
1310 sub $4,%eax # 128->0, 192->2, 256->4
1311 jz aes_loop_par_dec_done\@
1316 AESENC \TMP3, %xmm\index
1320 jnz aes_loop_par_dec\@
1322 aes_loop_par_dec_done\@:
1323 MOVADQ (%r10), \TMP3
1324 AESENCLAST \TMP3, \XMM1 # last round
1325 AESENCLAST \TMP3, \XMM2
1326 AESENCLAST \TMP3, \XMM3
1327 AESENCLAST \TMP3, \XMM4
1328 movdqu HashKey_k(%arg2), \TMP5
1329 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1330 movdqu (%arg4,%r11,1), \TMP3
1331 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1332 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1334 movdqu 16(%arg4,%r11,1), \TMP3
1335 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1336 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1338 movdqu 32(%arg4,%r11,1), \TMP3
1339 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1340 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1342 movdqu 48(%arg4,%r11,1), \TMP3
1343 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1344 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1346 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1347 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1348 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1349 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1357 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1358 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1360 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1362 # first phase of reduction
1367 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368 pslld $31, \TMP2 # packed left shift << 31
1369 pslld $30, \TMP3 # packed left shift << 30
1370 pslld $25, \TMP4 # packed left shift << 25
1371 pxor \TMP3, \TMP2 # xor the shifted versions
1374 psrldq $4, \TMP5 # right shift T5 1 DW
1375 pslldq $12, \TMP2 # left shift T2 3 DWs
1378 # second phase of reduction
1380 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1383 psrld $1, \TMP2 # packed right shift >>1
1384 psrld $2, \TMP3 # packed right shift >>2
1385 psrld $7, \TMP4 # packed right shift >>7
1386 pxor \TMP3,\TMP2 # xor the shifted versions
1390 pxor \TMP1, \XMM5 # result is in XMM5
1395 /* GHASH the last 4 ciphertext blocks. */
# XMM1..XMM4 are multiplied by HashKey^4..HashKey^1 respectively and the
# partial products are accumulated (Karatsuba per block), then one
# combined reduction mod the GHASH polynomial yields XMMDst.
1396 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1399 # Multiply TMP6 * HashKey (using Karatsuba)
1402 pshufd $78, \XMM1, \TMP2
1404 movdqu HashKey_4(%arg2), \TMP5
1405 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1406 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1407 movdqu HashKey_4_k(%arg2), \TMP4
1408 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1409 movdqa \XMM1, \XMMDst
1410 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1412 # Multiply TMP1 * HashKey (using Karatsuba)
1415 pshufd $78, \XMM2, \TMP2
1417 movdqu HashKey_3(%arg2), \TMP5
1418 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1419 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1420 movdqu HashKey_3_k(%arg2), \TMP4
1421 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1425 # results accumulated in TMP6, XMMDst, XMM1
1427 # Multiply TMP1 * HashKey (using Karatsuba)
1430 pshufd $78, \XMM3, \TMP2
1432 movdqu HashKey_2(%arg2), \TMP5
1433 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1434 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1435 movdqu HashKey_2_k(%arg2), \TMP4
1436 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1439 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1441 # Multiply TMP1 * HashKey (using Karatsuba)
1443 pshufd $78, \XMM4, \TMP2
1445 movdqu HashKey(%arg2), \TMP5
1446 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1447 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1448 movdqu HashKey_k(%arg2), \TMP4
1449 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1455 # middle section of the temp results combined as in karatsuba algorithm
1457 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1458 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1461 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462 # first phase of the reduction
1463 movdqa \XMMDst, \TMP2
1464 movdqa \XMMDst, \TMP3
1465 movdqa \XMMDst, \TMP4
1466 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467 pslld $31, \TMP2 # packed left shifting << 31
1468 pslld $30, \TMP3 # packed left shifting << 30
1469 pslld $25, \TMP4 # packed left shifting << 25
1470 pxor \TMP3, \TMP2 # xor the shifted versions
1473 psrldq $4, \TMP7 # right shift TMP7 1 DW
1474 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1477 # second phase of the reduction
1478 movdqa \XMMDst, \TMP2
1479 # make 3 copies of XMMDst for doing 3 shift operations
1480 movdqa \XMMDst, \TMP3
1481 movdqa \XMMDst, \TMP4
1482 psrld $1, \TMP2 # packed right shift >> 1
1483 psrld $2, \TMP3 # packed right shift >> 2
1484 psrld $7, \TMP4 # packed right shift >> 7
1485 pxor \TMP3, \TMP2 # xor the shifted versions
1489 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1493 /* Encryption of a single block
1497 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
# Derive the AES round count from the value already in %eax
# (per the inline comments), then walk the expanded key schedule
# starting at 16(%arg1).
# NOTE(review): the load of %eax and the per-round AESENC loop are
# elided in this excerpt — confirm against the full source.
1501 shr $2,%eax # 128->4, 192->6, 256->8
1502 add $5,%eax # 128->9, 192->11, 256->13
1503 lea 16(%arg1), %r10 # get first expanded key address
1513 AESENCLAST \TMP1,\XMM0
1515 /*****************************************************************************
1516 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1517 * struct gcm_context_data *data
1519 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1520 * const u8 *in, // Ciphertext input
1521 * u64 plaintext_len, // Length of data in bytes for decryption.
1522 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1523 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524 * // concatenated with 0x00000001. 16-byte aligned pointer.
1525 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526 * const u8 *aad, // Additional Authentication Data (AAD)
1527 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1529 * // given authentication tag and only return the plaintext if they match.
1530 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531 * // (most likely), 12 or 8.
1536 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1537 * set of 11 keys in the data structure void *aes_ctx
1541 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 * | Salt (From the SA) |
1544 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 * | Initialization Vector |
1546 * | (This is the sequence number from IPSec header) |
1547 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554 * AAD padded to 128 bits with 0
1555 * for example, assume AAD is a u32 vector
1557 * if AAD is 8 bytes:
1558 * AAD[3] = {A0, A1};
1559 * padded AAD in xmm register = {A1 A0 0 0}
1562 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566 * | 32-bit Sequence Number (A0) |
1567 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1571 * AAD Format with 32-bit Sequence Number
1573 * if AAD is 12 bytes:
1574 * AAD[3] = {A0, A1, A2};
1575 * padded AAD in xmm register = {A2 A1 A0 0}
1578 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1579 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1580 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1581 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 * | 64-bit Extended Sequence Number {A1,A0} |
1586 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1590 * AAD Format with 64-bit Extended Sequence Number
1592 * poly = x^128 + x^127 + x^126 + x^121 + 1
1594 *****************************************************************************/
1595 ENTRY(aesni_gcm_dec)
# GCM_INIT consumes iv/hash_subkey/aad/aad_len (args 6-9);
# GCM_COMPLETE emits the auth tag into auth_tag/auth_tag_len
# (args 10-11). Stack-passed args (7+) are referenced without '%'.
1598 GCM_INIT %arg6, arg7, arg8, arg9
1600 GCM_COMPLETE arg10, arg11
1603 ENDPROC(aesni_gcm_dec)
1606 /*****************************************************************************
1607 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1608 * struct gcm_context_data *data
1610 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1611 * const u8 *in, // Plaintext input
1612 * u64 plaintext_len, // Length of data in bytes for encryption.
1613 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1614 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615 * // concatenated with 0x00000001. 16-byte aligned pointer.
1616 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617 * const u8 *aad, // Additional Authentication Data (AAD)
1618 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619 * u8 *auth_tag, // Authenticated Tag output.
1620 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1626 * keys are pre-expanded and aligned to 16 bytes. we are using the
1627 * first set of 11 keys in the data structure void *aes_ctx
1632 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 * | Salt (From the SA) |
1635 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 * | Initialization Vector |
1637 * | (This is the sequence number from IPSec header) |
1638 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645 * AAD padded to 128 bits with 0
1646 * for example, assume AAD is a u32 vector
1648 * if AAD is 8 bytes:
1649 * AAD[3] = {A0, A1};
1650 * padded AAD in xmm register = {A1 A0 0 0}
1653 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657 * | 32-bit Sequence Number (A0) |
1658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1662 * AAD Format with 32-bit Sequence Number
1664 * if AAD is 12 bytes:
1665 * AAD[3] = {A0, A1, A2};
1666 * padded AAD in xmm register = {A2 A1 A0 0}
1669 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 * | 64-bit Extended Sequence Number {A1,A0} |
1675 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1679 * AAD Format with 64-bit Extended Sequence Number
1681 * poly = x^128 + x^127 + x^126 + x^121 + 1
1682 ***************************************************************************/
1683 ENTRY(aesni_gcm_enc)
# Same arg layout as aesni_gcm_dec: GCM_INIT takes iv/hash_subkey/
# aad/aad_len (args 6-9), GCM_COMPLETE writes the tag (args 10-11).
1686 GCM_INIT %arg6, arg7, arg8, arg9
1689 GCM_COMPLETE arg10, arg11
1692 ENDPROC(aesni_gcm_enc)
1694 /*****************************************************************************
1695 * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1696 * struct gcm_context_data *data,
1698 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1699 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1700 * // concatenated with 0x00000001. 16-byte aligned pointer.
1701 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1702 * const u8 *aad, // Additional Authentication Data (AAD)
1703 * u64 aad_len) // Length of AAD in bytes.
1705 ENTRY(aesni_gcm_init)
# Scatter-gather entry point: only performs GCM setup (hash subkey,
# counter block, AAD hashing); enc/dec/finalize are separate calls.
1707 GCM_INIT %arg3, %arg4,%arg5, %arg6
1710 ENDPROC(aesni_gcm_init)
1712 /*****************************************************************************
1713 * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1714 * struct gcm_context_data *data,
1716 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1717 * const u8 *in, // Plaintext input
1718 * u64 plaintext_len, // Length of data in bytes for encryption.
1720 ENTRY(aesni_gcm_enc_update)
# Incremental encrypt step; body (GCM_ENC_DEC) elided in this excerpt.
1725 ENDPROC(aesni_gcm_enc_update)
1727 /*****************************************************************************
1728 * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1729 * struct gcm_context_data *data,
1731 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1732 * const u8 *in, // Plaintext input
1733 * u64 plaintext_len, // Length of data in bytes for encryption.
1735 ENTRY(aesni_gcm_dec_update)
# Incremental decrypt step; body (GCM_ENC_DEC) elided in this excerpt.
1740 ENDPROC(aesni_gcm_dec_update)
1742 /*****************************************************************************
1743 * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1744 * struct gcm_context_data *data,
1746 * u8 *auth_tag, // Authenticated Tag output.
1747 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1750 ENTRY(aesni_gcm_finalize)
# Finish GHASH over lengths and emit the tag (arg3 = auth_tag,
# arg4 = auth_tag_len). GAS macro args may be space-separated.
1752 GCM_COMPLETE %arg3 %arg4
1755 ENDPROC(aesni_gcm_finalize)
# Key-schedule helper: derives the next round key from the
# AESKEYGENASSIST result in %xmm1 and the previous key in %xmm0,
# storing it at (TKEYP). The two ENDPROCs below indicate
# _key_expansion_128 shares this body (its label is in lines
# elided from this excerpt) — TODO confirm against full source.
1762 _key_expansion_256a:
1763 pshufd $0b11111111, %xmm1, %xmm1 # broadcast keygen-assist dword
1764 shufps $0b00010000, %xmm0, %xmm4
1766 shufps $0b10001100, %xmm0, %xmm4
1769 movaps %xmm0, (TKEYP)
1772 ENDPROC(_key_expansion_128)
1773 ENDPROC(_key_expansion_256a)
# AES-192 schedule helper (variant that emits two 16-byte round-key
# slots per call). Several xor/shuffle lines are elided here.
1776 _key_expansion_192a:
1777 pshufd $0b01010101, %xmm1, %xmm1 # broadcast keygen-assist dword
1778 shufps $0b00010000, %xmm0, %xmm4
1780 shufps $0b10001100, %xmm0, %xmm4
1787 pshufd $0b11111111, %xmm0, %xmm3
1792 shufps $0b01000100, %xmm0, %xmm6
1793 movaps %xmm6, (TKEYP)
1794 shufps $0b01001110, %xmm2, %xmm1
1795 movaps %xmm1, 0x10(TKEYP) # second round-key slot
1798 ENDPROC(_key_expansion_192a)
# AES-192 schedule helper (variant that emits one round-key slot).
1801 _key_expansion_192b:
1802 pshufd $0b01010101, %xmm1, %xmm1 # broadcast keygen-assist dword
1803 shufps $0b00010000, %xmm0, %xmm4
1805 shufps $0b10001100, %xmm0, %xmm4
1811 pshufd $0b11111111, %xmm0, %xmm3
1815 movaps %xmm0, (TKEYP)
1818 ENDPROC(_key_expansion_192b)
# AES-256 schedule helper for the odd rounds: operates on %xmm2
# (the second half of the 256-bit key) and stores it at (TKEYP).
1821 _key_expansion_256b:
1822 pshufd $0b10101010, %xmm1, %xmm1 # broadcast keygen-assist dword
1823 shufps $0b00010000, %xmm2, %xmm4
1825 shufps $0b10001100, %xmm2, %xmm4
1828 movaps %xmm2, (TKEYP)
1831 ENDPROC(_key_expansion_256b)
1834 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1835 * unsigned int key_len)
1837 ENTRY(aesni_set_key)
# Expand the user key into the context's round-key schedule.
# The movl (%esp) loads below are the 32-bit (i386) argument path;
# the x86_64 register-argument path and the key-length dispatch
# branches are elided from this excerpt.
1841 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1842 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1843 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1845 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1846 movaps %xmm0, (KEYP)
1847 lea 0x10(KEYP), TKEYP # key addr
1848 movl %edx, 480(KEYP) # stash key_len in the ctx
1849 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# ---- AES-256 schedule: 14 round keys via alternating 256a/256b ----
1853 movups 0x10(UKEYP), %xmm2 # other user key
1854 movaps %xmm2, (TKEYP)
1856 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1857 call _key_expansion_256a
1858 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1859 call _key_expansion_256b
1860 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1861 call _key_expansion_256a
1862 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1863 call _key_expansion_256b
1864 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1865 call _key_expansion_256a
1866 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1867 call _key_expansion_256b
1868 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1869 call _key_expansion_256a
1870 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1871 call _key_expansion_256b
1872 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1873 call _key_expansion_256a
1874 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1875 call _key_expansion_256b
1876 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1877 call _key_expansion_256a
1878 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1879 call _key_expansion_256b
1880 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1881 call _key_expansion_256a
# ---- AES-192 schedule: rounds alternate 192a/192b ----
1884 movq 0x10(UKEYP), %xmm2 # other user key
1885 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1886 call _key_expansion_192a
1887 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1888 call _key_expansion_192b
1889 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1890 call _key_expansion_192a
1891 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1892 call _key_expansion_192b
1893 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1894 call _key_expansion_192a
1895 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1896 call _key_expansion_192b
1897 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1898 call _key_expansion_192a
1899 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1900 call _key_expansion_192b
# ---- AES-128 schedule: rcon sequence 0x01..0x36 ----
1903 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1904 call _key_expansion_128
1905 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1906 call _key_expansion_128
1907 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1908 call _key_expansion_128
1909 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1910 call _key_expansion_128
1911 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1912 call _key_expansion_128
1913 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1914 call _key_expansion_128
1915 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1916 call _key_expansion_128
1917 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1918 call _key_expansion_128
1919 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1920 call _key_expansion_128
1921 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1922 call _key_expansion_128
# ---- Build the decryption schedule by mirroring the round keys ----
# (the AESIMC loop that inverts the middle keys is elided here —
# TODO confirm against the full source)
1925 movaps (KEYP), %xmm0
1926 movaps (TKEYP), %xmm1
1927 movaps %xmm0, 240(TKEYP)
1928 movaps %xmm1, 240(KEYP)
1930 lea 240-16(TKEYP), UKEYP
1933 movaps (KEYP), %xmm0
1935 movaps %xmm1, (UKEYP)
1946 ENDPROC(aesni_set_key)
1949 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
# Single-block encrypt wrapper. The ENTRY directive and the call to
# _aesni_enc1 fall in lines elided from this excerpt; the movl
# (%esp) loads are the i386 argument path.
1956 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1957 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1958 movl (FRAME_OFFSET+20)(%esp), INP # src
1960 movl 480(KEYP), KLEN # key length
1961 movups (INP), STATE # input
1963 movups STATE, (OUTP) # output
1973 * _aesni_enc1: internal ABI
1975 * KEYP: key struct pointer
1977 * STATE: initial state (input)
1979 * STATE: final state (output)
# NOTE(review): the AESENC instructions between key loads, and the
# KLEN-based branches that skip the extra 192/256-bit rounds, are
# elided from this excerpt.
1986 movaps (KEYP), KEY # key
1988 pxor KEY, STATE # round 0 (AddRoundKey)
1992 lea 0x20(TKEYP), TKEYP
1995 movaps -0x60(TKEYP), KEY
1997 movaps -0x50(TKEYP), KEY
2001 movaps -0x40(TKEYP), KEY
2003 movaps -0x30(TKEYP), KEY
2007 movaps -0x20(TKEYP), KEY
2009 movaps -0x10(TKEYP), KEY
2013 movaps 0x10(TKEYP), KEY
2015 movaps 0x20(TKEYP), KEY
2017 movaps 0x30(TKEYP), KEY
2019 movaps 0x40(TKEYP), KEY
2021 movaps 0x50(TKEYP), KEY
2023 movaps 0x60(TKEYP), KEY
2025 movaps 0x70(TKEYP), KEY
2026 AESENCLAST KEY STATE # final round
2028 ENDPROC(_aesni_enc1)
2031 * _aesni_enc4: internal ABI
2033 * KEYP: key struct pointer
2035 * STATE1: initial state (input)
2040 * STATE1: final state (output)
# 4-way interleaved encryption: one key load feeds four AESENCs to
# hide instruction latency. The AESENC groups between key loads are
# elided from this excerpt.
2050 movaps (KEYP), KEY # key
2052 pxor KEY, STATE1 # round 0 (AddRoundKey)
2059 lea 0x20(TKEYP), TKEYP
2062 movaps -0x60(TKEYP), KEY
2067 movaps -0x50(TKEYP), KEY
2074 movaps -0x40(TKEYP), KEY
2079 movaps -0x30(TKEYP), KEY
2086 movaps -0x20(TKEYP), KEY
2091 movaps -0x10(TKEYP), KEY
2101 movaps 0x10(TKEYP), KEY
2106 movaps 0x20(TKEYP), KEY
2111 movaps 0x30(TKEYP), KEY
2116 movaps 0x40(TKEYP), KEY
2121 movaps 0x50(TKEYP), KEY
2126 movaps 0x60(TKEYP), KEY
2131 movaps 0x70(TKEYP), KEY
2132 AESENCLAST KEY STATE1 # last round
2133 AESENCLAST KEY STATE2
2134 AESENCLAST KEY STATE3
2135 AESENCLAST KEY STATE4
2137 ENDPROC(_aesni_enc4)
2140 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
# Single-block decrypt wrapper (i386 argument path visible). The
# ENTRY directive, the +240 decrypt-key offset adjustment and the
# call to _aesni_dec1 are elided from this excerpt.
2147 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2148 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2149 movl (FRAME_OFFSET+20)(%esp), INP # src
2151 mov 480(KEYP), KLEN # key length
2153 movups (INP), STATE # input
2155 movups STATE, (OUTP) #output
2165 * _aesni_dec1: internal ABI
2167 * KEYP: key struct pointer
2169 * STATE: initial state (input)
2171 * STATE: final state (output)
# Mirror of _aesni_enc1 using AESDEC/AESDECLAST over the inverse
# key schedule; the AESDEC instructions between key loads and the
# KLEN round-count branches are elided from this excerpt.
2178 movaps (KEYP), KEY # key
2180 pxor KEY, STATE # round 0 (AddRoundKey)
2184 lea 0x20(TKEYP), TKEYP
2187 movaps -0x60(TKEYP), KEY
2189 movaps -0x50(TKEYP), KEY
2193 movaps -0x40(TKEYP), KEY
2195 movaps -0x30(TKEYP), KEY
2199 movaps -0x20(TKEYP), KEY
2201 movaps -0x10(TKEYP), KEY
2205 movaps 0x10(TKEYP), KEY
2207 movaps 0x20(TKEYP), KEY
2209 movaps 0x30(TKEYP), KEY
2211 movaps 0x40(TKEYP), KEY
2213 movaps 0x50(TKEYP), KEY
2215 movaps 0x60(TKEYP), KEY
2217 movaps 0x70(TKEYP), KEY
2218 AESDECLAST KEY STATE # final round
2220 ENDPROC(_aesni_dec1)
2223 * _aesni_dec4: internal ABI
2225 * KEYP: key struct pointer
2227 * STATE1: initial state (input)
2232 * STATE1: final state (output)
# 4-way interleaved decryption, mirroring _aesni_enc4; AESDEC
# groups between key loads are elided from this excerpt.
2242 movaps (KEYP), KEY # key
2244 pxor KEY, STATE1 # round 0 (AddRoundKey)
2251 lea 0x20(TKEYP), TKEYP
2254 movaps -0x60(TKEYP), KEY
2259 movaps -0x50(TKEYP), KEY
2266 movaps -0x40(TKEYP), KEY
2271 movaps -0x30(TKEYP), KEY
2278 movaps -0x20(TKEYP), KEY
2283 movaps -0x10(TKEYP), KEY
2293 movaps 0x10(TKEYP), KEY
2298 movaps 0x20(TKEYP), KEY
2303 movaps 0x30(TKEYP), KEY
2308 movaps 0x40(TKEYP), KEY
2313 movaps 0x50(TKEYP), KEY
2318 movaps 0x60(TKEYP), KEY
2323 movaps 0x70(TKEYP), KEY
2324 AESDECLAST KEY STATE1 # last round
2325 AESDECLAST KEY STATE2
2326 AESDECLAST KEY STATE3
2327 AESDECLAST KEY STATE4
2329 ENDPROC(_aesni_dec4)
2332 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2335 ENTRY(aesni_ecb_enc)
# ECB encrypt: 4-blocks-at-a-time main loop (via _aesni_enc4), then
# a single-block tail loop. Loop labels/branches are elided here.
2341 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2342 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2343 movl (FRAME_OFFSET+24)(%esp), INP # src
2344 movl (FRAME_OFFSET+28)(%esp), LEN # len
2346 test LEN, LEN # check length
# 4-block path
2355 movups (INP), STATE1
2356 movups 0x10(INP), STATE2
2357 movups 0x20(INP), STATE3
2358 movups 0x30(INP), STATE4
2360 movups STATE1, (OUTP)
2361 movups STATE2, 0x10(OUTP)
2362 movups STATE3, 0x20(OUTP)
2363 movups STATE4, 0x30(OUTP)
# single-block tail
2373 movups (INP), STATE1
2375 movups STATE1, (OUTP)
2389 ENDPROC(aesni_ecb_enc)
2392 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2395 ENTRY(aesni_ecb_dec)
# ECB decrypt: same shape as aesni_ecb_enc but through _aesni_dec4/
# _aesni_dec1 (calls and branches elided from this excerpt).
2401 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2402 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2403 movl (FRAME_OFFSET+24)(%esp), INP # src
2404 movl (FRAME_OFFSET+28)(%esp), LEN # len
# 4-block path
2416 movups (INP), STATE1
2417 movups 0x10(INP), STATE2
2418 movups 0x20(INP), STATE3
2419 movups 0x30(INP), STATE4
2421 movups STATE1, (OUTP)
2422 movups STATE2, 0x10(OUTP)
2423 movups STATE3, 0x20(OUTP)
2424 movups STATE4, 0x30(OUTP)
# single-block tail
2434 movups (INP), STATE1
2436 movups STATE1, (OUTP)
2450 ENDPROC(aesni_ecb_dec)
2453 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2454 * size_t len, u8 *iv)
2456 ENTRY(aesni_cbc_enc)
# CBC encrypt is inherently serial: each block is XORed with the
# previous ciphertext (held in STATE) before encryption. The pxor,
# _aesni_enc1 call and loop control are elided from this excerpt.
2463 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2464 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2465 movl (FRAME_OFFSET+28)(%esp), INP # src
2466 movl (FRAME_OFFSET+32)(%esp), LEN # len
2467 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2472 movups (IVP), STATE # load iv as initial state
2475 movups (INP), IN # load input
2478 movups STATE, (OUTP) # store output
2494 ENDPROC(aesni_cbc_enc)
2497 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2498 * size_t len, u8 *iv)
2500 ENTRY(aesni_cbc_dec)
# CBC decrypt can be parallelized: decrypt 4 blocks with
# _aesni_dec4, then XOR each with the preceding ciphertext block.
# The decrypt calls, pxor chain and loop labels are elided here.
2507 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2508 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2509 movl (FRAME_OFFSET+28)(%esp), INP # src
2510 movl (FRAME_OFFSET+32)(%esp), LEN # len
2511 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2514 jb .Lcbc_dec_just_ret # len < one block: nothing to do
2524 movups 0x10(INP), IN2
2527 movups 0x20(INP), IN3
2529 movups 0x30(INP), IN4
2532 movups 0x20(INP), IN1
2534 movups 0x30(INP), IN2
2549 movups 0x10(INP), IN2
2552 movups STATE1, (OUTP)
2553 movups STATE2, 0x10(OUTP)
2554 movups STATE3, 0x20(OUTP)
2555 movups STATE4, 0x30(OUTP)
2569 movups STATE, (OUTP)
2587 ENDPROC(aesni_cbc_dec)
2590 .pushsection .rodata
# byte-reversal shuffle mask (.Lbswap_mask label is in an elided line)
2593 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2597 * _aesni_inc_init: internal ABI
2598 * setup registers used by _aesni_inc
2602 * CTR: == IV, in little endian
2603 * TCTR_LOW: == lower qword of CTR
2604 * INC: == 1, in little endian
2605 * BSWAP_MASK == endian swapping mask
2609 movaps .Lbswap_mask, BSWAP_MASK
2611 PSHUFB_XMM BSWAP_MASK CTR # byte-swap IV into little endian
2613 MOVQ_R64_XMM TCTR_LOW INC
2614 MOVQ_R64_XMM CTR TCTR_LOW
2616 ENDPROC(_aesni_inc_init)
2619 * _aesni_inc: internal ABI
2620 * Increase IV by 1, IV is in big endian
2623 * CTR: == IV, in little endian
2624 * TCTR_LOW: == lower qword of CTR
2625 * INC: == 1, in little endian
2626 * BSWAP_MASK == endian swapping mask
2630 * CTR: == output IV, in little endian
2631 * TCTR_LOW: == lower qword of CTR
# the add/carry handling lines are elided from this excerpt;
# only the final byte-swap back to big endian is visible
2643 PSHUFB_XMM BSWAP_MASK IV
2648 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
2649 * size_t len, u8 *iv)
2651 ENTRY(aesni_ctr_enc)
# CTR mode: counter blocks are incremented (_aesni_inc), encrypted
# 4 at a time (_aesni_enc4), then XORed into the input. The inc/
# enc calls and pxor lines are elided from this excerpt.
2654 jb .Lctr_enc_just_ret # len < one block: nothing to do
2657 call _aesni_inc_init
2667 movups 0x10(INP), IN2
2670 movups 0x20(INP), IN3
2673 movups 0x30(INP), IN4
2676 movups STATE1, (OUTP)
2678 movups STATE2, 0x10(OUTP)
2680 movups STATE3, 0x20(OUTP)
2682 movups STATE4, 0x30(OUTP)
# single-block tail
2697 movups STATE, (OUTP)
2708 ENDPROC(aesni_ctr_enc)
2711 * _aesni_gf128mul_x_ble: internal ABI
2712 * Multiply in GF(2^128) for XTS IVs
2715 * GF128MUL_MASK == mask with 0x87 and 0x01
2719 * CTR: == temporary value
# C-preprocessor macro (backslash-continued — do not insert lines
# inside it). Doubles the XTS tweak in GF(2^128), little-endian
# block convention; middle lines of the define are elided here.
2721 #define _aesni_gf128mul_x_ble() \
2722 pshufd $0x13, IV, CTR; \
2725 pand GF128MUL_MASK, CTR; \
2729 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2730 * const u8 *src, unsigned int len, le128 *iv)
2732 ENTRY(aesni_xts_encrypt)
# XTS encrypt, 4 blocks per iteration: each input block is XORed
# with the current tweak (IV), the tweak is stashed in the output
# buffer, blocks are encrypted via _aesni_enc4 (call elided), and
# the stashed tweaks are XORed back in. The tweak is advanced with
# _aesni_gf128mul_x_ble between blocks.
2735 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2742 movdqu 0x00(INP), INC
2744 movdqu IV, 0x00(OUTP) # stash tweak 0 in dst
2746 _aesni_gf128mul_x_ble()
2748 movdqu 0x10(INP), INC
2750 movdqu IV, 0x10(OUTP) # stash tweak 1
2752 _aesni_gf128mul_x_ble()
2754 movdqu 0x20(INP), INC
2756 movdqu IV, 0x20(OUTP) # stash tweak 2
2758 _aesni_gf128mul_x_ble()
2760 movdqu 0x30(INP), INC
2762 movdqu IV, 0x30(OUTP) # stash tweak 3
2766 movdqu 0x00(OUTP), INC # reload tweak 0, xor into result
2768 movdqu STATE1, 0x00(OUTP)
2770 movdqu 0x10(OUTP), INC
2772 movdqu STATE2, 0x10(OUTP)
2774 movdqu 0x20(OUTP), INC
2776 movdqu STATE3, 0x20(OUTP)
2778 movdqu 0x30(OUTP), INC
2780 movdqu STATE4, 0x30(OUTP)
2782 _aesni_gf128mul_x_ble() # advance tweak for next iteration
2793 ENDPROC(aesni_xts_encrypt)
2796 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2797 * const u8 *src, unsigned int len, le128 *iv)
2799 ENTRY(aesni_xts_decrypt)
# XTS decrypt: identical tweak-stash/xor structure to
# aesni_xts_encrypt but through the decryption path (_aesni_dec4
# call and key-offset setup elided from this excerpt).
2802 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2810 movdqu 0x00(INP), INC
2812 movdqu IV, 0x00(OUTP) # stash tweak 0 in dst
2814 _aesni_gf128mul_x_ble()
2816 movdqu 0x10(INP), INC
2818 movdqu IV, 0x10(OUTP) # stash tweak 1
2820 _aesni_gf128mul_x_ble()
2822 movdqu 0x20(INP), INC
2824 movdqu IV, 0x20(OUTP) # stash tweak 2
2826 _aesni_gf128mul_x_ble()
2828 movdqu 0x30(INP), INC
2830 movdqu IV, 0x30(OUTP) # stash tweak 3
2834 movdqu 0x00(OUTP), INC # reload tweak 0, xor into result
2836 movdqu STATE1, 0x00(OUTP)
2838 movdqu 0x10(OUTP), INC
2840 movdqu STATE2, 0x10(OUTP)
2842 movdqu 0x20(OUTP), INC
2844 movdqu STATE3, 0x20(OUTP)
2846 movdqu 0x30(OUTP), INC
2848 movdqu STATE4, 0x30(OUTP)
2850 _aesni_gf128mul_x_ble() # advance tweak for next iteration
2861 ENDPROC(aesni_xts_decrypt)