########################################################################
# Copyright (c) 2013, Intel Corporation
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the
# * Neither the name of the Intel Corporation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
## from the definition of the spec, aadLen can only be 8 or 12 bytes.
## The code additionally supports an aadLen of 16 bytes.
## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentations are used. One tab is
## for the GHASH part, two tabs are for the AES part.
122 #include <linux/linkage.h>
124 # constants in mergeable sections, linker can reorder and merge
125 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 POLY: .octa 0xC2000000000000000000000000000001
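# (For reference: bit i of POLY corresponds to the x^i term of the reduction
#  polynomial above, with the x^128 term implicit -- 0xC2...01 sets bits 127,
#  126, 121 and 0, i.e. x^127 + x^126 + x^121 + 1.)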
129 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 POLY2: .octa 0xC20000000000000000000001C2000000
133 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 TWOONE: .octa 0x00000001000000000000000000000001
137 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 ONE: .octa 0x00000000000000000000000000000001
145 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 ONEf: .octa 0x01000000000000000000000000000000
# The order of these constants must not change.
# More specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F.
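# (The partial-block code below addresses these as ALL_F-SHIFT_MASK(%r12) and
#  relies on the all-zero block that follows ALL_F for masking, so the three
#  must stay adjacent and in this order.)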
151 .section .rodata, "a", @progbits
153 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
159 .type aad_shift_arr, @object
160 .size aad_shift_arr, 272
162 .octa 0xffffffffffffffffffffffffffffffff
163 .octa 0xffffffffffffffffffffffffffffff0C
164 .octa 0xffffffffffffffffffffffffffff0D0C
165 .octa 0xffffffffffffffffffffffffff0E0D0C
166 .octa 0xffffffffffffffffffffffff0F0E0D0C
167 .octa 0xffffffffffffffffffffff0C0B0A0908
168 .octa 0xffffffffffffffffffff0D0C0B0A0908
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
171 .octa 0xffffffffffffff0C0B0A090807060504
172 .octa 0xffffffffffff0D0C0B0A090807060504
173 .octa 0xffffffffff0E0D0C0B0A090807060504
174 .octa 0xffffffff0F0E0D0C0B0A090807060504
175 .octa 0xffffff0C0B0A09080706050403020100
176 .octa 0xffff0D0C0B0A09080706050403020100
177 .octa 0xff0E0D0C0B0A09080706050403020100
178 .octa 0x0F0E0D0C0B0A09080706050403020100
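# (aad_shift_arr provides vpshufb masks used by CALC_AAD_HASH below when the
#  trailing AAD bytes are read with 4/8-byte loads: a selector byte of 0xff
#  zeroes the corresponding destination byte, so each entry roughly keeps only
#  the AAD bytes that were actually read and discards the over-read ones.)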
186 #define InLen (16*1)+8
187 #define PBlockEncKey 16*2
189 #define CurCount 16*4
190 #define PBlockLen 16*5
192 HashKey = 16*6 # store HashKey <<1 mod poly here
193 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
HashKey_k = 16*14 # store XOR of the high and low 64-bit halves of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k = 16*15 # store XOR of the high and low 64-bit halves of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k = 16*16 # store XOR of the high and low 64-bit halves of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k = 16*17 # store XOR of the high and low 64-bit halves of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k = 16*18 # store XOR of the high and low 64-bit halves of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k = 16*19 # store XOR of the high and low 64-bit halves of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k = 16*20 # store XOR of the high and low 64-bit halves of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k = 16*21 # store XOR of the high and low 64-bit halves of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
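# The eight HashKey powers above let GHASH_8_ENCRYPT_8_PARALLEL hash eight
# ciphertext blocks per iteration: block i is multiplied by HashKey^(9-i), so
# the eight products can simply be XORed together before a single reduction.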
215 #define keysize 2*15*16(arg1)
225 .macro define_reg r n
236 TMP1 = 16*0 # Temporary storage for AAD
237 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
238 TMP3 = 16*2 # Temporary storage for AES State 3
239 TMP4 = 16*3 # Temporary storage for AES State 4
240 TMP5 = 16*4 # Temporary storage for AES State 5
241 TMP6 = 16*5 # Temporary storage for AES State 6
242 TMP7 = 16*6 # Temporary storage for AES State 7
243 TMP8 = 16*7 # Temporary storage for AES State 8
245 VARIABLE_OFFSET = 16*8
247 ################################
249 ################################
259 sub $VARIABLE_OFFSET, %rsp
260 and $~63, %rsp # align rsp to 64 bytes
272 # Encryption of a single block
273 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
274 vpxor (arg1), \XMM0, \XMM0
278 vaesenc 16*i(arg1), \XMM0, \XMM0
282 vaesenclast 16*i(arg1), \XMM0, \XMM0
285 # combined for GCM encrypt and decrypt functions
286 # clobbering all xmm registers
287 # clobbering r10, r11, r12, r13, r15, rax
288 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
289 vmovdqu AadHash(arg2), %xmm8
290 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
291 add arg5, InLen(arg2)
293 # initialize the data pointer offset as zero
296 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
299 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
300 and $-16, %r13 # r13 = r13 - (r13 mod 16)
305 jz _initial_num_blocks_is_0\@
308 je _initial_num_blocks_is_7\@
310 je _initial_num_blocks_is_6\@
312 je _initial_num_blocks_is_5\@
314 je _initial_num_blocks_is_4\@
316 je _initial_num_blocks_is_3\@
318 je _initial_num_blocks_is_2\@
320 jmp _initial_num_blocks_is_1\@
322 _initial_num_blocks_is_7\@:
323 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
325 jmp _initial_blocks_encrypted\@
327 _initial_num_blocks_is_6\@:
328 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
330 jmp _initial_blocks_encrypted\@
332 _initial_num_blocks_is_5\@:
333 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 jmp _initial_blocks_encrypted\@
337 _initial_num_blocks_is_4\@:
338 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340 jmp _initial_blocks_encrypted\@
342 _initial_num_blocks_is_3\@:
343 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345 jmp _initial_blocks_encrypted\@
347 _initial_num_blocks_is_2\@:
348 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350 jmp _initial_blocks_encrypted\@
352 _initial_num_blocks_is_1\@:
353 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 jmp _initial_blocks_encrypted\@
357 _initial_num_blocks_is_0\@:
358 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
361 _initial_blocks_encrypted\@:
363 je _zero_cipher_left\@
366 je _eight_cipher_left\@
373 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
383 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
386 jne _encrypt_by_8_new\@
388 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
389 jmp _eight_cipher_left\@
392 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
395 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
398 jne _encrypt_by_8_new\@
400 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
405 _eight_cipher_left\@:
406 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
410 vmovdqu %xmm14, AadHash(arg2)
411 vmovdqu %xmm9, CurCount(arg2)
415 and $15, %r13 # r13 = (arg5 mod 16)
417 je _multiple_of_16_bytes\@
419 # handle the last <16 Byte block separately
421 mov %r13, PBlockLen(arg2)
423 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
424 vmovdqu %xmm9, CurCount(arg2)
425 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
427 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
428 vmovdqu %xmm9, PBlockEncKey(arg2)
431 jge _large_enough_update\@
433 lea (arg4,%r11,1), %r10
436 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
438 lea SHIFT_MASK+16(%rip), %r12
439 sub %r13, %r12 # adjust the shuffle mask pointer to be
440 # able to shift 16-r13 bytes (r13 is the
441 # number of bytes in plaintext mod 16)
443 jmp _final_ghash_mul\@
445 _large_enough_update\@:
449 # receive the last <16 Byte block
450 vmovdqu (arg4, %r11, 1), %xmm1
455 lea SHIFT_MASK+16(%rip), %r12
456 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
457 # (r13 is the number of bytes in plaintext mod 16)
459 # get the appropriate shuffle mask
460 vmovdqu (%r12), %xmm2
461 # shift right 16-r13 bytes
462 vpshufb %xmm2, %xmm1, %xmm1
467 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
468 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
469 # mask out top 16-r13 bytes of xmm9
470 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
471 vpand %xmm1, %xmm2, %xmm2
472 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
473 vpxor %xmm2, %xmm14, %xmm14
475 vmovdqu %xmm14, AadHash(arg2)
477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
482 vpxor %xmm9, %xmm14, %xmm14
484 vmovdqu %xmm14, AadHash(arg2)
485 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
489 #############################
493 jle _less_than_8_bytes_left\@
495 mov %rax, (arg3 , %r11)
497 vpsrldq $8, %xmm9, %xmm9
501 _less_than_8_bytes_left\@:
502 movb %al, (arg3 , %r11)
506 jne _less_than_8_bytes_left\@
507 #############################
509 _multiple_of_16_bytes\@:
# GCM_COMPLETE finishes the tag update for the last partial block and computes the final tag
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
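# Roughly: the final GHASH block is len(A) || len(C) (both in bits); the
# resulting hash is byte-swapped, XORed with E(K, Y0) (the encrypted original
# counter block), and the first AUTH_TAG_LEN bytes (8, 12 or 16) of the result
# are written out as the tag.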
516 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
517 vmovdqu AadHash(arg2), %xmm14
518 vmovdqu HashKey(arg2), %xmm13
520 mov PBlockLen(arg2), %r12
524 #GHASH computation for the last <16 Byte block
525 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
528 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
529 shl $3, %r12 # convert into number of bits
530 vmovd %r12d, %xmm15 # len(A) in xmm15
532 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*8)
535 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
536 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
538 vpxor %xmm15, %xmm14, %xmm14
539 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
540 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
542 vmovdqu OrigIV(arg2), %xmm9
544 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
546 vpxor %xmm14, %xmm9, %xmm9
551 mov \AUTH_TAG, %r10 # r10 = authTag
552 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
565 vpsrldq $8, %xmm9, %xmm9
573 vpsrldq $4, %xmm9, %xmm9
590 vmovdqu %xmm9, (%r10)
595 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
597 mov \AAD, %r10 # r10 = AAD
598 mov \AADLEN, %r12 # r12 = aadLen
609 vpshufb SHUF_MASK(%rip), \T7, \T7
611 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
616 jge _get_AAD_blocks\@
623 /* read the last <16B of AAD. since we have at least 4B of
624 data right after the AAD (the ICV, and maybe some CT), we can
625 read 4B/8B blocks safely, and then get rid of the extra stuff */
643 vpslldq $12, \T1, \T1
647 /* finalize: shift out the extra bytes we read, and align
648 left. since pslldq can only shift by an immediate, we use
649 vpshufb and an array of shuffle masks */
652 vmovdqu aad_shift_arr(%r11), \T1
653 vpshufb \T1, \T7, \T7
654 _get_AAD_rest_final\@:
655 vpshufb SHUF_MASK(%rip), \T7, \T7
657 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
660 vmovdqu \T7, AadHash(arg2)
663 .macro INIT GHASH_MUL PRECOMPUTE
665 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
667 mov %r11, InLen(arg2) # ctx_data.in_length = 0
669 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
670 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
673 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
675 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
676 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
678 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
680 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
681 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
683 vpsllq $1, %xmm6, %xmm6
684 vpsrlq $63, %xmm2, %xmm2
686 vpslldq $8, %xmm2, %xmm2
687 vpsrldq $8, %xmm1, %xmm1
688 vpor %xmm2, %xmm6, %xmm6
690 vpshufd $0b00100100, %xmm1, %xmm2
691 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
692 vpand POLY(%rip), %xmm2, %xmm2
693 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
694 #######################################################################
695 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
697 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
699 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
703 # Reads DLEN bytes starting at DPTR and stores in XMMDst
704 # where 0 < DLEN < 16
705 # Clobbers %rax, DLEN
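# Example: with DLEN = 11, the first 8 bytes are loaded directly as the low
# qword of XMMDst; the remaining 3 bytes are then accumulated one at a time
# into %rax (from the highest offset down) and inserted as the high qword, so
# no byte past DPTR+DLEN-1 is ever touched.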
706 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
707 vpxor \XMMDst, \XMMDst, \XMMDst
712 vpinsrq $0, %rax, \XMMDst, \XMMDst
714 jz _done_read_partial_block_\@
718 mov 7(\DPTR, \DLEN, 1), %al
720 jnz _read_next_byte_\@
721 vpinsrq $1, %rax, \XMMDst, \XMMDst
722 jmp _done_read_partial_block_\@
725 _read_next_byte_lt8_\@:
727 mov -1(\DPTR, \DLEN, 1), %al
729 jnz _read_next_byte_lt8_\@
730 vpinsrq $0, %rax, \XMMDst, \XMMDst
731 _done_read_partial_block_\@:
734 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
735 # between update calls.
# Requires the input data to be at least 1 byte long, due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates the hash and partial-block info in gcm_context_data
738 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
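# Sketch of the idea: if a previous update call ended mid-block, PBlockLen
# holds how many bytes of that block were already consumed and PBlockEncKey
# holds E(K, Yn) for it. The bytes supplied here are XORed against the unused
# part of that keystream block, folded into AadHash once the block completes,
# and written out before the bulk loop sees any full blocks.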
739 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
741 mov PBlockLen(arg2), %r13
743 je _partial_block_done_\@ # Leave Macro if no partial blocks
744 # Read in input data without over reading
745 cmp $16, \PLAIN_CYPH_LEN
746 jl _fewer_than_16_bytes_\@
747 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
750 _fewer_than_16_bytes_\@:
751 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
752 mov \PLAIN_CYPH_LEN, %r12
753 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
755 mov PBlockLen(arg2), %r13
757 _data_read_\@: # Finished reading in data
759 vmovdqu PBlockEncKey(arg2), %xmm9
760 vmovdqu HashKey(arg2), %xmm13
762 lea SHIFT_MASK(%rip), %r12
# adjust the shuffle mask pointer to be able to shift r13 bytes
# (r13 = PBlockLen, the number of bytes already in the partial block)
767 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
768 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
772 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
774 mov \PLAIN_CYPH_LEN, %r10
776 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if the partial block is not being completely filled and
# shift the mask accordingly
780 jge _no_extra_mask_1_\@
784 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
785 # get the appropriate mask to mask out bottom r13 bytes of xmm9
786 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
788 vpand %xmm1, %xmm3, %xmm3
789 vmovdqa SHUF_MASK(%rip), %xmm10
790 vpshufb %xmm10, %xmm3, %xmm3
791 vpshufb %xmm2, %xmm3, %xmm3
792 vpxor %xmm3, \AAD_HASH, \AAD_HASH
795 jl _partial_incomplete_1_\@
797 # GHASH computation for the last <16 Byte block
798 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
801 mov %rax, PBlockLen(arg2)
803 _partial_incomplete_1_\@:
804 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
806 vmovdqu \AAD_HASH, AadHash(arg2)
808 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
810 mov \PLAIN_CYPH_LEN, %r10
812 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if the partial block is not being completely filled and
# shift the mask accordingly
816 jge _no_extra_mask_2_\@
820 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
821 # get the appropriate mask to mask out bottom r13 bytes of xmm9
822 vpand %xmm1, %xmm9, %xmm9
824 vmovdqa SHUF_MASK(%rip), %xmm1
825 vpshufb %xmm1, %xmm9, %xmm9
826 vpshufb %xmm2, %xmm9, %xmm9
827 vpxor %xmm9, \AAD_HASH, \AAD_HASH
830 jl _partial_incomplete_2_\@
832 # GHASH computation for the last <16 Byte block
833 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
836 mov %rax, PBlockLen(arg2)
838 _partial_incomplete_2_\@:
839 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
841 vmovdqu \AAD_HASH, AadHash(arg2)
843 vmovdqa SHUF_MASK(%rip), %xmm10
844 # shuffle xmm9 back to output as ciphertext
845 vpshufb %xmm10, %xmm9, %xmm9
846 vpshufb %xmm2, %xmm9, %xmm9
848 # output encrypted Bytes
853 # Set r13 to be the number of bytes to write out
857 mov \PLAIN_CYPH_LEN, %r13
862 jle _less_than_8_bytes_left_\@
864 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
869 _less_than_8_bytes_left_\@:
870 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
874 jne _less_than_8_bytes_left_\@
875 _partial_block_done_\@:
876 .endm # PARTIAL_BLOCK
878 ###############################################################################
879 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
880 # Input: A and B (128-bits each, bit-reflected)
881 # Output: C = A*B*x mod poly, (i.e. >>1 )
882 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
883 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
884 ###############################################################################
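# One-iteration Karatsuba sketch: split GH = a1:a0 and HK = b1:b0 into 64-bit
# halves. Then, in GF(2) (where addition and subtraction are both XOR):
#   a*b = a1*b1*x^128 ^ [(a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0]*x^64 ^ a0*b0
# i.e. three carry-less multiplies instead of four. The 256-bit product is
# then reduced modulo the polynomial in the two shift/XOR phases below.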
885 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
887 vpshufd $0b01001110, \GH, \T2
888 vpshufd $0b01001110, \HK, \T3
889 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
890 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
892 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
893 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
894 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
896 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
898 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
899 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
901 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
903 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shift << 31
vpslld $30, \GH, \T3 # packed left shift << 30
vpslld $25, \GH, \T4 # packed left shift << 25
908 vpxor \T3, \T2, \T2 # xor the shifted versions
911 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
913 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
914 vpxor \T2, \GH, \GH # first phase of the reduction complete
916 #second phase of the reduction
vpsrld $1,\GH, \T2 # packed right shift >> 1
vpsrld $2,\GH, \T3 # packed right shift >> 2
vpsrld $7,\GH, \T4 # packed right shift >> 7
921 vpxor \T3, \T2, \T2 # xor the shifted versions
926 vpxor \T1, \GH, \GH # the result is in GH
931 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
936 vpshufd $0b01001110, \T5, \T1
938 vmovdqu \T1, HashKey_k(arg2)
940 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
941 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
942 vpshufd $0b01001110, \T5, \T1
944 vmovdqu \T1, HashKey_2_k(arg2)
946 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
947 vmovdqu \T5, HashKey_3(arg2)
948 vpshufd $0b01001110, \T5, \T1
950 vmovdqu \T1, HashKey_3_k(arg2)
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
953 vmovdqu \T5, HashKey_4(arg2)
954 vpshufd $0b01001110, \T5, \T1
956 vmovdqu \T1, HashKey_4_k(arg2)
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
959 vmovdqu \T5, HashKey_5(arg2)
960 vpshufd $0b01001110, \T5, \T1
962 vmovdqu \T1, HashKey_5_k(arg2)
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
965 vmovdqu \T5, HashKey_6(arg2)
966 vpshufd $0b01001110, \T5, \T1
968 vmovdqu \T1, HashKey_6_k(arg2)
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
971 vmovdqu \T5, HashKey_7(arg2)
972 vpshufd $0b01001110, \T5, \T1
974 vmovdqu \T1, HashKey_7_k(arg2)
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
977 vmovdqu \T5, HashKey_8(arg2)
978 vpshufd $0b01001110, \T5, \T1
980 vmovdqu \T1, HashKey_8_k(arg2)
984 ## if a = number of total plaintext bytes
## num_initial_blocks = b mod 4
987 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
988 ## r10, r11, r12, rax are clobbered
989 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
991 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
992 i = (8-\num_initial_blocks)
994 vmovdqu AadHash(arg2), reg_i
996 # start AES for num_initial_blocks blocks
997 vmovdqu CurCount(arg2), \CTR
999 i = (9-\num_initial_blocks)
1001 .rep \num_initial_blocks
1002 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1004 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1009 vmovdqa (arg1), \T_key
1010 i = (9-\num_initial_blocks)
1012 .rep \num_initial_blocks
1013 vpxor \T_key, reg_i, reg_i
1021 vmovdqa 16*j(arg1), \T_key
1022 i = (9-\num_initial_blocks)
1024 .rep \num_initial_blocks
1025 vaesenc \T_key, reg_i, reg_i
1034 vmovdqa 16*j(arg1), \T_key
1035 i = (9-\num_initial_blocks)
1037 .rep \num_initial_blocks
1038 vaesenclast \T_key, reg_i, reg_i
1043 i = (9-\num_initial_blocks)
1045 .rep \num_initial_blocks
1046 vmovdqu (arg4, %r11), \T1
1047 vpxor \T1, reg_i, reg_i
1048 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1053 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1059 i = (8-\num_initial_blocks)
1060 j = (9-\num_initial_blocks)
1063 .rep \num_initial_blocks
1064 vpxor reg_i, reg_j, reg_j
1065 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1070 # XMM8 has the combined result here
1072 vmovdqa \XMM8, TMP1(%rsp)
1076 jl _initial_blocks_done\@ # no need for precomputed constants
1078 ###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
1080 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1082 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1084 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1086 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1088 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1090 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1094 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1098 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1102 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1106 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1110 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1112 vmovdqa (arg1), \T_key
1113 vpxor \T_key, \XMM1, \XMM1
1114 vpxor \T_key, \XMM2, \XMM2
1115 vpxor \T_key, \XMM3, \XMM3
1116 vpxor \T_key, \XMM4, \XMM4
1117 vpxor \T_key, \XMM5, \XMM5
1118 vpxor \T_key, \XMM6, \XMM6
1119 vpxor \T_key, \XMM7, \XMM7
1120 vpxor \T_key, \XMM8, \XMM8
1124 .rep \REP # do REP rounds
1125 vmovdqa 16*i(arg1), \T_key
1126 vaesenc \T_key, \XMM1, \XMM1
1127 vaesenc \T_key, \XMM2, \XMM2
1128 vaesenc \T_key, \XMM3, \XMM3
1129 vaesenc \T_key, \XMM4, \XMM4
1130 vaesenc \T_key, \XMM5, \XMM5
1131 vaesenc \T_key, \XMM6, \XMM6
1132 vaesenc \T_key, \XMM7, \XMM7
1133 vaesenc \T_key, \XMM8, \XMM8
1138 vmovdqa 16*i(arg1), \T_key
1139 vaesenclast \T_key, \XMM1, \XMM1
1140 vaesenclast \T_key, \XMM2, \XMM2
1141 vaesenclast \T_key, \XMM3, \XMM3
1142 vaesenclast \T_key, \XMM4, \XMM4
1143 vaesenclast \T_key, \XMM5, \XMM5
1144 vaesenclast \T_key, \XMM6, \XMM6
1145 vaesenclast \T_key, \XMM7, \XMM7
1146 vaesenclast \T_key, \XMM8, \XMM8
1148 vmovdqu (arg4, %r11), \T1
1149 vpxor \T1, \XMM1, \XMM1
1150 vmovdqu \XMM1, (arg3 , %r11)
1155 vmovdqu 16*1(arg4, %r11), \T1
1156 vpxor \T1, \XMM2, \XMM2
1157 vmovdqu \XMM2, 16*1(arg3 , %r11)
1162 vmovdqu 16*2(arg4, %r11), \T1
1163 vpxor \T1, \XMM3, \XMM3
1164 vmovdqu \XMM3, 16*2(arg3 , %r11)
1169 vmovdqu 16*3(arg4, %r11), \T1
1170 vpxor \T1, \XMM4, \XMM4
1171 vmovdqu \XMM4, 16*3(arg3 , %r11)
1176 vmovdqu 16*4(arg4, %r11), \T1
1177 vpxor \T1, \XMM5, \XMM5
1178 vmovdqu \XMM5, 16*4(arg3 , %r11)
1183 vmovdqu 16*5(arg4, %r11), \T1
1184 vpxor \T1, \XMM6, \XMM6
1185 vmovdqu \XMM6, 16*5(arg3 , %r11)
1190 vmovdqu 16*6(arg4, %r11), \T1
1191 vpxor \T1, \XMM7, \XMM7
1192 vmovdqu \XMM7, 16*6(arg3 , %r11)
1197 vmovdqu 16*7(arg4, %r11), \T1
1198 vpxor \T1, \XMM8, \XMM8
1199 vmovdqu \XMM8, 16*7(arg3 , %r11)
1206 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1207 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1208 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1209 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1210 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1211 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1212 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1213 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1214 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1216 ###############################################################################
1218 _initial_blocks_done\@:
1222 # encrypt 8 blocks at a time
1223 # ghash the 8 previously encrypted ciphertext blocks
1224 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
1225 # r11 is the data offset value
1226 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1229 vmovdqa \XMM2, TMP2(%rsp)
1230 vmovdqa \XMM3, TMP3(%rsp)
1231 vmovdqa \XMM4, TMP4(%rsp)
1232 vmovdqa \XMM5, TMP5(%rsp)
1233 vmovdqa \XMM6, TMP6(%rsp)
1234 vmovdqa \XMM7, TMP7(%rsp)
1235 vmovdqa \XMM8, TMP8(%rsp)
1237 .if \loop_idx == in_order
1238 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1239 vpaddd ONE(%rip), \XMM1, \XMM2
1240 vpaddd ONE(%rip), \XMM2, \XMM3
1241 vpaddd ONE(%rip), \XMM3, \XMM4
1242 vpaddd ONE(%rip), \XMM4, \XMM5
1243 vpaddd ONE(%rip), \XMM5, \XMM6
1244 vpaddd ONE(%rip), \XMM6, \XMM7
1245 vpaddd ONE(%rip), \XMM7, \XMM8
1248 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1249 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1250 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1251 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1252 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1253 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1254 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1255 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1257 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1258 vpaddd ONEf(%rip), \XMM1, \XMM2
1259 vpaddd ONEf(%rip), \XMM2, \XMM3
1260 vpaddd ONEf(%rip), \XMM3, \XMM4
1261 vpaddd ONEf(%rip), \XMM4, \XMM5
1262 vpaddd ONEf(%rip), \XMM5, \XMM6
1263 vpaddd ONEf(%rip), \XMM6, \XMM7
1264 vpaddd ONEf(%rip), \XMM7, \XMM8
1269 #######################################################################
1272 vpxor \T1, \XMM1, \XMM1
1273 vpxor \T1, \XMM2, \XMM2
1274 vpxor \T1, \XMM3, \XMM3
1275 vpxor \T1, \XMM4, \XMM4
1276 vpxor \T1, \XMM5, \XMM5
1277 vpxor \T1, \XMM6, \XMM6
1278 vpxor \T1, \XMM7, \XMM7
1279 vpxor \T1, \XMM8, \XMM8
1281 #######################################################################
1287 vmovdqu 16*1(arg1), \T1
1288 vaesenc \T1, \XMM1, \XMM1
1289 vaesenc \T1, \XMM2, \XMM2
1290 vaesenc \T1, \XMM3, \XMM3
1291 vaesenc \T1, \XMM4, \XMM4
1292 vaesenc \T1, \XMM5, \XMM5
1293 vaesenc \T1, \XMM6, \XMM6
1294 vaesenc \T1, \XMM7, \XMM7
1295 vaesenc \T1, \XMM8, \XMM8
1297 vmovdqu 16*2(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1308 #######################################################################
1310 vmovdqu HashKey_8(arg2), \T5
1311 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1312 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1314 vpshufd $0b01001110, \T2, \T6
1317 vmovdqu HashKey_8_k(arg2), \T5
1318 vpclmulqdq $0x00, \T5, \T6, \T6
1320 vmovdqu 16*3(arg1), \T1
1321 vaesenc \T1, \XMM1, \XMM1
1322 vaesenc \T1, \XMM2, \XMM2
1323 vaesenc \T1, \XMM3, \XMM3
1324 vaesenc \T1, \XMM4, \XMM4
1325 vaesenc \T1, \XMM5, \XMM5
1326 vaesenc \T1, \XMM6, \XMM6
1327 vaesenc \T1, \XMM7, \XMM7
1328 vaesenc \T1, \XMM8, \XMM8
1330 vmovdqa TMP2(%rsp), \T1
1331 vmovdqu HashKey_7(arg2), \T5
1332 vpclmulqdq $0x11, \T5, \T1, \T3
1334 vpclmulqdq $0x00, \T5, \T1, \T3
1337 vpshufd $0b01001110, \T1, \T3
1339 vmovdqu HashKey_7_k(arg2), \T5
1340 vpclmulqdq $0x10, \T5, \T3, \T3
1343 vmovdqu 16*4(arg1), \T1
1344 vaesenc \T1, \XMM1, \XMM1
1345 vaesenc \T1, \XMM2, \XMM2
1346 vaesenc \T1, \XMM3, \XMM3
1347 vaesenc \T1, \XMM4, \XMM4
1348 vaesenc \T1, \XMM5, \XMM5
1349 vaesenc \T1, \XMM6, \XMM6
1350 vaesenc \T1, \XMM7, \XMM7
1351 vaesenc \T1, \XMM8, \XMM8
1353 #######################################################################
1355 vmovdqa TMP3(%rsp), \T1
1356 vmovdqu HashKey_6(arg2), \T5
1357 vpclmulqdq $0x11, \T5, \T1, \T3
1359 vpclmulqdq $0x00, \T5, \T1, \T3
1362 vpshufd $0b01001110, \T1, \T3
1364 vmovdqu HashKey_6_k(arg2), \T5
1365 vpclmulqdq $0x10, \T5, \T3, \T3
1368 vmovdqu 16*5(arg1), \T1
1369 vaesenc \T1, \XMM1, \XMM1
1370 vaesenc \T1, \XMM2, \XMM2
1371 vaesenc \T1, \XMM3, \XMM3
1372 vaesenc \T1, \XMM4, \XMM4
1373 vaesenc \T1, \XMM5, \XMM5
1374 vaesenc \T1, \XMM6, \XMM6
1375 vaesenc \T1, \XMM7, \XMM7
1376 vaesenc \T1, \XMM8, \XMM8
1378 vmovdqa TMP4(%rsp), \T1
1379 vmovdqu HashKey_5(arg2), \T5
1380 vpclmulqdq $0x11, \T5, \T1, \T3
1382 vpclmulqdq $0x00, \T5, \T1, \T3
1385 vpshufd $0b01001110, \T1, \T3
1387 vmovdqu HashKey_5_k(arg2), \T5
1388 vpclmulqdq $0x10, \T5, \T3, \T3
1391 vmovdqu 16*6(arg1), \T1
1392 vaesenc \T1, \XMM1, \XMM1
1393 vaesenc \T1, \XMM2, \XMM2
1394 vaesenc \T1, \XMM3, \XMM3
1395 vaesenc \T1, \XMM4, \XMM4
1396 vaesenc \T1, \XMM5, \XMM5
1397 vaesenc \T1, \XMM6, \XMM6
1398 vaesenc \T1, \XMM7, \XMM7
1399 vaesenc \T1, \XMM8, \XMM8
1402 vmovdqa TMP5(%rsp), \T1
1403 vmovdqu HashKey_4(arg2), \T5
1404 vpclmulqdq $0x11, \T5, \T1, \T3
1406 vpclmulqdq $0x00, \T5, \T1, \T3
1409 vpshufd $0b01001110, \T1, \T3
1411 vmovdqu HashKey_4_k(arg2), \T5
1412 vpclmulqdq $0x10, \T5, \T3, \T3
1415 vmovdqu 16*7(arg1), \T1
1416 vaesenc \T1, \XMM1, \XMM1
1417 vaesenc \T1, \XMM2, \XMM2
1418 vaesenc \T1, \XMM3, \XMM3
1419 vaesenc \T1, \XMM4, \XMM4
1420 vaesenc \T1, \XMM5, \XMM5
1421 vaesenc \T1, \XMM6, \XMM6
1422 vaesenc \T1, \XMM7, \XMM7
1423 vaesenc \T1, \XMM8, \XMM8
1425 vmovdqa TMP6(%rsp), \T1
1426 vmovdqu HashKey_3(arg2), \T5
1427 vpclmulqdq $0x11, \T5, \T1, \T3
1429 vpclmulqdq $0x00, \T5, \T1, \T3
1432 vpshufd $0b01001110, \T1, \T3
1434 vmovdqu HashKey_3_k(arg2), \T5
1435 vpclmulqdq $0x10, \T5, \T3, \T3
1439 vmovdqu 16*8(arg1), \T1
1440 vaesenc \T1, \XMM1, \XMM1
1441 vaesenc \T1, \XMM2, \XMM2
1442 vaesenc \T1, \XMM3, \XMM3
1443 vaesenc \T1, \XMM4, \XMM4
1444 vaesenc \T1, \XMM5, \XMM5
1445 vaesenc \T1, \XMM6, \XMM6
1446 vaesenc \T1, \XMM7, \XMM7
1447 vaesenc \T1, \XMM8, \XMM8
1449 vmovdqa TMP7(%rsp), \T1
1450 vmovdqu HashKey_2(arg2), \T5
1451 vpclmulqdq $0x11, \T5, \T1, \T3
1453 vpclmulqdq $0x00, \T5, \T1, \T3
1456 vpshufd $0b01001110, \T1, \T3
1458 vmovdqu HashKey_2_k(arg2), \T5
1459 vpclmulqdq $0x10, \T5, \T3, \T3
1462 #######################################################################
1464 vmovdqu 16*9(arg1), \T5
1465 vaesenc \T5, \XMM1, \XMM1
1466 vaesenc \T5, \XMM2, \XMM2
1467 vaesenc \T5, \XMM3, \XMM3
1468 vaesenc \T5, \XMM4, \XMM4
1469 vaesenc \T5, \XMM5, \XMM5
1470 vaesenc \T5, \XMM6, \XMM6
1471 vaesenc \T5, \XMM7, \XMM7
1472 vaesenc \T5, \XMM8, \XMM8
1474 vmovdqa TMP8(%rsp), \T1
1475 vmovdqu HashKey(arg2), \T5
1476 vpclmulqdq $0x11, \T5, \T1, \T3
1478 vpclmulqdq $0x00, \T5, \T1, \T3
1481 vpshufd $0b01001110, \T1, \T3
1483 vmovdqu HashKey_k(arg2), \T5
1484 vpclmulqdq $0x10, \T5, \T3, \T3
1490 vmovdqu 16*10(arg1), \T5
1496 vaesenc \T5, \XMM1, \XMM1
1497 vaesenc \T5, \XMM2, \XMM2
1498 vaesenc \T5, \XMM3, \XMM3
1499 vaesenc \T5, \XMM4, \XMM4
1500 vaesenc \T5, \XMM5, \XMM5
1501 vaesenc \T5, \XMM6, \XMM6
1502 vaesenc \T5, \XMM7, \XMM7
1503 vaesenc \T5, \XMM8, \XMM8
1505 vmovdqu 16*i(arg1), \T5
1514 vpxor 16*i(arg4, %r11), \T5, \T2
1516 vaesenclast \T2, reg_j, reg_j
1518 vaesenclast \T2, reg_j, \T3
1519 vmovdqu 16*i(arg4, %r11), reg_j
1520 vmovdqu \T3, 16*i(arg3, %r11)
1526 #######################################################################
1529 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1532 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1536 #######################################################################
1537 #first phase of the reduction
1538 #######################################################################
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1543 vpxor \T3, \T2, \T2 # xor the shifted versions
1546 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1548 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1549 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1550 #######################################################################
1552 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1553 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1554 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1555 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1556 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1557 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1558 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1559 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1562 #######################################################################
1563 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1567 vpxor \T3, \T2, \T2 # xor the shifted versions
1572 vpxor \T7, \T6, \T6 # the result is in T6
1573 #######################################################################
1575 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1576 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1577 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1578 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1579 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1580 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1581 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1582 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1585 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
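# Each of the eight saved blocks is multiplied (Karatsuba, as above) by the
# matching power of the hash key -- XMM1 by HashKey^8 down to XMM8 by HashKey --
# the partial products are accumulated in <T6:T7>, and a single two-phase
# reduction produces the final hash in T6.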
1593 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1598 vpshufd $0b01001110, \XMM1, \T2
1599 vpxor \XMM1, \T2, \T2
1600 vmovdqu HashKey_8(arg2), \T5
1601 vpclmulqdq $0x11, \T5, \XMM1, \T6
1602 vpclmulqdq $0x00, \T5, \XMM1, \T7
1604 vmovdqu HashKey_8_k(arg2), \T3
1605 vpclmulqdq $0x00, \T3, \T2, \XMM1
1607 ######################
1609 vpshufd $0b01001110, \XMM2, \T2
1610 vpxor \XMM2, \T2, \T2
1611 vmovdqu HashKey_7(arg2), \T5
1612 vpclmulqdq $0x11, \T5, \XMM2, \T4
1615 vpclmulqdq $0x00, \T5, \XMM2, \T4
1618 vmovdqu HashKey_7_k(arg2), \T3
1619 vpclmulqdq $0x00, \T3, \T2, \T2
1620 vpxor \T2, \XMM1, \XMM1
1622 ######################
1624 vpshufd $0b01001110, \XMM3, \T2
1625 vpxor \XMM3, \T2, \T2
1626 vmovdqu HashKey_6(arg2), \T5
1627 vpclmulqdq $0x11, \T5, \XMM3, \T4
1630 vpclmulqdq $0x00, \T5, \XMM3, \T4
1633 vmovdqu HashKey_6_k(arg2), \T3
1634 vpclmulqdq $0x00, \T3, \T2, \T2
1635 vpxor \T2, \XMM1, \XMM1
1637 ######################
1639 vpshufd $0b01001110, \XMM4, \T2
1640 vpxor \XMM4, \T2, \T2
1641 vmovdqu HashKey_5(arg2), \T5
1642 vpclmulqdq $0x11, \T5, \XMM4, \T4
1645 vpclmulqdq $0x00, \T5, \XMM4, \T4
1648 vmovdqu HashKey_5_k(arg2), \T3
1649 vpclmulqdq $0x00, \T3, \T2, \T2
1650 vpxor \T2, \XMM1, \XMM1
1652 ######################
1654 vpshufd $0b01001110, \XMM5, \T2
1655 vpxor \XMM5, \T2, \T2
1656 vmovdqu HashKey_4(arg2), \T5
1657 vpclmulqdq $0x11, \T5, \XMM5, \T4
1660 vpclmulqdq $0x00, \T5, \XMM5, \T4
1663 vmovdqu HashKey_4_k(arg2), \T3
1664 vpclmulqdq $0x00, \T3, \T2, \T2
1665 vpxor \T2, \XMM1, \XMM1
1667 ######################
1669 vpshufd $0b01001110, \XMM6, \T2
1670 vpxor \XMM6, \T2, \T2
1671 vmovdqu HashKey_3(arg2), \T5
1672 vpclmulqdq $0x11, \T5, \XMM6, \T4
1675 vpclmulqdq $0x00, \T5, \XMM6, \T4
1678 vmovdqu HashKey_3_k(arg2), \T3
1679 vpclmulqdq $0x00, \T3, \T2, \T2
1680 vpxor \T2, \XMM1, \XMM1
1682 ######################
1684 vpshufd $0b01001110, \XMM7, \T2
1685 vpxor \XMM7, \T2, \T2
1686 vmovdqu HashKey_2(arg2), \T5
1687 vpclmulqdq $0x11, \T5, \XMM7, \T4
1690 vpclmulqdq $0x00, \T5, \XMM7, \T4
1693 vmovdqu HashKey_2_k(arg2), \T3
1694 vpclmulqdq $0x00, \T3, \T2, \T2
1695 vpxor \T2, \XMM1, \XMM1
1697 ######################
1699 vpshufd $0b01001110, \XMM8, \T2
1700 vpxor \XMM8, \T2, \T2
1701 vmovdqu HashKey(arg2), \T5
1702 vpclmulqdq $0x11, \T5, \XMM8, \T4
1705 vpclmulqdq $0x00, \T5, \XMM8, \T4
1708 vmovdqu HashKey_k(arg2), \T3
1709 vpclmulqdq $0x00, \T3, \T2, \T2
1711 vpxor \T2, \XMM1, \XMM1
1712 vpxor \T6, \XMM1, \XMM1
1713 vpxor \T7, \XMM1, \T2
1718 vpslldq $8, \T2, \T4
1719 vpsrldq $8, \T2, \T2
1722 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1723 # the accumulated carry-less multiplications
1725 #######################################################################
1726 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1731 vpxor \T3, \T2, \T2 # xor the shifted versions
1734 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1736 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1737 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1738 #######################################################################
1741 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1745 vpxor \T3, \T2, \T2 # xor the shifted versions
1750 vpxor \T7, \T6, \T6 # the result is in T6
1754 #############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data *my_ctx_data,
#         gcm_context_data *data,
#         u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         u8 *iv, /* Pre-counter block j0: 4 byte salt
#                 (from Security Association) concatenated with 8 byte
#                 Initialisation Vector (from IPSec ESP Payload)
#                 concatenated with 0x00000001. 16-byte aligned pointer. */
#         const u8 *aad, /* Additional Authentication Data (AAD)*/
#         u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1765 #############################################################
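# Typical call sequence (sketch): aesni_gcm_init_avx_gen2() once per request
# to set up gcm_context_data, then any number of aesni_gcm_enc_update_avx_gen2()
# / aesni_gcm_dec_update_avx_gen2() calls over the data, and finally
# aesni_gcm_finalize_avx_gen2() to produce the authentication tag; the context
# carries AadHash, CurCount, InLen and the partial-block state between calls.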
1766 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1768 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1771 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1773 ###############################################################################
1774 #void aesni_gcm_enc_update_avx_gen2(
1775 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1776 # gcm_context_data *data,
1777 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1778 # const u8 *in, /* Plaintext input */
1779 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1780 ###############################################################################
1781 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1785 je key_256_enc_update
1787 je key_128_enc_update
1789 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1793 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1797 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1800 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1802 ###############################################################################
1803 #void aesni_gcm_dec_update_avx_gen2(
1804 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1805 # gcm_context_data *data,
1806 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1807 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len) /* Length of data in Bytes for decryption. */
1809 ###############################################################################
1810 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1814 je key_256_dec_update
1816 je key_128_dec_update
1818 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1822 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1826 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1829 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1831 ###############################################################################
1832 #void aesni_gcm_finalize_avx_gen2(
1833 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1834 # gcm_context_data *data,
1835 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1837 # Valid values are 16 (most likely), 12 or 8. */
1838 ###############################################################################
1839 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1847 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1851 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1855 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1858 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1860 ###############################################################################
1861 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1862 # Input: A and B (128-bits each, bit-reflected)
1863 # Output: C = A*B*x mod poly, (i.e. >>1 )
1864 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1865 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1866 ###############################################################################
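# Unlike GHASH_MUL_AVX above, this variant uses four vpclmulqdq products
# (schoolbook multiplication) and performs the reduction with carry-less
# multiplies against POLY2 instead of the shift/XOR sequence.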
1867 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1869 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1870 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1871 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1872 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1876 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1877 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1882 #######################################################################
1883 #first phase of the reduction
1884 vmovdqa POLY2(%rip), \T3
1886 vpclmulqdq $0x01, \GH, \T3, \T2
1887 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1889 vpxor \T2, \GH, \GH # first phase of the reduction complete
1890 #######################################################################
1891 #second phase of the reduction
1892 vpclmulqdq $0x00, \GH, \T3, \T2
1893 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1895 vpclmulqdq $0x10, \GH, \T3, \GH
1896 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1898 vpxor \T2, \GH, \GH # second phase of the reduction complete
1899 #######################################################################
1900 vpxor \T1, \GH, \GH # the result is in GH
1905 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
1909 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1910 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1912 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1913 vmovdqu \T5, HashKey_3(arg2)
1915 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1916 vmovdqu \T5, HashKey_4(arg2)
1918 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1919 vmovdqu \T5, HashKey_5(arg2)
1921 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1922 vmovdqu \T5, HashKey_6(arg2)
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1925 vmovdqu \T5, HashKey_7(arg2)
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1928 vmovdqu \T5, HashKey_8(arg2)
1932 ## if a = number of total plaintext bytes
## num_initial_blocks = b mod 4
1935 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1936 ## r10, r11, r12, rax are clobbered
1937 ## arg1, arg2, arg3, arg4 are used as pointers only, not modified
1939 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1940 i = (8-\num_initial_blocks)
1942 vmovdqu AadHash(arg2), reg_i
1944 # start AES for num_initial_blocks blocks
1945 vmovdqu CurCount(arg2), \CTR
1947 i = (9-\num_initial_blocks)
1949 .rep \num_initial_blocks
1950 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1952 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1957 vmovdqa (arg1), \T_key
1958 i = (9-\num_initial_blocks)
1960 .rep \num_initial_blocks
1961 vpxor \T_key, reg_i, reg_i
1969 vmovdqa 16*j(arg1), \T_key
1970 i = (9-\num_initial_blocks)
1972 .rep \num_initial_blocks
1973 vaesenc \T_key, reg_i, reg_i
1983 vmovdqa 16*j(arg1), \T_key
1984 i = (9-\num_initial_blocks)
1986 .rep \num_initial_blocks
1987 vaesenclast \T_key, reg_i, reg_i
1992 i = (9-\num_initial_blocks)
1994 .rep \num_initial_blocks
1995 vmovdqu (arg4, %r11), \T1
1996 vpxor \T1, reg_i, reg_i
1997 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1998 # num_initial_blocks blocks
2003 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2009 i = (8-\num_initial_blocks)
2010 j = (9-\num_initial_blocks)
2013 .rep \num_initial_blocks
2014 vpxor reg_i, reg_j, reg_j
2015 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2020 # XMM8 has the combined result here
2022 vmovdqa \XMM8, TMP1(%rsp)
2026 jl _initial_blocks_done\@ # no need for precomputed constants
2028 ###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey^i
2030 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2032 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2034 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2036 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2038 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2040 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2042 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2044 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2046 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2048 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2050 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2052 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2054 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2056 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2060 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2062 vmovdqa (arg1), \T_key
2063 vpxor \T_key, \XMM1, \XMM1
2064 vpxor \T_key, \XMM2, \XMM2
2065 vpxor \T_key, \XMM3, \XMM3
2066 vpxor \T_key, \XMM4, \XMM4
2067 vpxor \T_key, \XMM5, \XMM5
2068 vpxor \T_key, \XMM6, \XMM6
2069 vpxor \T_key, \XMM7, \XMM7
2070 vpxor \T_key, \XMM8, \XMM8
2074 .rep \REP # do REP rounds
2075 vmovdqa 16*i(arg1), \T_key
2076 vaesenc \T_key, \XMM1, \XMM1
2077 vaesenc \T_key, \XMM2, \XMM2
2078 vaesenc \T_key, \XMM3, \XMM3
2079 vaesenc \T_key, \XMM4, \XMM4
2080 vaesenc \T_key, \XMM5, \XMM5
2081 vaesenc \T_key, \XMM6, \XMM6
2082 vaesenc \T_key, \XMM7, \XMM7
2083 vaesenc \T_key, \XMM8, \XMM8
2089 vmovdqa 16*i(arg1), \T_key
2090 vaesenclast \T_key, \XMM1, \XMM1
2091 vaesenclast \T_key, \XMM2, \XMM2
2092 vaesenclast \T_key, \XMM3, \XMM3
2093 vaesenclast \T_key, \XMM4, \XMM4
2094 vaesenclast \T_key, \XMM5, \XMM5
2095 vaesenclast \T_key, \XMM6, \XMM6
2096 vaesenclast \T_key, \XMM7, \XMM7
2097 vaesenclast \T_key, \XMM8, \XMM8
2099 vmovdqu (arg4, %r11), \T1
2100 vpxor \T1, \XMM1, \XMM1
2101 vmovdqu \XMM1, (arg3 , %r11)
2106 vmovdqu 16*1(arg4, %r11), \T1
2107 vpxor \T1, \XMM2, \XMM2
2108 vmovdqu \XMM2, 16*1(arg3 , %r11)
2113 vmovdqu 16*2(arg4, %r11), \T1
2114 vpxor \T1, \XMM3, \XMM3
2115 vmovdqu \XMM3, 16*2(arg3 , %r11)
2120 vmovdqu 16*3(arg4, %r11), \T1
2121 vpxor \T1, \XMM4, \XMM4
2122 vmovdqu \XMM4, 16*3(arg3 , %r11)
2127 vmovdqu 16*4(arg4, %r11), \T1
2128 vpxor \T1, \XMM5, \XMM5
2129 vmovdqu \XMM5, 16*4(arg3 , %r11)
2134 vmovdqu 16*5(arg4, %r11), \T1
2135 vpxor \T1, \XMM6, \XMM6
2136 vmovdqu \XMM6, 16*5(arg3 , %r11)
2141 vmovdqu 16*6(arg4, %r11), \T1
2142 vpxor \T1, \XMM7, \XMM7
2143 vmovdqu \XMM7, 16*6(arg3 , %r11)
2148 vmovdqu 16*7(arg4, %r11), \T1
2149 vpxor \T1, \XMM8, \XMM8
2150 vmovdqu \XMM8, 16*7(arg3 , %r11)
2157 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2158 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2159 # the corresponding ciphertext
2160 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2161 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2162 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2163 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2164 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2165 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2166 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2168 ###############################################################################
2170 _initial_blocks_done\@:
2177 # encrypt 8 blocks at a time
2178 # ghash the 8 previously encrypted ciphertext blocks
2179 # arg1, arg2, arg3, arg4 are used as pointers only, not modified
2180 # r11 is the data offset value
2181 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2184 vmovdqa \XMM2, TMP2(%rsp)
2185 vmovdqa \XMM3, TMP3(%rsp)
2186 vmovdqa \XMM4, TMP4(%rsp)
2187 vmovdqa \XMM5, TMP5(%rsp)
2188 vmovdqa \XMM6, TMP6(%rsp)
2189 vmovdqa \XMM7, TMP7(%rsp)
2190 vmovdqa \XMM8, TMP8(%rsp)
2192 .if \loop_idx == in_order
2193 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2194 vpaddd ONE(%rip), \XMM1, \XMM2
2195 vpaddd ONE(%rip), \XMM2, \XMM3
2196 vpaddd ONE(%rip), \XMM3, \XMM4
2197 vpaddd ONE(%rip), \XMM4, \XMM5
2198 vpaddd ONE(%rip), \XMM5, \XMM6
2199 vpaddd ONE(%rip), \XMM6, \XMM7
2200 vpaddd ONE(%rip), \XMM7, \XMM8
2203 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2204 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2205 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2206 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2207 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2208 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2209 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2210 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2212 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2213 vpaddd ONEf(%rip), \XMM1, \XMM2
2214 vpaddd ONEf(%rip), \XMM2, \XMM3
2215 vpaddd ONEf(%rip), \XMM3, \XMM4
2216 vpaddd ONEf(%rip), \XMM4, \XMM5
2217 vpaddd ONEf(%rip), \XMM5, \XMM6
2218 vpaddd ONEf(%rip), \XMM6, \XMM7
2219 vpaddd ONEf(%rip), \XMM7, \XMM8
#######################################################################

        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

#######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

#######################################################################
        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

#######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3
#######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vmovdqu 16*10(arg1), \T5

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8
        vmovdqu 16*i(arg1), \T5                 # key for the final AES round (the index i
                                                # comes from the surrounding .rep/setreg loop)

        vpxor   16*i(arg4, %r11), \T5, \T2      # xor the last round key with input block i,
                                                # so vaesenclast also performs the CTR xor
        vaesenclast     \T2, reg_j, reg_j       # ENC: the ciphertext stays in reg_j for GHASH
        vaesenclast     \T2, reg_j, \T3         # DEC: the plaintext goes to \T3, ...
        vmovdqu 16*i(arg4, %r11), reg_j         # ... the ciphertext is reloaded for GHASH, ...
        vmovdqu \T3, 16*i(arg3, %r11)           # ... and the plaintext is written out
#######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T6 2 DWs into T3
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs

        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7

#######################################################################
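# The 256-bit carry-less product accumulated above is now reduced modulo the
# GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1 in the two phases below,
# using the pre-shifted constant POLY2 so each phase is one vpclmulqdq plus a
# byte shift.  As a rough worked equation (illustrative of the idea only, not
# of the bit-reflected register layout used here): writing the product as
# hi*x^128 + lo and using x^128 = x^7 + x^2 + x + 1 (mod g),
#
#       hi*x^128 + lo  =  lo + hi*(x^7 + x^2 + x + 1)   (mod g)
#
# and the two phases fold the high half down in two halves of that multiply.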
#first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
#######################################################################
        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer

#######################################################################
#second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
#######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1               # fold the accumulated GHASH value into
                                                # the first block for the next pass

.endm
# GHASH the last 8 ciphertext blocks.
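# With the running hash already xor-ed into the first block by the macros
# above, the value computed here is (in GF(2^128)):
#
#       Y = X1*H^8 xor X2*H^7 xor ... xor X7*H^2 xor X8*H
#
# which is why the precomputed key powers HashKey_8 .. HashKey are walked
# from highest to lowest below.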
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM2, \T4

        vpclmulqdq      $0x00, \T5, \XMM2, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM3, \T4

        vpclmulqdq      $0x00, \T5, \XMM3, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM4, \T4

        vpclmulqdq      $0x00, \T5, \XMM4, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM5, \T4

        vpclmulqdq      $0x00, \T5, \XMM5, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM6, \T4

        vpclmulqdq      $0x00, \T5, \XMM6, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM7, \T4

        vpclmulqdq      $0x00, \T5, \XMM7, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM8, \T4

        vpclmulqdq      $0x00, \T5, \XMM8, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2
        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
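        # Each block above was multiplied with Karatsuba: the high-half and
        # low-half products were collected in \T6 and \T7, and the
        # (hi^lo)*(hi'^lo') middle products were accumulated in \XMM1.  The
        # xors and 8-byte shifts just performed recover the middle terms and
        # spread them across the 256-bit value held in <T6:T7>.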
#######################################################################
#first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
#######################################################################

#second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
#######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6

.endm
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const   u8 *aad, /* Additional Authentication Data (AAD)*/
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
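#
# As a minimal C-style sketch (illustrative only; build_j0() and its
# parameter names are hypothetical, not part of this interface), a caller
# could assemble the 16-byte pre-counter block described above as:
#
#     static void build_j0(u8 j0[16], const u8 salt[4], const u8 iv[8])
#     {
#             memcpy(j0, salt, 4);        /* 4-byte salt from the SA        */
#             memcpy(j0 + 4, iv, 8);      /* 8-byte IV from the ESP header  */
#             j0[12] = 0; j0[13] = 0;     /* 32-bit counter, big-endian ... */
#             j0[14] = 0; j0[15] = 1;     /* ... initialised to 0x00000001  */
#     }
#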
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        je      key_256_enc_update4
        je      key_128_enc_update4
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        je      key_256_dec_update4
        je      key_128_dec_update4
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
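#
# A rough sketch of the calling sequence expected from C is shown below
# (illustrative only; the arguments follow the prototypes above, the
# variable names are hypothetical, and kernel_fpu_begin()/kernel_fpu_end()
# handling around the calls is omitted):
#
#     aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey, aad, aad_len);
#     aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, plaintext_len);
#     aesni_gcm_finalize_avx_gen4(ctx, data, auth_tag, auth_tag_len);
#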
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        je      key_256_finalize4
        je      key_128_finalize4
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)