1 /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
3 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
5 * Copyright(c) 2014 Intel Corporation.
8 * James Guilford <james.guilford@intel.com>
9 * Sean Gulley <sean.m.gulley@intel.com>
10 * Chandramouli Narayanan <mouli@linux.intel.com>
13 * This is AES128/192/256 CTR mode optimization implementation. It requires
14 * the support of Intel(R) AESNI and AVX instructions.
16 * This work was inspired by the AES CTR mode optimization published
17 * in Intel Optimized IPSEC Cryptographic library.
18 * Additional information on it can be found at:
19 * https://github.com/intel/intel-ipsec-mb
22 #include <linux/linkage.h>
/*
 * Use the unaligned form for all data-buffer loads/stores: p_in/p_out
 * carry no alignment guarantee, and on AVX-capable parts vmovdqu on
 * aligned data costs nothing extra.
 */
24 #define VMOVDQ vmovdqu
/*
 * Fixed register roles, live across the whole do_aes body:
 *   xcounter  - the 128-bit CTR counter block, kept in big-endian
 *               (wire) byte order between uses
 *   xbyteswap - vpshufb mask that reverses the 16 bytes of a lane
 *               (big-endian counter <-> little-endian register form)
 */
34 #define xcounter %xmm8
35 #define xbyteswap %xmm9
/*
 * NOTE(review): the data labels themselves are not visible in this view;
 * the names in the comments below are taken from the code that references
 * them (byteswap_const, ddq_low_msk, ddq_high_add_1, ddq_add_1..8) -
 * confirm against the full file.
 */
/* byteswap_const: vpshufb control reversing all 16 bytes of a lane */
60 .octa 0x000102030405060708090A0B0C0D0E0F
/* ddq_low_msk: selects the low 64 bits; used with vptest to detect
 * wraparound (carry) out of the counter's low qword */
62 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* ddq_high_add_1: +1 in the high qword, applied when the low qword
 * wrapped, giving full 128-bit counter arithmetic */
64 .octa 0x00000000000000010000000000000000
/* ddq_add_1 .. ddq_add_8: low-qword increments +1..+8 for deriving the
 * per-stream counter blocks of the "by8" interleaving; the table is
 * indexed as (ddq_add_1 + 16 * (i - 1)) */
66 .octa 0x00000000000000000000000000000001
68 .octa 0x00000000000000000000000000000002
70 .octa 0x00000000000000000000000000000003
72 .octa 0x00000000000000000000000000000004
74 .octa 0x00000000000000000000000000000005
76 .octa 0x00000000000000000000000000000006
78 .octa 0x00000000000000000000000000000007
80 .octa 0x00000000000000000000000000000008
84 /* generate a unique variable for ddq_add_x */
86 /* generate a unique variable for xmm register */
91 /* club the numeric 'id' to the symbol 'name' */
102 * do_aes num_in_par load_keys key_len
103 * This increments p_in, but not p_out
/*
 * do_aes b(=num_in_par), k(=load_keys), key_len
 *
 * Encrypts `b` (1..8) counter blocks in parallel and XORs the resulting
 * keystream with `b` blocks read from p_in, writing to p_out.  When
 * load_keys is set, round keys are (re)loaded from p_keys into the
 * dedicated xkey0/xkey4/xkey8/xkey12 registers; otherwise the caller has
 * already loaded them (.Lmain_loop2 fast path).
 *
 * NOTE(review): many interior lines of this macro (.set/.rept register
 * plumbing, .else/.endif/.endr terminators and the closing .endm) are
 * not visible in this view; comments below describe only the visible
 * instructions - confirm details against the full file.
 */
105 .macro do_aes b, k, key_len
/* round key 0 (loaded here only when load_keys is set) */
111 vmovdqa 0*16(p_keys), xkey0
/* xdata0 = counter block 0, byte-swapped into AES (little-endian) order */
114 vpshufb xbyteswap, xcounter, xdata0
/*
 * Derive counter blocks 1..b-1: add i via the ddq_add_* table, then test
 * the low qword; if it wrapped to zero, propagate the carry into the
 * high qword so the counter behaves as a full 128-bit integer.
 */
119 vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
120 vptest ddq_low_msk(%rip), var_xdata
122 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
123 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* swap each derived counter block into AES byte order */
125 vpshufb xbyteswap, var_xdata, var_xdata
129 vmovdqa 1*16(p_keys), xkeyA
/* initial AddRoundKey (key 0) for block 0 ... */
131 vpxor xkey0, xdata0, xdata0
/* ... and advance xcounter past all `by` blocks, again with 128-bit carry */
132 vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
133 vptest ddq_low_msk(%rip), xcounter
135 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* initial AddRoundKey for the remaining parallel blocks */
141 vpxor xkey0, var_xdata, var_xdata
145 vmovdqa 2*16(p_keys), xkeyB
150 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
/*
 * From here on, key-register assignment differs by key length: AES-128
 * pins round keys 3/6/9 in xkey4/xkey8/xkey12, while AES-192/256 pin
 * 4/8/12, so the .if (klen == KEY_128) arms pick which register each
 * schedule entry lands in (the matching .else/.endif lines are not
 * visible in this view).
 */
154 .if (klen == KEY_128)
156 vmovdqa 3*16(p_keys), xkey4
159 vmovdqa 3*16(p_keys), xkeyA
165 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
171 .if (klen == KEY_128)
172 vmovdqa 4*16(p_keys), xkeyB
175 vmovdqa 4*16(p_keys), xkey4
/* round 3 */
183 .if (klen == KEY_128)
184 vaesenc xkey4, var_xdata, var_xdata
186 vaesenc xkeyA, var_xdata, var_xdata
191 vmovdqa 5*16(p_keys), xkeyA
/* round 4 */
197 .if (klen == KEY_128)
198 vaesenc xkeyB, var_xdata, var_xdata
200 vaesenc xkey4, var_xdata, var_xdata
205 .if (klen == KEY_128)
207 vmovdqa 6*16(p_keys), xkey8
210 vmovdqa 6*16(p_keys), xkeyB
216 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
220 vmovdqa 7*16(p_keys), xkeyA
/* round 6 */
226 .if (klen == KEY_128)
227 vaesenc xkey8, var_xdata, var_xdata
229 vaesenc xkeyB, var_xdata, var_xdata
234 .if (klen == KEY_128)
235 vmovdqa 8*16(p_keys), xkeyB
238 vmovdqa 8*16(p_keys), xkey8
245 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
249 .if (klen == KEY_128)
251 vmovdqa 9*16(p_keys), xkey12
254 vmovdqa 9*16(p_keys), xkeyA
/* round 8 */
261 .if (klen == KEY_128)
262 vaesenc xkeyB, var_xdata, var_xdata
264 vaesenc xkey8, var_xdata, var_xdata
269 vmovdqa 10*16(p_keys), xkeyB
/* round 9 */
275 .if (klen == KEY_128)
276 vaesenc xkey12, var_xdata, var_xdata
278 vaesenc xkeyA, var_xdata, var_xdata
283 .if (klen != KEY_128)
284 vmovdqa 11*16(p_keys), xkeyA
/* round 10: final (aesenclast) for AES-128, ordinary round otherwise */
291 .if (klen == KEY_128)
292 vaesenclast xkeyB, var_xdata, var_xdata
294 vaesenc xkeyB, var_xdata, var_xdata
299 .if (klen != KEY_128)
301 vmovdqa 12*16(p_keys), xkey12
307 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
311 .if (klen == KEY_256)
312 vmovdqa 13*16(p_keys), xkeyA
/* round 12: final for AES-192, ordinary round for AES-256 */
318 .if (klen == KEY_256)
320 vaesenc xkey12, var_xdata, var_xdata
322 vaesenclast xkey12, var_xdata, var_xdata
327 .if (klen == KEY_256)
328 vmovdqa 14*16(p_keys), xkeyB
334 vaesenc xkeyA, var_xdata, var_xdata
/* round 14: final for AES-256 */
342 vaesenclast xkeyB, var_xdata, var_xdata
/*
 * XOR keystream with input: p_in was already advanced past the `by`
 * blocks, hence the negative -16*by bias on the load offsets; xkeyA and
 * xkeyB are dead after the last round and are reused as plaintext
 * scratch.  Paired loads (i, j) interleave for throughput; the single
 * i-only form handles an odd trailing block.
 */
351 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
352 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
354 vpxor xkeyA, var_xdata, var_xdata
356 vpxor xkeyB, var_xdata, var_xdata
361 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
363 vpxor xkeyA, var_xdata, var_xdata
/* store the ciphertext/plaintext result (p_out is NOT advanced here) */
369 VMOVDQ var_xdata, i*16(p_out)
/*
 * do_aes_load val, key_len - process `val` blocks, reloading round keys
 * from p_keys (load_keys=1).  Used for the non-multiple-of-8 remainder.
 * NOTE(review): closing .endm not visible in this view.
 */
374 .macro do_aes_load val, key_len
375 do_aes \val, 1, \key_len
/*
 * do_aes_noload val, key_len - process `val` blocks assuming
 * xkey0/xkey4/xkey8/xkey12 are already loaded (load_keys=0).  Used by
 * the 8-blocks-at-a-time main loop.
 * NOTE(review): closing .endm not visible in this view.
 */
378 .macro do_aes_noload val, key_len
379 do_aes \val, 0, \key_len
382 /* main body of aes ctr load */
/*
 * do_aes_ctrmain key_len - common driver shared by the 128/192/256
 * entry points.  Flow:
 *   1. bail out if fewer than one full block remains;
 *   2. load the byteswap mask and the IV, converting the IV into
 *      internal (byte-swapped) counter form;
 *   3. handle a 1..7-block remainder via do_aes_load, then mask
 *      num_bytes down to a multiple of 8 blocks;
 *   4. run the main loop 8 blocks at a time with pre-loaded keys;
 *   5. byte-swap the counter back and store it as the updated IV.
 * NOTE(review): the compare/dispatch instructions that select the
 * remainder case, the intermediate labels, and the pointer-advance
 * lines are not visible in this view - confirm against the full file.
 */
384 .macro do_aes_ctrmain key_len
/* less than one full 16-byte block: nothing to do */
386 jb .Ldo_return2\key_len
388 vmovdqa byteswap_const(%rip), xbyteswap
/* IV may be unaligned; convert to internal counter byte order */
389 vmovdqu (p_iv), xcounter
390 vpshufb xbyteswap, xcounter, xcounter
/* block count already a multiple of 8: skip the remainder handling */
394 jz .Lmult_of_8_blks\key_len
/*
 * Remainder cases: encrypt 1..7 blocks with keys loaded from memory,
 * then clear the low bits of num_bytes so only whole groups of 8
 * blocks remain for the main loop (reached if num_bytes != 0).
 */
407 do_aes_load 1, \key_len
409 and $(~7*16), num_bytes
410 jz .Ldo_return2\key_len
411 jmp .Lmain_loop2\key_len
414 do_aes_load 2, \key_len
416 and $(~7*16), num_bytes
417 jz .Ldo_return2\key_len
418 jmp .Lmain_loop2\key_len
422 do_aes_load 3, \key_len
424 and $(~7*16), num_bytes
425 jz .Ldo_return2\key_len
426 jmp .Lmain_loop2\key_len
429 do_aes_load 4, \key_len
431 and $(~7*16), num_bytes
432 jz .Ldo_return2\key_len
433 jmp .Lmain_loop2\key_len
441 do_aes_load 5, \key_len
443 and $(~7*16), num_bytes
444 jz .Ldo_return2\key_len
445 jmp .Lmain_loop2\key_len
448 do_aes_load 6, \key_len
450 and $(~7*16), num_bytes
451 jz .Ldo_return2\key_len
452 jmp .Lmain_loop2\key_len
455 do_aes_load 7, \key_len
457 and $(~7*16), num_bytes
458 jz .Ldo_return2\key_len
459 jmp .Lmain_loop2\key_len
/*
 * Pre-load the pinned round keys once so the main loop can use
 * do_aes_noload: schedule slots 0/4/8/12 for AES-192/256, but 0/3/6/9
 * for AES-128 (the .else separating the two arms is not visible here).
 */
461 .Lmult_of_8_blks\key_len:
462 .if (\key_len != KEY_128)
463 vmovdqa 0*16(p_keys), xkey0
464 vmovdqa 4*16(p_keys), xkey4
465 vmovdqa 8*16(p_keys), xkey8
466 vmovdqa 12*16(p_keys), xkey12
468 vmovdqa 0*16(p_keys), xkey0
469 vmovdqa 3*16(p_keys), xkey4
470 vmovdqa 6*16(p_keys), xkey8
471 vmovdqa 9*16(p_keys), xkey12
474 .Lmain_loop2\key_len:
475 /* num_bytes is a multiple of 8 and >0 */
476 do_aes_noload 8, \key_len
478 sub $(8*16), num_bytes
479 jne .Lmain_loop2\key_len
481 .Ldo_return2\key_len:
482 /* return updated IV */
/* convert the counter back to wire byte order before storing */
483 vpshufb xbyteswap, xcounter, xcounter
484 vmovdqu xcounter, (p_iv)
489 * routine to do AES128 CTR enc/decrypt "by8"
490 * XMM registers are clobbered.
491 * Saving/restoring must be done at a higher level
492 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
493 * unsigned int num_bytes)
/*
 * AES-128 entry point: all work happens inside do_aes_ctrmain, which
 * the KEY_128 argument specializes to a 10-round schedule.
 * NOTE(review): no RET appears between the macro and SYM_FUNC_END in
 * this view; confirm the return path against the full file.
 */
495 SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
496 /* call the aes main loop */
497 do_aes_ctrmain KEY_128
499 SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
502 * routine to do AES192 CTR enc/decrypt "by8"
503 * XMM registers are clobbered.
504 * Saving/restoring must be done at a higher level
505 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
506 * unsigned int num_bytes)
/*
 * AES-192 entry point: do_aes_ctrmain specialized by KEY_192 to a
 * 12-round schedule.
 * NOTE(review): no RET appears between the macro and SYM_FUNC_END in
 * this view; confirm the return path against the full file.
 */
508 SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
509 /* call the aes main loop */
510 do_aes_ctrmain KEY_192
512 SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
515 * routine to do AES256 CTR enc/decrypt "by8"
516 * XMM registers are clobbered.
517 * Saving/restoring must be done at a higher level
518 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
519 * unsigned int num_bytes)
/*
 * AES-256 entry point: do_aes_ctrmain specialized by KEY_256 to a
 * 14-round schedule.
 * NOTE(review): no RET appears between the macro and SYM_FUNC_END in
 * this view; confirm the return path against the full file.
 */
521 SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
522 /* call the aes main loop */
523 do_aes_ctrmain KEY_256
525 SYM_FUNC_END(aes_ctr_enc_256_avx_by8)