1 /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
3 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
5 * Copyright(c) 2014 Intel Corporation.
8 * James Guilford <james.guilford@intel.com>
9 * Sean Gulley <sean.m.gulley@intel.com>
10 * Chandramouli Narayanan <mouli@linux.intel.com>
13 * This is AES128/192/256 CTR mode optimization implementation. It requires
14 * the support of Intel(R) AESNI and AVX instructions.
16 * This work was inspired by the AES CTR mode optimization published
17 * in Intel Optimized IPSEC Cryptographic library.
18 * Additional information on it can be found at:
19 * https://github.com/intel/intel-ipsec-mb
22 #include <linux/linkage.h>
24 #define VMOVDQ vmovdqu
/* VMOVDQ: unaligned AVX move used for all plaintext/ciphertext loads/stores. */
27 * Note: the "x" prefix in these aliases means "this is an xmm register". The
28 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
/*
 * %xmm8 and %xmm9 are each double-booked below: a single invocation of the
 * macros runs either the CTR or the XCTR path, never both, so the two names
 * of each pair can never be live at the same time.
 */
39 #define xcounter %xmm8 // CTR mode only
40 #define xiv %xmm8 // XCTR mode only
41 #define xbyteswap %xmm9 // CTR mode only
42 #define xtmp %xmm9 // XCTR mode only
/*
 * GPR block counter for the XCTR path; presumably derived from the
 * byte_ctr argument of the aes_xctr_* entry points -- TODO confirm
 * (the setup code is on lines elided from this listing).
 */
55 #define counter %r9 // XCTR mode only
/*
 * Constant table.  The labels are defined on lines elided from this
 * listing; the code below references them as byteswap_const,
 * ddq_low_msk, ddq_high_add_1 and ddq_add_1 .. ddq_add_8.
 */
/* vpshufb mask that byte-reverses a whole 128-bit lane (endian flip). */
67 .octa 0x000102030405060708090A0B0C0D0E0F
/* Mask selecting the low 64 counter bits (tested with vptest). */
69 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Carry value added to the high qword when the low qword wraps. */
71 .octa 0x00000000000000010000000000000000
/* ddq_add_1 .. ddq_add_8: per-block counter increments, 16 bytes apart. */
73 .octa 0x00000000000000000000000000000001
75 .octa 0x00000000000000000000000000000002
77 .octa 0x00000000000000000000000000000003
79 .octa 0x00000000000000000000000000000004
81 .octa 0x00000000000000000000000000000005
83 .octa 0x00000000000000000000000000000006
85 .octa 0x00000000000000000000000000000007
87 .octa 0x00000000000000000000000000000008
91 /* generate a unique variable for ddq_add_x */
93 /* generate a unique variable for xmm register */
98 /* concatenate the numeric 'id' onto the symbol 'name' */
109 * do_aes num_in_par load_keys key_len
110 * This increments p_in, but not p_out
/*
 * Generate 'b' keystream blocks and XOR them with 'b' input blocks.
 * k (load_keys) != 0 means the cached round keys xkey0/4/8/12 must be
 * (re)loaded from p_keys.  NOTE(review): this listing shows only part
 * of the macro body -- the .set/.if/.else/.endif scaffolding around
 * several of the instructions below falls on elided lines.
 */
112 .macro do_aes b, k, key_len, xctr
118 vmovdqa 0*16(p_keys), xkey0
/*
 * XCTR block generation: block i = AES(key, (ctr + i) ^ IV).  The
 * counter add is a plain little-endian quadword add; no byte swap.
 */
126 vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
132 vpxor xiv, var_xdata, var_xdata
/*
 * CTR block generation: xcounter is kept byte-reversed (little endian)
 * so vpaddq can increment it; block 0 is the counter shuffled back to
 * big-endian block order.
 */
136 vpshufb xbyteswap, xcounter, xdata0
140 vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
/*
 * vptest sets ZF when the low 64 counter bits are all zero, i.e. the
 * increment wrapped; the carry is then propagated into the high qword.
 * The conditional branch guarding each carry vpaddq is on an elided line.
 */
141 vptest ddq_low_msk(%rip), var_xdata
143 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
144 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
146 vpshufb xbyteswap, var_xdata, var_xdata
151 vmovdqa 1*16(p_keys), xkeyA
/* Round 0: whitening XOR with round key 0 for every block. */
153 vpxor xkey0, xdata0, xdata0
/* Advance the saved counter past all 'by' blocks, same carry fixup. */
157 vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
158 vptest ddq_low_msk(%rip), xcounter
160 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
167 vpxor xkey0, var_xdata, var_xdata
/*
 * AES rounds.  xkeyA/xkeyB are scratch round-key registers; the cached
 * registers hold round keys 3/6/9 in xkey4/8/12 for KEY_128, and round
 * keys 4/8/12 for the longer key lengths.  The .if(k) guards that skip
 * reloading the cached keys are on elided lines.
 */
171 vmovdqa 2*16(p_keys), xkeyB
176 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
180 .if (klen == KEY_128)
182 vmovdqa 3*16(p_keys), xkey4
185 vmovdqa 3*16(p_keys), xkeyA
191 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
197 .if (klen == KEY_128)
198 vmovdqa 4*16(p_keys), xkeyB
201 vmovdqa 4*16(p_keys), xkey4
209 .if (klen == KEY_128)
210 vaesenc xkey4, var_xdata, var_xdata /* key 3 (128-bit: cached) */
212 vaesenc xkeyA, var_xdata, var_xdata
217 vmovdqa 5*16(p_keys), xkeyA
223 .if (klen == KEY_128)
224 vaesenc xkeyB, var_xdata, var_xdata /* key 4 */
226 vaesenc xkey4, var_xdata, var_xdata
231 .if (klen == KEY_128)
233 vmovdqa 6*16(p_keys), xkey8
236 vmovdqa 6*16(p_keys), xkeyB
242 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
246 vmovdqa 7*16(p_keys), xkeyA
252 .if (klen == KEY_128)
253 vaesenc xkey8, var_xdata, var_xdata /* key 6 */
255 vaesenc xkeyB, var_xdata, var_xdata
260 .if (klen == KEY_128)
261 vmovdqa 8*16(p_keys), xkeyB
264 vmovdqa 8*16(p_keys), xkey8
271 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
275 .if (klen == KEY_128)
277 vmovdqa 9*16(p_keys), xkey12
280 vmovdqa 9*16(p_keys), xkeyA
287 .if (klen == KEY_128)
288 vaesenc xkeyB, var_xdata, var_xdata /* key 8 */
290 vaesenc xkey8, var_xdata, var_xdata
295 vmovdqa 10*16(p_keys), xkeyB
301 .if (klen == KEY_128)
302 vaesenc xkey12, var_xdata, var_xdata /* key 9 */
304 vaesenc xkeyA, var_xdata, var_xdata
/* AES-128 has 10 rounds: round 10 is its vaesenclast.  192/256 go on. */
309 .if (klen != KEY_128)
310 vmovdqa 11*16(p_keys), xkeyA
317 .if (klen == KEY_128)
318 vaesenclast xkeyB, var_xdata, var_xdata /* key 10: final for 128 */
320 vaesenc xkeyB, var_xdata, var_xdata
325 .if (klen != KEY_128)
327 vmovdqa 12*16(p_keys), xkey12
333 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
/* AES-192 finishes with vaesenclast on key 12; AES-256 runs 13/14 too. */
337 .if (klen == KEY_256)
338 vmovdqa 13*16(p_keys), xkeyA
344 .if (klen == KEY_256)
346 vaesenc xkey12, var_xdata, var_xdata /* key 12 */
348 vaesenclast xkey12, var_xdata, var_xdata /* key 12: final for 192 */
353 .if (klen == KEY_256)
354 vmovdqa 14*16(p_keys), xkeyB
360 vaesenc xkeyA, var_xdata, var_xdata /* key 13 */
368 vaesenclast xkeyB, var_xdata, var_xdata /* key 14: final for 256 */
/*
 * XOR the keystream with the input.  p_in was already advanced by
 * 16*by (on an elided line), hence the negative offsets; the now-free
 * xkeyA/xkeyB scratch registers hold pairs of input blocks.
 */
377 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
378 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
380 vpxor xkeyA, var_xdata, var_xdata
382 vpxor xkeyB, var_xdata, var_xdata
/* Odd trailing block when 'by' is odd. */
387 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
389 vpxor xkeyA, var_xdata, var_xdata
/* Store output; p_out is not advanced by this macro (see header). */
395 VMOVDQ var_xdata, i*16(p_out)
400 .macro do_aes_load val, key_len, xctr
/* do_aes with load_keys=1: (re)load the cached round keys xkey0/4/8/12. */
401 do_aes \val, 1, \key_len, \xctr
404 .macro do_aes_noload val, key_len, xctr
/* do_aes with load_keys=0: reuse the round keys already cached in xmm regs. */
405 do_aes \val, 0, \key_len, \xctr
408 /* main body of aes ctr load */
/*
 * Top-level driver shared by all six entry points: peel off the
 * (num_bytes / 16) mod 8 leftover blocks first, then loop 8 blocks at
 * a time.  \xctr selects the XCTR (1) or CTR (0) counter scheme.
 */
410 .macro do_aes_ctrmain key_len, xctr
/* Less than one full block: nothing for this routine to do. */
412 jb .Ldo_return2\xctr\key_len
/*
 * CTR setup: load the IV and flip it to little endian so the counter
 * can be bumped with vpaddq.  The XCTR setup branch and the .if guard
 * around this are on lines elided from this listing.
 */
418 vmovdqa byteswap_const(%rip), xbyteswap
419 vmovdqu (p_iv), xcounter
420 vpshufb xbyteswap, xcounter, xcounter
/* Dispatch on the 1..7 leftover blocks (compares are on elided lines). */
425 jz .Lmult_of_8_blks\xctr\key_len
429 jg .Lgt4\xctr\key_len
430 je .Leq4\xctr\key_len
434 jg .Leq3\xctr\key_len
435 je .Leq2\xctr\key_len
438 do_aes_load 1, \key_len, \xctr
/* Round num_bytes down to a multiple of 8 blocks; done if none remain. */
440 and $(~7*16), num_bytes
441 jz .Ldo_return2\xctr\key_len
442 jmp .Lmain_loop2\xctr\key_len
445 do_aes_load 2, \key_len, \xctr
447 and $(~7*16), num_bytes
448 jz .Ldo_return2\xctr\key_len
449 jmp .Lmain_loop2\xctr\key_len
453 do_aes_load 3, \key_len, \xctr
455 and $(~7*16), num_bytes
456 jz .Ldo_return2\xctr\key_len
457 jmp .Lmain_loop2\xctr\key_len
460 do_aes_load 4, \key_len, \xctr
462 and $(~7*16), num_bytes
463 jz .Ldo_return2\xctr\key_len
464 jmp .Lmain_loop2\xctr\key_len
468 jg .Leq7\xctr\key_len
469 je .Leq6\xctr\key_len
472 do_aes_load 5, \key_len, \xctr
474 and $(~7*16), num_bytes
475 jz .Ldo_return2\xctr\key_len
476 jmp .Lmain_loop2\xctr\key_len
479 do_aes_load 6, \key_len, \xctr
481 and $(~7*16), num_bytes
482 jz .Ldo_return2\xctr\key_len
483 jmp .Lmain_loop2\xctr\key_len
486 do_aes_load 7, \key_len, \xctr
488 and $(~7*16), num_bytes
489 jz .Ldo_return2\xctr\key_len
490 jmp .Lmain_loop2\xctr\key_len
492 .Lmult_of_8_blks\xctr\key_len:
/*
 * Pre-cache the four round keys that do_aes_noload expects to find in
 * xkey0/4/8/12: keys 0/4/8/12 for 192/256-bit schedules, keys 0/3/6/9
 * for the 128-bit schedule (matching the do_aes cache layout).
 */
493 .if (\key_len != KEY_128)
494 vmovdqa 0*16(p_keys), xkey0
495 vmovdqa 4*16(p_keys), xkey4
496 vmovdqa 8*16(p_keys), xkey8
497 vmovdqa 12*16(p_keys), xkey12
/* KEY_128 variant (the .else/.endif directives fall on elided lines): */
499 vmovdqa 0*16(p_keys), xkey0
500 vmovdqa 3*16(p_keys), xkey4
501 vmovdqa 6*16(p_keys), xkey8
502 vmovdqa 9*16(p_keys), xkey12
505 .Lmain_loop2\xctr\key_len:
506 /* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
507 do_aes_noload 8, \key_len, \xctr
509 sub $(8*16), num_bytes
510 jne .Lmain_loop2\xctr\key_len
512 .Ldo_return2\xctr\key_len:
514 /* return updated IV */
/* CTR only: shuffle back to big endian before storing (XCTR guard elided). */
515 vpshufb xbyteswap, xcounter, xcounter
516 vmovdqu xcounter, (p_iv)
522 * routine to do AES128 CTR enc/decrypt "by8"
523 * XMM registers are clobbered.
524 * Saving/restoring must be done at a higher level
525 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
526 * unsigned int num_bytes)
528 SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
529 /* call the aes main loop */
/* Trailing 0 selects the CTR (big-endian counter) path of do_aes_ctrmain. */
530 do_aes_ctrmain KEY_128 0
532 SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
535 * routine to do AES192 CTR enc/decrypt "by8"
536 * XMM registers are clobbered.
537 * Saving/restoring must be done at a higher level
538 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
539 * unsigned int num_bytes)
541 SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
542 /* call the aes main loop */
/* Trailing 0 selects the CTR (big-endian counter) path of do_aes_ctrmain. */
543 do_aes_ctrmain KEY_192 0
545 SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
548 * routine to do AES256 CTR enc/decrypt "by8"
549 * XMM registers are clobbered.
550 * Saving/restoring must be done at a higher level
551 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
552 * unsigned int num_bytes)
554 SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
555 /* call the aes main loop */
/* Trailing 0 selects the CTR (big-endian counter) path of do_aes_ctrmain. */
556 do_aes_ctrmain KEY_256 0
558 SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
561 * routine to do AES128 XCTR enc/decrypt "by8"
562 * XMM registers are clobbered.
563 * Saving/restoring must be done at a higher level
564 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
565 * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
567 SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
568 /* call the aes main loop */
/* Trailing 1 selects the XCTR path (little-endian counter XORed with IV). */
569 do_aes_ctrmain KEY_128 1
571 SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
574 * routine to do AES192 XCTR enc/decrypt "by8"
575 * XMM registers are clobbered.
576 * Saving/restoring must be done at a higher level
577 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
578 * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
580 SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
581 /* call the aes main loop */
/* Trailing 1 selects the XCTR path (little-endian counter XORed with IV). */
582 do_aes_ctrmain KEY_192 1
584 SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
587 * routine to do AES256 XCTR enc/decrypt "by8"
588 * XMM registers are clobbered.
589 * Saving/restoring must be done at a higher level
590 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
591 * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
593 SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
594 /* call the aes main loop */
/* Trailing 1 selects the XCTR path (little-endian counter XORed with IV). */
595 do_aes_ctrmain KEY_256 1
597 SYM_FUNC_END(aes_xctr_enc_256_avx_by8)