2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
7 * This work was inspired by the AES CTR mode optimization published
8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
17 * Copyright(c) 2014 Intel Corporation.
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
35 * Copyright(c) 2014 Intel Corporation.
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 #include <linux/linkage.h>
68 #define VMOVDQ vmovdqu
78 #define xcounter %xmm8
79 #define xbyteswap %xmm9
104 .octa 0x000102030405060708090A0B0C0D0E0F
106 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
108 .octa 0x00000000000000010000000000000000
110 .octa 0x00000000000000000000000000000001
112 .octa 0x00000000000000000000000000000002
114 .octa 0x00000000000000000000000000000003
116 .octa 0x00000000000000000000000000000004
118 .octa 0x00000000000000000000000000000005
120 .octa 0x00000000000000000000000000000006
122 .octa 0x00000000000000000000000000000007
124 .octa 0x00000000000000000000000000000008
128 /* generate a unique variable for ddq_add_x */
130 /* generate a unique variable for xmm register */
135 /* club the numeric 'id' to the symbol 'name' */
146 * do_aes num_in_par load_keys key_len
147 * This increments p_in, but not p_out
149 .macro do_aes b, k, key_len
155 vmovdqa 0*16(p_keys), xkey0
158 vpshufb xbyteswap, xcounter, xdata0
163 vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
164 vptest ddq_low_msk(%rip), var_xdata
166 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
167 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
169 vpshufb xbyteswap, var_xdata, var_xdata
173 vmovdqa 1*16(p_keys), xkeyA
175 vpxor xkey0, xdata0, xdata0
176 vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
177 vptest ddq_low_msk(%rip), xcounter
179 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
185 vpxor xkey0, var_xdata, var_xdata
189 vmovdqa 2*16(p_keys), xkeyB
194 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
198 .if (klen == KEY_128)
200 vmovdqa 3*16(p_keys), xkey4
203 vmovdqa 3*16(p_keys), xkeyA
209 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
215 .if (klen == KEY_128)
216 vmovdqa 4*16(p_keys), xkeyB
219 vmovdqa 4*16(p_keys), xkey4
227 .if (klen == KEY_128)
228 vaesenc xkey4, var_xdata, var_xdata
230 vaesenc xkeyA, var_xdata, var_xdata
235 vmovdqa 5*16(p_keys), xkeyA
241 .if (klen == KEY_128)
242 vaesenc xkeyB, var_xdata, var_xdata
244 vaesenc xkey4, var_xdata, var_xdata
249 .if (klen == KEY_128)
251 vmovdqa 6*16(p_keys), xkey8
254 vmovdqa 6*16(p_keys), xkeyB
260 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
264 vmovdqa 7*16(p_keys), xkeyA
270 .if (klen == KEY_128)
271 vaesenc xkey8, var_xdata, var_xdata
273 vaesenc xkeyB, var_xdata, var_xdata
278 .if (klen == KEY_128)
279 vmovdqa 8*16(p_keys), xkeyB
282 vmovdqa 8*16(p_keys), xkey8
289 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
293 .if (klen == KEY_128)
295 vmovdqa 9*16(p_keys), xkey12
298 vmovdqa 9*16(p_keys), xkeyA
305 .if (klen == KEY_128)
306 vaesenc xkeyB, var_xdata, var_xdata
308 vaesenc xkey8, var_xdata, var_xdata
313 vmovdqa 10*16(p_keys), xkeyB
319 .if (klen == KEY_128)
320 vaesenc xkey12, var_xdata, var_xdata
322 vaesenc xkeyA, var_xdata, var_xdata
327 .if (klen != KEY_128)
328 vmovdqa 11*16(p_keys), xkeyA
335 .if (klen == KEY_128)
336 vaesenclast xkeyB, var_xdata, var_xdata
338 vaesenc xkeyB, var_xdata, var_xdata
343 .if (klen != KEY_128)
345 vmovdqa 12*16(p_keys), xkey12
351 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
355 .if (klen == KEY_256)
356 vmovdqa 13*16(p_keys), xkeyA
362 .if (klen == KEY_256)
364 vaesenc xkey12, var_xdata, var_xdata
366 vaesenclast xkey12, var_xdata, var_xdata
371 .if (klen == KEY_256)
372 vmovdqa 14*16(p_keys), xkeyB
378 vaesenc xkeyA, var_xdata, var_xdata
386 vaesenclast xkeyB, var_xdata, var_xdata
395 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
396 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
398 vpxor xkeyA, var_xdata, var_xdata
400 vpxor xkeyB, var_xdata, var_xdata
405 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
407 vpxor xkeyA, var_xdata, var_xdata
413 VMOVDQ var_xdata, i*16(p_out)
418 .macro do_aes_load val, key_len
419 do_aes \val, 1, \key_len
422 .macro do_aes_noload val, key_len
423 do_aes \val, 0, \key_len
426 /* main body of aes ctr load */
428 .macro do_aes_ctrmain key_len
430 jb .Ldo_return2\key_len
432 vmovdqa byteswap_const(%rip), xbyteswap
433 vmovdqu (p_iv), xcounter
434 vpshufb xbyteswap, xcounter, xcounter
438 jz .Lmult_of_8_blks\key_len
451 do_aes_load 1, \key_len
453 and $(~7*16), num_bytes
454 jz .Ldo_return2\key_len
455 jmp .Lmain_loop2\key_len
458 do_aes_load 2, \key_len
460 and $(~7*16), num_bytes
461 jz .Ldo_return2\key_len
462 jmp .Lmain_loop2\key_len
466 do_aes_load 3, \key_len
468 and $(~7*16), num_bytes
469 jz .Ldo_return2\key_len
470 jmp .Lmain_loop2\key_len
473 do_aes_load 4, \key_len
475 and $(~7*16), num_bytes
476 jz .Ldo_return2\key_len
477 jmp .Lmain_loop2\key_len
485 do_aes_load 5, \key_len
487 and $(~7*16), num_bytes
488 jz .Ldo_return2\key_len
489 jmp .Lmain_loop2\key_len
492 do_aes_load 6, \key_len
494 and $(~7*16), num_bytes
495 jz .Ldo_return2\key_len
496 jmp .Lmain_loop2\key_len
499 do_aes_load 7, \key_len
501 and $(~7*16), num_bytes
502 jz .Ldo_return2\key_len
503 jmp .Lmain_loop2\key_len
505 .Lmult_of_8_blks\key_len:
506 .if (\key_len != KEY_128)
507 vmovdqa 0*16(p_keys), xkey0
508 vmovdqa 4*16(p_keys), xkey4
509 vmovdqa 8*16(p_keys), xkey8
510 vmovdqa 12*16(p_keys), xkey12
512 vmovdqa 0*16(p_keys), xkey0
513 vmovdqa 3*16(p_keys), xkey4
514 vmovdqa 6*16(p_keys), xkey8
515 vmovdqa 9*16(p_keys), xkey12
518 .Lmain_loop2\key_len:
519 /* num_bytes is a multiple of 8 and >0 */
520 do_aes_noload 8, \key_len
522 sub $(8*16), num_bytes
523 jne .Lmain_loop2\key_len
525 .Ldo_return2\key_len:
526 /* return updated IV */
527 vpshufb xbyteswap, xcounter, xcounter
528 vmovdqu xcounter, (p_iv)
533 * routine to do AES128 CTR enc/decrypt "by8"
534 * XMM registers are clobbered.
535 * Saving/restoring must be done at a higher level
536 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
537 * unsigned int num_bytes)
539 ENTRY(aes_ctr_enc_128_avx_by8)
540 /* call the aes main loop */
541 do_aes_ctrmain KEY_128
543 ENDPROC(aes_ctr_enc_128_avx_by8)
546 * routine to do AES192 CTR enc/decrypt "by8"
547 * XMM registers are clobbered.
548 * Saving/restoring must be done at a higher level
549 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
550 * unsigned int num_bytes)
552 ENTRY(aes_ctr_enc_192_avx_by8)
553 /* call the aes main loop */
554 do_aes_ctrmain KEY_192
556 ENDPROC(aes_ctr_enc_192_avx_by8)
559 * routine to do AES256 CTR enc/decrypt "by8"
560 * XMM registers are clobbered.
561 * Saving/restoring must be done at a higher level
562 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
563 * unsigned int num_bytes)
565 ENTRY(aes_ctr_enc_256_avx_by8)
566 /* call the aes main loop */
567 do_aes_ctrmain KEY_256
569 ENDPROC(aes_ctr_enc_256_avx_by8)