/* SPDX-License-Identifier: GPL-2.0-or-later */
# Accelerated AES-GCM stitched implementation for ppc64le.
# Copyright 2022- IBM Inc. All rights reserved
#===================================================================================
# Written by Danny Tsen <dtsen@linux.ibm.com>
# GHASH is based on the Karatsuba multiplication method.
# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
# (X4.h * H.h + X4.l * H.l + X4 * H)
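#
# In GF(2^128) terms the four-block aggregation above amounts to the
# following (an illustrative sketch; gf128_mul() stands for a carryless
# polynomial multiply and is not a symbol in this file):
#
# Xi = gf128_mul(Xi ^ X1, H^4) ^ gf128_mul(X2, H^3) ^
#      gf128_mul(X3, H^2) ^ gf128_mul(X4, H)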
# Hash keys = v3 - v14
# ( H^2.l, H^2, H^2.h)
# ( H^3.l, H^3, H^3.h)
# ( H^4.l, H^4, H^4.h)
# vs0 - vs14 for round keys
# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
# This implementation uses a stitched AES-GCM approach to improve overall performance.
# AES is implemented with 8x blocks and GHASH uses two 4x blocks.
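#
# Roughly, each iteration of the main loop below performs (an
# illustrative sketch; the names are descriptive, not symbols in
# this file):
#
# for each 128-byte chunk:
#     ks[0..7]  = AES(counter .. counter+7)   # 8x AES-CTR keystream
#     out[0..7] = in[0..7] ^ ks[0..7]
#     Xi = ghash_4x(Xi, cipher[0..3])         # GHASH as two 4x groups
#     Xi = ghash_4x(Xi, cipher[4..7])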
# ===================================================================================
#include <asm/ppc_asm.h>
#include <linux/linkage.h>
# v15 - v18 - input states
# vs1 - vs9 - round keys
.macro Loop_aes_middle4x
# v15 - v22 - input states
# vs1 - vs9 - round keys
.macro Loop_aes_middle8x
.macro Loop_aes_middle_1x
# Compute 4x hash values based on the Karatsuba method.
.macro ppc_aes_gcm_ghash
	vpmsumd 23, 12, 15 # H4.L * X.L
	vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
	vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
	# sum hash and reduction with H Poly
	vpmsumd 28, 23, 2 # reduction
	vsldoi 26, 24, 29, 8 # mL
	vsldoi 29, 29, 24, 8 # mH
	vxor 23, 23, 26 # mL + L
	vsldoi 23, 23, 23, 8 # swap
	vpmsumd 24, 14, 15 # H4.H * X.H
	# sum hash and reduction with H Poly
	vsldoi 27, 23, 23, 8 # swap
	xxlor 32, 23+32, 23+32 # update hash
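	# (xxlor copies VSR 23+32, i.e. v23, into VSR 32, i.e. v0,
	# the register that carries the running Xi between macros)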
# Combine two 4x GHASH computations
# v15 - v22 - input blocks
.macro ppc_aes_gcm_ghash2_4x
	vxor 15, 15, 0 # Xi + X
	vpmsumd 23, 12, 15 # H4.L * X.L
	vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
	vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
	# sum hash and reduction with H Poly
	vpmsumd 28, 23, 2 # reduction
	vsldoi 26, 24, 29, 8 # mL
	vsldoi 29, 29, 24, 8 # mH
	vxor 23, 23, 26 # mL + L
	vsldoi 23, 23, 23, 8 # swap
	vpmsumd 24, 14, 15 # H4.H * X.H
	vxor 24, 24, 29 # H + mH
	# sum hash and reduction with H Poly
	vsldoi 27, 23, 23, 8 # swap
	vxor 27, 23, 27 # 1st Xi
	vxor 19, 19, 27 # Xi + X
	vpmsumd 23, 12, 19 # H4.L * X.L
	vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
	vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
	# sum hash and reduction with H Poly
	vpmsumd 28, 23, 2 # reduction
	vsldoi 26, 24, 29, 8 # mL
	vsldoi 29, 29, 24, 8 # mH
	vxor 23, 23, 26 # mL + L
	vsldoi 23, 23, 23, 8 # swap
	vpmsumd 24, 14, 19 # H4.H * X.H
	vxor 24, 24, 29 # H + mH
	# sum hash and reduction with H Poly
	vsldoi 27, 23, 23, 8 # swap
	xxlor 32, 23+32, 23+32 # update hash
# Compute a single hash update
.macro ppc_update_hash_1x
	vpmsumd 22, 3, 28 # L
	vpmsumd 23, 4, 28 # M
	vpmsumd 24, 5, 28 # H
	vpmsumd 27, 22, 2 # reduction
	vsldoi 25, 23, 19, 8 # mL
	vsldoi 26, 19, 23, 8 # mH
	vxor 22, 22, 25 # L + mL
	vxor 24, 24, 26 # H + mH
	vsldoi 22, 22, 22, 8 # swap
	vsldoi 20, 22, 22, 8 # swap
	vpmsumd 22, 22, 2 # reduction
	vmr 0, 22 # update hash
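	# (vmr leaves the reduced result in v0, the same running-Xi
	# register the 4x macros update via xxlor 32 above)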
.macro LOAD_HASH_TABLE
	lxvb16x 32, 0, 8 # load Xi
	# load Hash - h^4, h^3, h^2, h
	lxvd2x 2+32, 10, 8 # H Poly
	lxvd2x 3+32, 10, 8 # Hl
	lxvd2x 4+32, 10, 8 # H
	lxvd2x 5+32, 10, 8 # Hh
	lxvd2x 6+32, 10, 8 # H^2l
	lxvd2x 7+32, 10, 8 # H^2
	lxvd2x 8+32, 10, 8 # H^2h
	lxvd2x 9+32, 10, 8 # H^3l
	lxvd2x 10+32, 10, 8 # H^3
	lxvd2x 11+32, 10, 8 # H^3h
	lxvd2x 12+32, 10, 8 # H^4l
	lxvd2x 13+32, 10, 8 # H^4
	lxvd2x 14+32, 10, 8 # H^4h
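	# (inferred gcm_table order from the loads above, one 16-byte
	# entry each, with r10 assumed to advance by 16 between loads:
	# Xi, H Poly, H.l, H, H.h, H^2.l, H^2, H^2.h, H^3.l, H^3,
	# H^3.h, H^4.l, H^4, H^4.h)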
# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
# const char *rk, unsigned char iv[16], void *Xip);
# r6 - AES round keys
# r7 - iv and other data
# r8 - Xi, HPoly, hash keys
# rounds is at offset 240 in rk
# Xi is at 0 in gcm_table (Xip).
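#
# A minimal caller sketch (hypothetical C; the kernel glue code is
# assumed to have filled in rk, iv and Xip as described above):
#
# aes_p10_gcm_encrypt(inp, out, len, rk, iv, Xip);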
_GLOBAL(aes_p10_gcm_encrypt)
	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x 30+32, 0, 7 # load IV - v30
	li 11, 0 # block index
	vsldoi 31, 31, 22, 1 # counter 1
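	# (v31 now holds the counter-increment constant used below by
	# vaddudm 30, 30, 31 to step the IV)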
	# load round key to VSR
	# load rounds - 10 (AES-128), 12 (AES-192), 14 (AES-256)
	# vxor state, state, w # addroundkey
	vxor 15, 30, 29 # IV + round key - add round key 0
	# load 2 more round keys (v11, v12)
	# load 2 more round keys (v13, v14)
	# check partial block
Continue_partial_check:
	divdu 10, 12, 10 # n = number of 128-byte blocks
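	# (assumes r12 holds the remaining byte count and r10 was
	# preloaded with 128)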
	vaddudm 30, 30, 31 # IV + counter
	lxvb16x 15, 0, 14 # load block
	lxvb16x 16, 15, 14 # load block
	lxvb16x 17, 16, 14 # load block
	lxvb16x 18, 17, 14 # load block
	lxvb16x 19, 18, 14 # load block
	lxvb16x 20, 19, 14 # load block
	lxvb16x 21, 20, 14 # load block
	lxvb16x 22, 21, 14 # load block
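	# (eight consecutive 16-byte blocks from r14; r15-r21 are
	# assumed to hold the byte offsets 16, 32, ..., 112)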
	vcipherlast 15, 15, 23
	vcipherlast 16, 16, 23
	stxvb16x 47, 0, 9 # store output
	stxvb16x 48, 15, 9 # store output
	vcipherlast 17, 17, 23
	vcipherlast 18, 18, 23
	stxvb16x 49, 16, 9 # store output
	stxvb16x 50, 17, 9 # store output
	vcipherlast 19, 19, 23
	vcipherlast 20, 20, 23
	stxvb16x 51, 18, 9 # store output
	stxvb16x 52, 19, 9 # store output
	vcipherlast 21, 21, 23
	vcipherlast 22, 22, 23
	stxvb16x 53, 20, 9 # store output
	stxvb16x 54, 21, 9 # store output
	ppc_aes_gcm_ghash2_4x
	vaddudm 30, 30, 31 # IV + counter
	vxor 15, 30, 27 # add round key
	stxvb16x 30+32, 0, 7 # update IV
	# loop last few blocks
	lxvb16x 15, 0, 14 # load block
	vcipherlast 15, 15, 23
	stxvb16x 47, 0, 9 # store output
	vaddudm 30, 30, 31 # IV + counter
	vxor 15, 30, 19 # add round key
	std 15, 56(7) # clear the partial-block flag
	stxvb16x 30+32, 0, 7 # update IV
	vcipherlast 15, 15, 23
	# check partial block
	ld 15, 56(7) # partial-block flag
	b Continue_partial_check
	lxvb16x 15, 0, 14 # load last block
	# create partial block mask
	sub 15, 15, 12 # index to the mask
	vspltisb 16, -1 # first 16 bytes - 0xffff...ff
	vspltisb 17, 0 # second 16 bytes - 0x0000...00
	lxvb16x 16, 15, 10 # load partial block mask
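	# (the 0xff and 0x00 vectors above are assumed stored as an
	# adjacent 32-byte pair; loading 16 bytes at offset 16-partial
	# yields a mask whose first 'partial' bytes are 0xff)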
	# * should store only the remaining bytes.
	bl Write_partial_block
	stxvb16x 30+32, 0, 7 # update IV
	std 12, 56(7) # update the partial-block count
	stxvb16x 32, 0, 8 # write out Xi
	stxvb16x 32, 16, 8 # write out Xi
.macro GEN_MASK _mask _start _end
	vspltisb 16, -1 # first 16 bytes - 0xffff...ff
	vspltisb 17, 0 # second 16 bytes - 0x0000...00
	stxvb16x 17+32, 10, 1
	stxvb16x 16+32, 10, 1
	stxvb16x 17+32, 10, 1
	lxvb16x \_mask, 0, 10 # load partial block mask
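	# (sketch of the intent: zeros, then 0xff bytes, then zeros
	# are stored on the stack, presumably at offsets derived from
	# \_start and \_end, so the load above picks up a byte mask
	# that is 0xff exactly over the wanted range)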
# Handle multiple partial blocks for encrypt and decrypt
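#
# Rough flow (an illustrative sketch; the names are descriptive, not
# symbols in this file):
#
# ks     = AES(counter)           # one extra keystream block
# masked = input & partial_mask   # keep only the valid bytes
# output = masked ^ ks
# Xi     = ghash_1x(Xi ^ masked ciphertext)
# store only the first 'partial' bytes of output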
SYM_FUNC_START_LOCAL(Do_partial_block)
SYM_FUNC_END(Do_partial_block)
	lxvb16x 17+32, 0, 14 # load last block
	mtvsrdd 32+16, 0, 16
	xxlxor 47, 47, 17+32
	vxor 0, 0, 0 # clear Xi
	cmpdi 21, 0 # encrypt or decrypt op?
	xxland 32+28, 32+17, 18
	lxvb16x 32+29, 16, 8
	stxvb16x 32, 0, 8 # save Xi
	stxvb16x 32, 16, 8 # save Xi
	# store partial block
	# loop the rest of the stream if any
	mtvsrdd 32+16, 0, 16
	#stxvb16x 15+32, 0, 9 # last block
	sub 17, 16, 15 # 16 - partial
	stxvb16x 15+32, 10, 1 # save current block
	mtctr 17 # move partial byte count
	bdnz Write_last_partial
	# partial-block loop complete
	stxvb16x 30+32, 0, 7 # update IV
	vxor 15, 30, 29 # IV + round key - add round key 0
	std 15, 56(7) # partial done - clear
	std 15, 56(7) # partial
# Write partial block
# r12 - remaining bytes
# v15 - partial input data
SYM_FUNC_START_LOCAL(Write_partial_block)
	stxvb16x 15+32, 10, 1 # last block
	mtctr 12 # remaining bytes
	bdnz Write_last_byte
SYM_FUNC_END(Write_partial_block)
	stxvb16x 32, 0, 8 # write out Xi
	add 3, 11, 12 # return processed byte count in r3
_GLOBAL(aes_p10_gcm_decrypt)
	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x 30+32, 0, 7 # load IV - v30
	li 11, 0 # block index
	vsldoi 31, 31, 22, 1 # counter 1
	# load round key to VSR
	# load rounds - 10 (AES-128), 12 (AES-192), 14 (AES-256)
	# vxor state, state, w # addroundkey
	vxor 15, 30, 29 # IV + round key - add round key 0
	beq Loop_aes_gcm_8x_dec
	# load 2 more round keys (v11, v12)
	beq Loop_aes_gcm_8x_dec
	# load 2 more round keys (v13, v14)
	beq Loop_aes_gcm_8x_dec
Loop_aes_gcm_8x_dec:
	# check partial block
Continue_partial_check_dec:
	divdu 10, 12, 10 # n = number of 128-byte blocks
	beq Loop_last_block_dec
	vaddudm 30, 30, 31 # IV + counter
	lxvb16x 15, 0, 14 # load block
	lxvb16x 16, 15, 14 # load block
	lxvb16x 17, 16, 14 # load block
	lxvb16x 18, 17, 14 # load block
	lxvb16x 19, 18, 14 # load block
	lxvb16x 20, 19, 14 # load block
	lxvb16x 21, 20, 14 # load block
	lxvb16x 22, 21, 14 # load block
	beq Do_next_ghash_dec
	beq Do_next_ghash_dec
	beq Do_next_ghash_dec
	vcipherlast 15, 15, 23
	vcipherlast 16, 16, 23
	stxvb16x 47, 0, 9 # store output
	stxvb16x 48, 15, 9 # store output
	vcipherlast 17, 17, 23
	vcipherlast 18, 18, 23
	stxvb16x 49, 16, 9 # store output
	stxvb16x 50, 17, 9 # store output
	vcipherlast 19, 19, 23
	vcipherlast 20, 20, 23
	stxvb16x 51, 18, 9 # store output
	stxvb16x 52, 19, 9 # store output
	vcipherlast 21, 21, 23
	vcipherlast 22, 22, 23
	stxvb16x 53, 20, 9 # store output
	stxvb16x 54, 21, 9 # store output
	ppc_aes_gcm_ghash2_4x
	vaddudm 30, 30, 31 # IV + counter
	vxor 15, 30, 27 # add round key
	bdnz Loop_8x_block_dec
	stxvb16x 30+32, 0, 7 # update IV
Loop_last_block_dec:
	# loop last few blocks
	lxvb16x 15, 0, 14 # load block
	vcipherlast 15, 15, 23
	stxvb16x 47, 0, 9 # store output
	vaddudm 30, 30, 31 # IV + counter
	vxor 15, 30, 19 # add round key
	bdnz Next_rem_block_dec
	std 15, 56(7) # clear the partial-block flag
	stxvb16x 30+32, 0, 7 # update IV
	vcipherlast 15, 15, 23
	# check partial block
	ld 15, 56(7) # partial-block flag
	beq Normal_block_dec
	b Continue_partial_check_dec
	lxvb16x 15, 0, 14 # load last block
	# create partial block mask
	sub 15, 15, 12 # index to the mask
	vspltisb 16, -1 # first 16 bytes - 0xffff...ff
	vspltisb 17, 0 # second 16 bytes - 0x0000...00
	lxvb16x 16, 15, 10 # load partial block mask
	xxland 32+28, 15, 16
	# * should store only the remaining bytes.
	bl Write_partial_block
	stxvb16x 30+32, 0, 7 # update IV
	std 12, 56(7) # update the partial-block count
	stxvb16x 32, 0, 8 # write out Xi
	stxvb16x 32, 16, 8 # write out Xi