/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
		20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
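
/*
 * The sm4e / sm4ekey macros emit the raw encodings of the ARMv8
 * Crypto Extension SM4E and SM4EKEY instructions, so the file
 * assembles even with toolchains that do not know the SM4 mnemonics.
 * The .irp block above defines the .Lv<n>.4s symbols used to splice
 * register numbers into those encodings.
 */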

SYM_FUNC_START(sm4_ce_expand_key)
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;
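
	/*
	 * Each SM4EKEY instruction derives four 32-bit round keys from
	 * the previous four, so the eight instructions above yield the
	 * complete 32-round encryption key schedule in v0-v7.
	 */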

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];
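
	/*
	 * The decryption key schedule is the encryption schedule in
	 * reverse order: v24 holds .Lbswap128_mask, so each tbl below
	 * reverses the four round keys within one 128-bit group while
	 * the groups themselves are written out from v7 down to v0.
	 */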

	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

SYM_FUNC_END(sm4_ce_expand_key)

SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 */
SYM_FUNC_END(sm4_ce_crypt_block)

SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

SYM_FUNC_END(sm4_ce_crypt)

SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */

	blt		.Lcbc_enc_loop_1x
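
	/*
	 * CBC encryption is inherently serial: each plaintext block must
	 * be XORed with the previous ciphertext before it can be
	 * encrypted, so the 4x path below still encrypts one block at a
	 * time and only amortizes the loads and stores.
	 */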

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcbc_enc_end

	ld1		{v0.16b}, [x2], #16
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)
	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

SYM_FUNC_END(sm4_ce_cbc_enc)

SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	tbnz		w4, #31, .Lcbc_dec_4x
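
	/*
	 * Unlike encryption, CBC decryption parallelizes: all eight
	 * ciphertext blocks are decrypted in one SM4_CRYPT_BLK8_BE call,
	 * and only the final XOR chains each block to the previous
	 * ciphertext (RIV for the first block of the batch).
	 */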

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end

	blt		.Lcbc_dec_loop_1x

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

	ld1		{v0.16b}, [x2], #16
	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

SYM_FUNC_END(sm4_ce_cbc_dec)

SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
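
	/*
	 * CBC ciphertext stealing: the last full plaintext block is
	 * encrypted into RIV (= En-1), the short final block Pn is
	 * zero-padded, XORed with En-1 and encrypted, and the first Ln
	 * bytes of En-1 are emitted as the truncated final block Cn.
	 */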

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table

	/* overlapping loads */

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */

SYM_FUNC_END(sm4_ce_cbc_cts_enc)

SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
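
	/*
	 * Reverse of the stealing done by sm4_ce_cbc_cts_enc: the
	 * next-to-last ciphertext block is decrypted to Xn, Pn is the
	 * first Ln bytes of Xn XORed with Cn, and En-1 (Xn with its
	 * first Ln bytes overwritten by Cn) is decrypted and XORed with
	 * the IV to recover the last full plaintext block.
	 */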

	/* load permute table */
	adr_l		x6, .Lcts_permute_table

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5

	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */

SYM_FUNC_END(sm4_ce_cbc_cts_dec)

SYM_FUNC_START(sm4_ce_cfb_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	blt		.Lcfb_enc_loop_1x
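
	/*
	 * CFB encryption is serial like CBC: each keystream block is the
	 * encryption of the previous ciphertext block, so the 4x path
	 * below still generates one keystream block at a time.
	 */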

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_enc_end

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

SYM_FUNC_END(sm4_ce_cfb_enc)

SYM_FUNC_START(sm4_ce_cfb_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	tbnz		w4, #31, .Lcfb_dec_4x
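
	/*
	 * CFB decryption parallelizes: the eight keystream blocks are
	 * the encryptions of the IV and of the first seven ciphertext
	 * blocks, computed in a single SM4_CRYPT_BLK8_BE call and then
	 * XORed with the ciphertext.
	 */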

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	mov		RIV.16b, v7.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

	blt		.Lcfb_dec_loop_1x

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	mov		RIV.16b, v3.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

SYM_FUNC_END(sm4_ce_cfb_dec)

SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	tbnz		w4, #31, .Lctr_4x

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */
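
	/*
	 * v0-v7 now hold the next eight counter blocks; they are
	 * encrypted below and XORed with the input. CTR mode only ever
	 * uses the cipher in the forward direction, so this routine
	 * serves for both encryption and decryption.
	 */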

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

SYM_FUNC_END(sm4_ce_ctr_enc)

#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;
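
/*
 * tweak_next() advances the XTS tweak by multiplying it by x in
 * GF(2^128), i.e. a 128-bit left shift by one with conditional
 * reduction by x^128 + x^7 + x^2 + x + 1. The add doubles each 64-bit
 * lane, while the sshr/and/ext sequence turns the bits shifted out of
 * each lane into the cross-lane carry and the 0x87 reduction term
 * (both taken from RMASK) that are then XORed in.
 */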

SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */

	cbz		x5, .Lxts_enc_nofirst

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts
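
	/*
	 * Bulk XTS processing: each block i is computed as
	 * C_i = E(P_i ^ T_i) ^ T_i, where the tweaks T_i are derived
	 * with tweak_next() and v8 carries the running tweak across
	 * loop iterations.
	 */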

	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts

	blt		.Lxts_enc_loop_1x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table

	/* overlapping loads */

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b
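
	/*
	 * At this point v2 holds Cn (the truncated final ciphertext,
	 * taken from the head of En-1) and v0 holds Cn-1 (the final
	 * plaintext padded with the tail of En-1, encrypted under the
	 * next tweak v9); the overlapping stores below write them out.
	 */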

	/* overlapping stores */

	/* store new tweak */
	st1		{v8.16b}, [x3]

SYM_FUNC_END(sm4_ce_xts_enc)

SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */

	cbz		x5, .Lxts_dec_nofirst

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts
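
	/*
	 * Decryption mirrors the encrypt path: P_i = D(C_i ^ T_i) ^ T_i
	 * with the same tweak schedule, since the XTS tweak depends only
	 * on the block position, not on the direction.
	 */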

	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts

	blt		.Lxts_dec_loop_1x

	tweak_next( v9, v8, RTMP0)
	tweak_next(v10, v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table

	/* overlapping loads */

	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* overlapping stores */

	/* store new tweak */
	st1		{v8.16b}, [x3]

SYM_FUNC_END(sm4_ce_xts_dec)

SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 */

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update
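
	/*
	 * CBC-MAC style update: each input block is XORed into the
	 * running MAC and the result is encrypted, chaining RMAC across
	 * the whole message. A non-zero w4 (enc_before) encrypts the
	 * loaded digest once before the first block is absorbed.
	 */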

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	st1		{RMAC.16b}, [x1]

SYM_FUNC_END(sm4_ce_mac_update)

	.section	".rodata", "a"
.Lbswap128_mask:
	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
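
/*
 * The CTS code above indexes into .Lcts_permute_table at offsets that
 * depend on the length of the final partial block. tbl yields zero for
 * the 0xff entries and tbx leaves the destination byte unchanged, so a
 * single table provides both the "take the first Ln bytes" and the
 * "merge in the remaining bytes" permutations.
 */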