/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, using ARMv8 Crypto Extensions,
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
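
/*
 * The SM4 instructions are emitted as raw opcodes via .inst so that the
 * file assembles even when the toolchain's assembler does not know the
 * SM4 extension.  The .Lv symbols defined by the .irp block above map a
 * vector register name to its numeric encoding: Vd lives in bits [4:0],
 * Vn in bits [9:5] and, for sm4ekey, Vm in bits [20:16].
 */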

/* Register macros */

#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

#define RIV     v20
#define RMAC    v20
#define RMASK   v21
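
/*
 * Note that RIV and RMAC alias v20: no routine in this file needs an IV
 * and a MAC state at the same time, so the register can be shared.
 */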

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /* input:
         *   x0: 128-bit key
         *   x1: rkey_enc
         *   x2: rkey_dec
         *   x3: fk array
         *   x4: ck array
         */
        ld1             {v0.16b}, [x0];
        rev32           v0.16b, v0.16b;
        ld1             {v1.16b}, [x3];
        /* load ck */
        ld1             {v24.16b-v27.16b}, [x4], #64;
        ld1             {v28.16b-v31.16b}, [x4];

        /* input ^ fk */
        eor             v0.16b, v0.16b, v1.16b;

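        /*
         * Each sm4ekey step derives four round keys from the previous
         * four and one vector of CK constants, so the eight steps below
         * yield all 32 encryption round keys.
         */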
        sm4ekey         v0.4s, v0.4s, v24.4s;
        sm4ekey         v1.4s, v0.4s, v25.4s;
        sm4ekey         v2.4s, v1.4s, v26.4s;
        sm4ekey         v3.4s, v2.4s, v27.4s;
        sm4ekey         v4.4s, v3.4s, v28.4s;
        sm4ekey         v5.4s, v4.4s, v29.4s;
        sm4ekey         v6.4s, v5.4s, v30.4s;
        sm4ekey         v7.4s, v6.4s, v31.4s;

        adr_l           x5, .Lbswap128_mask
        ld1             {v24.16b}, [x5]

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1];

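        /*
         * The decryption round keys are the encryption round keys in
         * reverse order.  .Lbswap128_mask reverses the four 32-bit words
         * within a vector, so reading v7 down to v0 through tbl emits
         * rk[31] .. rk[0] for the rkey_dec array.
         */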
        tbl             v16.16b, {v7.16b}, v24.16b
        tbl             v17.16b, {v6.16b}, v24.16b
        tbl             v18.16b, {v5.16b}, v24.16b
        tbl             v19.16b, {v4.16b}, v24.16b
        tbl             v20.16b, {v3.16b}, v24.16b
        tbl             v21.16b, {v2.16b}, v24.16b
        tbl             v22.16b, {v1.16b}, v24.16b
        tbl             v23.16b, {v0.16b}, v24.16b

        st1             {v16.16b-v19.16b}, [x2], #64
        st1             {v20.16b-v23.16b}, [x2]

        ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         */
        SM4_PREPARE(x0)
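        /* SM4_PREPARE (sm4-ce-asm.h) loads the 32 round keys into v24-v31 */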

        ld1             {v0.16b}, [x2];
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1];

        ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE(x0)

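        /*
         * The main loop handles 8 blocks per iteration; "sub + tbnz #31"
         * falls through to the tail code once the remaining count would
         * go negative, i.e. when fewer than 8 blocks are left.
         */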
.Lcrypt_loop_blk:
        sub             w3, w3, #8;
        tbnz            w3, #31, .Lcrypt_tail8;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        ld1             {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;
        b               .Lcrypt_loop_blk;

.Lcrypt_tail8:
        add             w3, w3, #8;
        cmp             w3, #4;
        blt             .Lcrypt_tail4;

        sub             w3, w3, #4;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        SM4_CRYPT_BLK4(v0, v1, v2, v3);
        st1             {v0.16b-v3.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;

.Lcrypt_tail4:
        sub             w3, w3, #1;

        ld1             {v0.16b}, [x2], #16;
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1], #16;

        cbnz            w3, .Lcrypt_tail4;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

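        /*
         * CBC encryption is inherently serial: every block's input
         * depends on the previous ciphertext.  The 4x path below
         * therefore still encrypts one block at a time and only
         * amortises the loads and stores.
         */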
.Lcbc_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcbc_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             v0.16b, v0.16b, RIV.16b
        SM4_CRYPT_BLK(v0)
        eor             v1.16b, v1.16b, v0.16b
        SM4_CRYPT_BLK(v1)
        eor             v2.16b, v2.16b, v1.16b
        SM4_CRYPT_BLK(v2)
        eor             v3.16b, v3.16b, v2.16b
        SM4_CRYPT_BLK(v3)

        st1             {v0.16b-v3.16b}, [x1], #64
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_enc_end
        b               .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

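        /*
         * Unlike encryption, CBC decryption parallelises: all eight
         * blocks are decrypted at once and then XORed with the preceding
         * ciphertexts.  The _BE crypt variants expect input that is
         * already byte-swapped, which conveniently leaves the original
         * ciphertext in v0-v7 for the chaining XOR and the next IV.
         */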
.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b
        rev32           v12.16b, v4.16b
        rev32           v13.16b, v5.16b
        rev32           v14.16b, v6.16b
        rev32           v15.16b, v7.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b
        eor             v12.16b, v12.16b, v3.16b
        eor             v13.16b, v13.16b, v4.16b
        eor             v14.16b, v14.16b, v5.16b
        eor             v15.16b, v15.16b, v6.16b

        st1             {v8.16b-v11.16b}, [x1], #64
        st1             {v12.16b-v15.16b}, [x1], #64

        mov             RIV.16b, v7.16b

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b

        st1             {v8.16b-v11.16b}, [x1], #64

        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        rev32           v8.16b, v0.16b

        SM4_CRYPT_BLK_BE(v8)

        eor             v8.16b, v8.16b, RIV.16b
        st1             {v8.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        ld1             {v0.16b}, [x2]
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]
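
        /*
         * v3 and v4 are sliding windows into .Lcts_permute_table.  An
         * 0xff index makes tbl emit a zero byte, so v3 moves the first
         * x5 bytes of its source into the tail of the result and v4
         * moves the last x5 source bytes to the head, zero-padding the
         * rest.  Here x5 is the length of the final partial block
         * (nbytes - 16).
         */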

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v0.16b, {RIV.16b}, v3.16b
        /* padding Pn with zeros */
        tbl             v1.16b, {v1.16b}, v4.16b

        eor             v1.16b, v1.16b, RIV.16b
        SM4_CRYPT_BLK(v1)

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v0.16b}, [x5]
        st1             {v1.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5

        ld1             {RIV.16b}, [x3]

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        ld1             {v0.16b}, [x2], x5
        ld1             {v1.16b}, [x2]

        SM4_CRYPT_BLK(v0)
        /* select the first Ln bytes of Xn to create Pn */
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        /* overwrite the first Ln bytes with Cn to create En-1 */
        tbx             v0.16b, {v1.16b}, v4.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, RIV.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

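        /*
         * CFB encryption is serial like CBC: each keystream block is the
         * encryption of the previous ciphertext block.  rev32 into the
         * scratch register v8 plus the _BE crypt variant keeps the
         * ciphertext in v0-v3 intact for the store.
         */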
.Lcfb_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcfb_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, RIV.16b
        SM4_CRYPT_BLK_BE(v8)
        eor             v0.16b, v0.16b, v8.16b

        rev32           v8.16b, v0.16b
        SM4_CRYPT_BLK_BE(v8)
        eor             v1.16b, v1.16b, v8.16b

        rev32           v8.16b, v1.16b
        SM4_CRYPT_BLK_BE(v8)
        eor             v2.16b, v2.16b, v8.16b

        rev32           v8.16b, v2.16b
        SM4_CRYPT_BLK_BE(v8)
        eor             v3.16b, v3.16b, v8.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcfb_enc_end
        b               .Lcfb_enc_loop_4x

.Lcfb_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        SM4_CRYPT_BLK(RIV)
        eor             RIV.16b, RIV.16b, v0.16b

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

.Lcfb_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcfb_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, RIV.16b
        rev32           v9.16b, v0.16b
        rev32           v10.16b, v1.16b
        rev32           v11.16b, v2.16b
        rev32           v12.16b, v3.16b
        rev32           v13.16b, v4.16b
        rev32           v14.16b, v5.16b
        rev32           v15.16b, v6.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        mov             RIV.16b, v7.16b

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcfb_dec_end
        b               .Lcfb_dec_loop_8x

.Lcfb_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcfb_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, RIV.16b
        rev32           v9.16b, v0.16b
        rev32           v10.16b, v1.16b
        rev32           v11.16b, v2.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        mov             RIV.16b, v3.16b

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        SM4_CRYPT_BLK(RIV)

        eor             RIV.16b, RIV.16b, v0.16b
        st1             {RIV.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cfb_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

.Lctr_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_4x

#define inc_le128(vctr)                                 \
                mov             vctr.d[1], x8;          \
                mov             vctr.d[0], x7;          \
                adds            x8, x8, #1;             \
                rev64           vctr.16b, vctr.16b;     \
                adc             x7, x7, xzr;
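
        /*
         * x7:x8 hold the counter as a 128-bit big-endian value split
         * into two host-endian halves.  inc_le128 materialises the
         * current value in a vector (rev64 restores big-endian byte
         * order) and then post-increments x8 with carry into x7.
         */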

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        ld1             {v8.16b-v11.16b}, [x2], #64
        ld1             {v12.16b-v15.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_end
        b               .Lctr_loop_8x

.Lctr_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_loop_1x

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v8.16b-v11.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_end

.Lctr_loop_1x:
        sub             w4, w4, #1

        /* construct CTRs */
        inc_le128(v0)

        ld1             {v8.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        cbnz            w4, .Lctr_loop_1x

.Lctr_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_ce_ctr_enc)

#define tweak_next(vt, vin, RTMP)                                       \
                sshr            RTMP.2d, vin.2d, #63;                   \
                and             RTMP.16b, RTMP.16b, RMASK.16b;          \
                add             vt.2d, vin.2d, vin.2d;                  \
                ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
                eor             vt.16b, vt.16b, RTMP.16b;
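
/*
 * tweak_next multiplies the 128-bit tweak by x in GF(2^128) with the
 * reduction polynomial x^128 + x^7 + x^2 + x + 1.  sshr broadcasts each
 * 64-bit half's sign bit, and with RMASK = { 0x1, 0x87 } turns it into
 * a carry value, add doubles both halves, and ext swaps the carries
 * into the opposite halves: 0x1 propagates the low-to-high carry and
 * 0x87 folds the bit shifted out of the top back into the low half.
 */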

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_enc_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5
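
        /*
         * w5 = nbytes % 16.  When the length is not block-aligned, hold
         * the last full block back (w4 = nblocks - 1) so it can be
         * handled together with the partial block by the
         * ciphertext-stealing code.
         */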

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_enc_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_enc_cts
        b               .Lxts_enc_loop_8x

.Lxts_enc_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_enc_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
        cbz             x5, .Lxts_enc_end

        /* cipher text stealing */

        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_enc_ret

.Lxts_enc_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_enc_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_dec_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_dec_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_dec_cts
        b               .Lxts_dec_loop_8x

.Lxts_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_dec_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
        cbz             x5, .Lxts_dec_end

        /* cipher text stealing */

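        /*
         * For decryption the tweak order is swapped: the last full
         * ciphertext block is processed with the next tweak (v9) and the
         * reassembled stolen block with the current tweak (v8).
         */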
        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_dec_ret

.Lxts_dec_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_dec_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
        /* input:
         *   x0: round key array, CTX
         *   x1: digest
         *   x2: src
         *   w3: nblocks
         *   w4: enc_before
         *   w5: enc_after
         */
        SM4_PREPARE(x0)

        ld1             {RMAC.16b}, [x1]

        cbz             w4, .Lmac_update

        SM4_CRYPT_BLK(RMAC)

.Lmac_update:
        cbz             w3, .Lmac_ret

        sub             w6, w3, #1
        cmp             w5, wzr
        csel            w3, w3, w6, ne
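
        /*
         * If enc_after is zero the final block must not be encrypted
         * yet: hold one block back (w3 = nblocks - 1) and merely XOR it
         * into the MAC state at .Lmac_end, leaving it to be encrypted by
         * a later call.
         */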

        cbz             w3, .Lmac_end

.Lmac_loop_4x:
        cmp             w3, #4
        blt             .Lmac_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v1.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v2.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v3.16b
        SM4_CRYPT_BLK(RMAC)

        cbz             w3, .Lmac_end
        b               .Lmac_loop_4x

.Lmac_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)

        cbnz            w3, .Lmac_loop_1x

.Lmac_end:
        cbnz            w5, .Lmac_ret

        ld1             {v0.16b}, [x2], #16
        eor             RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
        st1             {RMAC.16b}, [x1]
        ret
SYM_FUNC_END(sm4_ce_mac_update)

        .section        ".rodata", "a"
        .align 4
.Lbswap128_mask:
        .byte           0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
        .byte           0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff